o
    wi%                     @   s   d dl mZmZmZmZmZ d dlZd dlmZ d dl	m
Z
 d dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ ddlmZmZmZ er\d dlmZ d dlmZ d dlm  m Z! G dd deZ"G dd dej#Z$dS )    )TYPE_CHECKINGCallableDictOptionalTupleN)parallel_state)TransformerConfig)Tensornn)llm))average_losses_across_data_parallel_group)MaskedTokenLossReduction)logging)unwrap_model   )#adjust_distillation_model_for_mcoreload_distillation_configteacher_provider)TokenizerSpec)OptimizerModulec                
       s`   e Zd ZdZ fddZdeeef dedeeeeef f fddZ	d	ed
efddZ
  ZS )_DistillationLossReductionzACustom masking and reduction callable used only in training mode.c                    s0   t  j|i | || _t | _t | _d S N)super__init___distillation_loss_fnr   get_context_parallel_world_size_cp_size$get_tensor_model_parallel_world_size_tp_size)selfdistillation_loss_fnargskwargs	__class__ h/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/llm/modelopt/distill/model.pyr   (   s   
z#_DistillationLossReduction.__init__batchforward_outreturnc                    s   t |tr|\} d< | d }j| fddd}t|d |d |d g}|dd	 |d	d
 |d
d d}|d |fS )N	loss_maskc                    s    |  d S )Nr*   )_masked_token_loss)xr'   r   r%   r&   <lambda>:   s    z4_DistillationLossReduction.forward.<locals>.<lambda>)student_lossloss_reduction_fnkd_losslogits_lossintermediate_lossr   r         )avgkd_logits_train_losskd_intermediate_train_loss)
isinstancetupler+   r   r   )r   r'   r(   lm_losslosseslosses_averagedreportr%   r-   r&   forward.   s   



z"_DistillationLossReduction.forwardloss_outputmaskc                 C   s   t |tr|\}}}nd\}}|  }|r&t }tj|| jdd| }|	d }|
d }t|| }	| jdkr`t|		d|	dg}
tjj|
t d |
d |
d  }
n|	| }
|sh|rrtjj|
t d |
S )zIThe function takes as input per-token loss and masks non-required values.)FFr   )dim)groupr   )r9   r:   sumfloatr   get_tensor_model_parallel_ranktorchtensor_splitr   viewreshaper   catdistributed
all_reduceget_context_parallel_groupget_tensor_model_parallel_group)r   r@   rA   	tp_reduceis_sequence_parallelnum_valid_tokensidxr<   r*   loss_sumlossr%   r%   r&   r+   G   s$   

z-_DistillationLossReduction._masked_token_loss)__name__
__module____qualname____doc__r   r   strr	   r   r?   r+   __classcell__r%   r%   r#   r&   r   %   s
    .r   c                       s   e Zd ZdZ				ddedededee ded d	ed
 deeej	gej	f  f fddZ
dd ZdefddZedefddZdd Zedd Zd def fddZ fddZ  ZS )!DistillationGPTModelz;Custom GPT subclass for distillation-related modifications.Nconfigteacher_configteacher_ckpt_pathdistillation_config_pathoptimr   	tokenizerr   model_transformc                    sd   t  |||| || _|| _|| _d| _t|tjr!t|tjs&t	
d | jjdur0tddS )ae  Constructor.

        This subclass of GPTModel takes the configs of a student and teacher model and overrides
        the model construction step to create a ModelOpt `DistillationModel` as the underlying
        MCore model. This model abstracts both student and teacher as a single module whose forward
        pass runs both, and whose loss function automatically calculates a distillation loss on the
        output logits.

        NOTE: This class saves checkpoints which will be re-loaded as the student's original class.
        This allows one to continue using the model after distillation without this special class.

        Args:
            config: Config of student model.
            teacher_config: Config of teacher model.
            teacher_ckpt_path: Path to teacher checkpoint (to restore weights).
            distillation_config_path: Path to distillation config YAML file.
                If not provided, by default will perform logits-only distillation.
            optim: Optimizer.
            tokenizer: Tokenizer.
            model_transform: Transform to apply to model during setup.
        Fz{Student and Teacher configs should both inherit from llm.GPTConfig. Configs may not work properly with DistillationGPTModelNzFModelOpt Distillation incompatible with interleaved pipeline schedule.)r   r   _teacher_config_teacher_ckpt_path_distillation_config_path_train_calledr9   r   	GPTConfigr   warningr^   $virtual_pipeline_model_parallel_size
ValueError)r   r^   r_   r`   ra   rb   rc   rd   r#   r%   r&   r   i   s   zDistillationGPTModel.__init__c                 C   s   t | drd S | j| j}dD ]}t| j|t| j| qt| j| j| j}t	| j| j
g| j| jdf|j|jd}tj|d|fgd}t||d || _d S )Nmodule)tensor_model_parallel_sizepipeline_model_parallel_sizecontext_parallel_sizesequence_parallelpipeline_dtype)rc   trainer)teacher_model	criterionloss_balancerr1   )mode)distill_cfg)hasattrr^   configure_modelrc   setattrre   getattrr   rg   r   rf   rs   ru   rv   mtdconvertr   rm   )r   modelattrrx   	kd_configdistillation_modelr%   r%   r&   rz      s    

	
z$DistillationGPTModel.configure_modelr)   c                 O   s   t d)Nz_Please restore a checkpoint of this model to its original class to call `get_inference_wrapper`)NotImplementedError)r   r!   r"   r%   r%   r&   get_inference_wrapper   s   z*DistillationGPTModel.get_inference_wrapperc                 C   s   | j st| jjd| _ | j S )N)r    )_training_loss_reductionr   core_modulecompute_kd_lossr   r%   r%   r&   training_loss_reduction   s
   z,DistillationGPTModel.training_loss_reductionc                 O   s,   dd |  D }| jj|g|R i |S )Nc                 S   s   i | ]\}}| d d|qS )zmodule. )replace).0kvr%   r%   r&   
<dictcomp>   s    z8DistillationGPTModel.load_state_dict.<locals>.<dictcomp>)itemsr   load_state_dict)r   
state_dictr!   r"   r%   r%   r&   r      s   z$DistillationGPTModel.load_state_dictc                 C   s
   t | jS r   )r   rm   r   r%   r%   r&   r      s   
z DistillationGPTModel.core_moduleTrw   c                    s   d| _ t |S )NT)rh   r   train)r   rw   r#   r%   r&   r      s   zDistillationGPTModel.trainc                    s0   |dkr| j s| | d S d| _ t ||S )NtrainingF)rh   r   r   __setattr__)r   namevaluer#   r%   r&   r      s   
z DistillationGPTModel.__setattr__)NNNN)T)rW   rX   rY   rZ   r   r[   r   r   r
   Moduler   rz   r	   r   propertyr   r   r   r   boolr   r   r\   r%   r%   r#   r&   r]   f   s<    -"
r]   )%typingr   r   r   r   r   rH   megatron.corer   megatron.core.transformerr   r	   r
   nemo.collectionsr   2nemo.collections.nlp.modules.common.megatron.utilsr    nemo.lightning.megatron_parallelr   
nemo.utilsr   nemo.utils.model_utilsr   utilsr   r   r   1nemo.collections.common.tokenizers.tokenizer_specr   nemo.lightning.pytorch.optimr   modelopt.torch.distilldistillr}   r   GPTModelr]   r%   r%   r%   r&   <module>   s"   A