from typing import TYPE_CHECKING, Callable, Dict, Optional, Tuple

import torch
from megatron.core import parallel_state
from megatron.core.transformer import TransformerConfig
from torch import Tensor, nn

from nemo.collections import llm
from nemo.collections.nlp.modules.common.megatron.utils import average_losses_across_data_parallel_group
from nemo.lightning.megatron_parallel import MaskedTokenLossReduction
from nemo.utils import logging
from nemo.utils.import_utils import safe_import
from nemo.utils.model_utils import unwrap_model

from .utils import adjust_distillation_model_for_mcore, load_distillation_config, teacher_provider

if TYPE_CHECKING:
    from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
    from nemo.lightning.pytorch.optim import OptimizerModule

mtd, HAVE_MODELOPT = safe_import("modelopt.torch.distill")


class _DistillationLossReduction(MaskedTokenLossReduction):
    """Custom masking and reduction callable used only in training mode."""

    def __init__(self, distillation_loss_fn, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._distillation_loss_fn = distillation_loss_fn
        self._cp_size = parallel_state.get_context_parallel_world_size()
        self._tp_size = parallel_state.get_tensor_model_parallel_world_size()

    def forward(self, batch: Dict[str, Tensor], forward_out: Tensor) -> Tuple[Tensor, Dict[str, Tensor]]:
        if isinstance(forward_out, tuple):
            # Some models (e.g. NeVA) return (loss, loss_mask) rather than the loss alone.
            forward_out, batch["loss_mask"] = forward_out

        # Masked language-modeling loss of the student, reduced over valid tokens only.
        lm_loss = self._masked_token_loss(forward_out, batch["loss_mask"])

        # [ModelOpt]: KD losses, reduced with the same token mask as the student loss.
        losses = self._distillation_loss_fn(
            student_loss=lm_loss,
            loss_reduction_fn=lambda x: self._masked_token_loss(x, batch["loss_mask"]),
        )

        losses_averaged = average_losses_across_data_parallel_group(
            [losses["kd_loss"], losses["logits_loss"], losses["intermediate_loss"]]
        )
        report = {
            "avg": losses_averaged[0:1],
            "kd_logits_train_loss": losses_averaged[1:2],
            "kd_intermediate_train_loss": losses_averaged[2:3],
        }

        return losses["kd_loss"], report
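
    # Reduction sketch: for per-token loss `x` and a 0/1 `mask` of matching size,
    # the helper below computes sum(x * mask) / mask.sum(), adding all-reduces
    # when context or tensor parallelism shards that sum across ranks.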
z"_DistillationLossReduction.forwardloss_outputmaskc                 C   s   t |tr|\}}}nd\}}|  }|r&t }tj|| jdd| }|	d }|
d }t|| }	| jdkr`t|		d|	dg}
tjj|
t d |
d |
d  }
n|	| }
|sh|rrtjj|
t d |
S )zIThe function takes as input per-token loss and masks non-required values.)FFr   )dim)groupr   )r:   r;   sumfloatr   get_tensor_model_parallel_ranktorchtensor_splitr   viewreshaper   catdistributed
all_reduceget_context_parallel_groupget_tensor_model_parallel_group)r    rA   rB   	tp_reduceis_sequence_parallelnum_valid_tokensidxr=   r+   loss_sumlossr&   r&   r'   r,   H   s$   

z-_DistillationLossReduction._masked_token_loss)__name__
__module____qualname____doc__r   r   strr	   r   r@   r,   __classcell__r&   r&   r$   r'   r   &   s
    .r   c                       s   e Zd ZdZ				ddedededee ded d	ed
 deeej	gej	f  f fddZ
dd ZdefddZedefddZdd Zedd Zd def fddZ fddZ  ZS )!DistillationGPTModelz;Custom GPT subclass for distillation-related modifications.Nconfigteacher_configteacher_ckpt_pathdistillation_config_pathoptimr   	tokenizerr   model_transformc                    sp   t stdt |||| || _|| _|| _d| _t|t	j
r't|t	j
s,td | jjdur6tddS )ae  Constructor.

        This subclass of GPTModel takes the configs of a student and teacher model and overrides
        the model construction step to create a ModelOpt `DistillationModel` as the underlying
        MCore model. This model abstracts both student and teacher as a single module whose forward
        pass runs both, and whose loss function automatically calculates a distillation loss on the
        output logits.

        NOTE: This class saves checkpoints which will be re-loaded as the student's original class.
        This allows one to continue using the model after distillation without this special class.

        Args:
            config: Config of student model.
            teacher_config: Config of teacher model.
            teacher_ckpt_path: Path to teacher checkpoint (to restore weights).
            distillation_config_path: Path to distillation config YAML file.
                If not provided, logits-only distillation is performed by default.
            optim: Optimizer.
            tokenizer: Tokenizer.
            model_transform: Transform to apply to model during setup.
        """
        if not HAVE_MODELOPT:
            raise RuntimeError("nvidia-modelopt is needed to use DistillationGPTModel")

        super().__init__(config, optim, tokenizer, model_transform)
        self._teacher_config = teacher_config
        self._teacher_ckpt_path = teacher_ckpt_path
        self._distillation_config_path = distillation_config_path
        self._train_called = False

        if not isinstance(config, llm.GPTConfig) or not isinstance(teacher_config, llm.GPTConfig):
            logging.warning(
                "Student and Teacher configs should both inherit from llm.GPTConfig. "
                "Configs may not work properly with DistillationGPTModel"
            )
        if self.config.virtual_pipeline_model_parallel_size is not None:
            raise ValueError("ModelOpt Distillation incompatible with interleaved pipeline schedule.")

    def configure_model(self):
        if hasattr(self, "module"):
            return

        model = self.config.configure_model(self.tokenizer)

        # The teacher must be built with the same parallelism layout as the student.
        for attr in (
            "tensor_model_parallel_size",
            "pipeline_model_parallel_size",
            "context_parallel_size",
            "sequence_parallel",
            "pipeline_dtype",
        ):
            setattr(self._teacher_config, attr, getattr(self.config, attr))

        distill_cfg = load_distillation_config(self._distillation_config_path, self.config, self._teacher_config)

        # [ModelOpt]: Wrap student and teacher as a single `DistillationModel`.
        kd_config = {
            "teacher_model": (
                teacher_provider,
                [self._teacher_config, self._teacher_ckpt_path],
                {"tokenizer": self.tokenizer, "trainer": self.trainer},
            ),
            "criterion": distill_cfg.criterion,
            "loss_balancer": distill_cfg.loss_balancer,
        }
        distillation_model = mtd.convert(model, mode=[("kd_loss", kd_config)])

        # [ModelOpt]: Additional adjustments needed for MCore-based models.
        adjust_distillation_model_for_mcore(distillation_model, distill_cfg=distill_cfg)

        self.module = distillation_model

    def get_inference_wrapper(self, *args, **kwargs):
        raise NotImplementedError(
            "Please restore a checkpoint of this model to its original class to call `get_inference_wrapper`"
        )

    @property
    def training_loss_reduction(self) -> _DistillationLossReduction:
        if not self._training_loss_reduction:
            self._training_loss_reduction = _DistillationLossReduction(
                distillation_loss_fn=self.core_module.compute_kd_loss
            )
        return self._training_loss_reduction

    def load_state_dict(self, state_dict, *args, **kwargs):
        # Checkpoints are saved (and re-loaded) as the student's original class,
        # so strip the wrapper prefix before loading into the core module.
        state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()}
        return self.core_module.load_state_dict(state_dict, *args, **kwargs)

    @property
    def core_module(self) -> "mtd.DistillationModel":
        return unwrap_model(self.module)

    def train(self, mode: bool = True):
        self._train_called = True
        return super().train(mode)

    def __setattr__(self, name, value):
        if name == "training":
            if not self._train_called:
                # `training` was set directly (e.g. by a parent `nn.Module.train`
                # call); reroute through `self.train()` so ModelOpt's override,
                # which keeps the teacher in eval mode, is not bypassed.
                self.train(value)
                return
            self._train_called = False
        return super().__setattr__(name, value)
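

# A minimal usage sketch (hypothetical configs and paths; assumes the standard
# NeMo 2.0 training flow and is not part of this module's API):
#
#     from nemo.collections import llm
#
#     student_cfg = llm.GPTConfig(...)  # student hyperparameters
#     teacher_cfg = llm.GPTConfig(...)  # must match the teacher checkpoint
#     model = DistillationGPTModel(
#         config=student_cfg,
#         teacher_config=teacher_cfg,
#         teacher_ckpt_path="/path/to/teacher/checkpoint",
#         distillation_config_path=None,  # None -> logits-only distillation
#     )
#     # Checkpoints saved from `model` restore as the student's original class.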