o
    wi1                     @   s  d dl Z d dlmZmZ d dlmZ d dlmZmZm	Z	m
Z
mZmZmZ d dlZd dlZd dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dl m!Z! ddl"m#Z#m$Z$m%Z%m&Z& erd dl'm(Z( d dl)m*Z+ d dl,m-Z- d dl.m/Z/ d dl0m  m1Z2 d dl3m4Z4m5Z5 eG dd dZ6dee7 ddddde6fddZ8dd Z9dej:d e7d!d"d#ej;dd$f
d%d&Z<d'e5d(e6fd)d*Z=		+d2d'eej>j?e
ej>j? f d,e@d-e@d.ee@ d/eAdeedf fd0d1ZBdS )3    N)	dataclassfield)
MethodType)TYPE_CHECKINGCallableDictListOptionalTupleUnion)parallel_state)StrictHandlingparse_strict_flag)get_tensor_shapes)TransformerLayer)get_model_config)	lightning)llm)logging   )HiddenStateCosineLoss"LogitsAndIntermediatesLossBalancerLogitsKLLossProjectionLayer)ShardedStateDict)GPTModel)TransformerConfig)TokenizerSpec)DistillationLossBalancerDistillationModelc                   @   s   e Zd ZU dZeedZeee	e	f  e
d< dZee	e	f e
d< dZee
d< dZee
d	< d
Zeeee	e	f ejjf  e
d< d
Zee e
d< dd Zd
S )DistillationConfiga  Knowledge-Distillation config.

    Args:
        intermediate_layer_pairs: List of tuples of intermediate layer names.
        logit_layers: Tuple of logit layer names.
        skip_lm_loss: Whether to skip computing the standard language model loss (default: ``True``).
        kd_loss_scale: Relative scaling factor for the distillation loss if ``skip_lm_loss`` is ``False``.
    )default_factoryintermediate_layer_pairs)output_layerr#   logit_layersTskip_lm_lossg      ?kd_loss_scaleN	criterionloss_balancerc                 C   s`   t | jdksJ d| jtdd | jD s!J d| j| jdks.J d| jd S )N   zself.logit_layers=c                 s   s    | ]	}t |d kV  qdS )r)   N)len).0pair r-   h/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/llm/modelopt/distill/utils.py	<genexpr>A   s    z3DistillationConfig.__post_init__.<locals>.<genexpr>zself.intermediate_layer_pairs=r   zself.kd_loss_scale=)r*   r$   allr"   r&   selfr-   r-   r.   __post_init__?   s   $z DistillationConfig.__post_init__)__name__
__module____qualname____doc__r   listr"   r   r
   str__annotations__r$   r%   boolr&   floatr'   r	   r   torchnnModuler(   r   r3   r-   r-   r-   r.   r    -   s   
 	$r    config_pathstudent_cfgr   teacher_cfgreturnc           
      C   s  | r#t | }t|}W d   n1 sw   Y  tdi |}ntd t }i }|jdks6t ryt	||t
|j< t||}|jD ]1\}}t dkrdtd| d|j d| d|j d		 t||}t||}t||d
|||f< qGt|j|jd}	||_|	|_|S )a  Read the distillation yaml config file specified by ``args.export_kd_cfg``.

    Args:
        config_path: Path to user-defined distillation settings yaml file.
            If `None`, uses default logits-only distillation mode for GPT models.
        student_cfg: Model config for student model.
        teacher_cfg: Model config for teacher model.

    WARNING: Assumes intermediate hidden sizes are always that found in the model config's ``hidden_size`` attribute.
    Nz0Distillation config not provided. Using default.r   r   z0Distillation: Adding intermediate loss between `z` of student (hidden size z) and `z` of teacher (hidden size z).)projection_layer)r&   skip_original_lossr-   )openyaml	safe_loadr    r   warningpipeline_model_parallel_sizer   is_pipeline_last_stager   tupler$   r   r"   $get_tensor_and_context_parallel_rankprinthidden_size_adjust_layer_index_for_ppr   r   r&   r%   r'   r(   )
r@   rA   rB   fcfgr'   rD   student_layerteacher_layerr(   r-   r-   r.   load_distillation_configE   sF   




rU   c                 C   s   t d| }|s
| S t|}t|d| }|dk r$td|  d| |dt|}t	
 dkr@td|  d| d |S )z[Adjust any sequence-based layer indices found in a submodule name for Pipeline Parallelism.z(?<=\.)\d+(?=\.)r   zLayer z  does not fall on final PP rank.zDistillation: Renamed layer "z" on final PP rank to "")researchr   _get_layer_offsetintgroup
ValueErrorreplacer9   r   rM   rN   )submodule_name	model_cfgmatchoffsetnew_layer_idxnew_submodule_namer-   r-   r.   rP   w   s   
rP   config	ckpt_path	tokenizerr   trainerMCoreGPTModelc                 C   s   t d | |}|jj|}d|jd|di}|jj}|jjj	|||d}dd |d 
 D }	|durLt|tsLt|}tjtjtjg}
||
v }|j|	|d tj  t d	 |S )
z?Teacher model factory (must be a non-local function to pickle).z(Distillation: Loading teacher weights...
state_dictmodule.)prefixmetadata)strictc                 S   s   i | ]\}}| d d|qS )rj    )r]   )r+   kvr-   r-   r.   
<dictcomp>   s    z$teacher_provider.<locals>.<dictcomp>Nz%Distillation: teacher weights loaded.)r   infoconfigure_modelstrategyunwrapped_checkpoint_ioload_content_metadatasharded_state_dictckpt_load_strictnesscheckpoint_ioload_checkpointitems
isinstancer;   r   r   ASSUME_OK_UNEXPECTEDRAISE_UNEXPECTED	RAISE_ALLload_state_dictr=   cudaempty_cache)rd   re   rf   rg   modelsharded_sd_metadatarw   rm   
checkpointri   strict_optionsr-   r-   r.   teacher_provider   s$   



r   r   distill_cfgc                    s   t |  }t|dkr|d d dksJ d|t j|  ddd}t|| | _dtjf fd	d
}t|| | _	dtjfdd
}t|| j
| j
_	dttt  fdd}dttj fdd}t|| | _t|| | _dd }t|| | _dS )zLExtra modifications to ``mtd.DistillationModel`` required for Megatron-Core.r   r   kd_losszexisting_state=rC   r   c                 _   sH   |    t| j| g|R i |W  d    S 1 sw   Y  d S )N)hide_teacher_modeltyperw   )r2   argskwargsr-   r-   r.   _sharded_state_dict   s   
$z@adjust_distillation_model_for_mcore.<locals>._sharded_state_dictc                    s.    j r| jrtj||jdS t| | ||S N)dtype)r%   trainingr=   
zeros_liker   r   compute_language_model_lossr2   labelslogitsr   r-   r.   _compute_language_model_loss   s   zIadjust_distillation_model_for_mcore.<locals>._compute_language_model_lossc                 S   s   t j||jdS r   )r=   r   r   r   r-   r-   r.   r      s   shapesc                 S   s   |d d | _ d S )Nr   _tensor_split_idx)r2   r   r-   r-   r.   _set_student_input_tensor_shape   s   zLadjust_distillation_model_for_mcore.<locals>._set_student_input_tensor_shapeinput_tensorsc                    sJ    fdd|D } fdd|D }t   j| t   | d S )Nc                    s*   g | ]}|d ur|d j d f n|qS N.r   r+   tr1   r-   r.   
<listcomp>      * zRadjust_distillation_model_for_mcore.<locals>._set_input_tensor.<locals>.<listcomp>c                    s*   g | ]}|d ur|dd  j f n|qS r   r   r   r1   r-   r.   r      r   )r   set_input_tensorteacher_model)r2   r   teacher_inputsstudent_inputsr-   r1   r.   _set_input_tensor   s   z>adjust_distillation_model_for_mcore.<locals>._set_input_tensorc                 _   s   | j s%|   t| j| g|R i |W  d    S 1 s w   Y  t  | j  | j|i |}W d    n1 sAw   Y  |   t| j| g|R i |}W d    n1 scw   Y  t	 sutj
||gddS |S )Nr   )dim)r   only_student_forwardr   forwardr=   no_grad_teacher_modelevalr   rK   cat)r2   r   r   teacher_outputstudent_outputr-   r-   r.   _forward   s   
 


z5adjust_distillation_model_for_mcore.<locals>._forwardN)rC   r   )mtoModeloptStateManagerri   r*   remove_stater   rw   r=   Tensorr   r   r   r
   rZ   set_student_input_tensor_shaper   r   )r   r   existing_stater   r   r   r   r   r-   r   r.   #adjust_distillation_model_for_mcore   s   *
r   F
seq_lengthmicro_batch_sizedecoder_seq_lengthforward_onlyc                    s   |st  dkst  durdS ttrd tdr&jtdstts-dS dtt	t
df  dtt	t
df  f fdd	}|S )
a  
    Return the function to adjust tensor shapes for Distillation in Megatron-Core's forward pass.

    Currently only used during non-interleaved pipelining for Distillation.
    Concatenates sizes of student and teacher output tensors for inter-process communication.
    r   Nr   modulerecv_tensor_shapes.send_tensor_shapesc           	         s   t j}t }t }t |||d}t |||d}|  t| D ]\}}t|}|d  |d d 7  < t	|| |< q*t|D ]\}}t|}|d  |d d 7  < t	|||< qI| |fS )N)r   r   r   rd   tp_groupcp_groupr   r   )
r   r   r   get_tensor_model_parallel_groupget_context_parallel_groupr   r   	enumerater8   rL   )	r   r   teacher_configr   r   teacher_recv_tensor_shapesteacher_send_tensor_shapesishaper   r   r   r   r-   r.   adjust_tensor_shapes  s:   

zJget_tensor_shapes_adjust_fn_for_distillation.<locals>.adjust_tensor_shapes)r   &get_pipeline_model_parallel_world_size.get_virtual_pipeline_model_parallel_world_sizer|   r8   hasattrr   r   r   r
   rZ   )r   r   r   r   r   r   r-   r   r.   ,get_tensor_shapes_adjust_fn_for_distillation   s   



4"r   )NF)CrW   dataclassesr   r   typesr   typingr   r   r   r   r	   r
   r   r=   rG   megatron.corer   +megatron.core.dist_checkpointing.validationr   r   )megatron.core.pipeline_parallel.schedulesr   megatron.core.transformerr   megatron.core.utilsr   nemor   nlnemo.collectionsr   
nemo.utilsr   lossr   r   r   r   (megatron.core.dist_checkpointing.mappingr   "megatron.core.models.gpt.gpt_modelr   rh   ,megatron.core.transformer.transformer_configr   1nemo.collections.common.tokenizers.tokenizer_specr   modelopt.torch.optoptr   modelopt.torch.distillr   r   r    r9   rU   rP   	GPTConfigTrainerr   r   r>   r?   rZ   r;   r   r-   r-   r-   r.   <module>   sz   $
2
B
