o
    }oi?3                     @   s  d dl Z d dlmZmZ d dlmZ d dlmZmZm	Z	m
Z
mZmZmZ d dlZd dlZd dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZmZmZ d d
lmZ d dl m!Z! d dl"m#Z# d dl$m%Z%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+ erd dl,m-Z- d dl.m/Z0 d dl1m2Z2 d dl3m4Z4 e%d\Z5Z6e&dde7d\Z8Z9e&dde7d\Z:Z9eG dd dZ;dee< ddddde;fd d!Z=d"d# Z>d$e!j?d%e<d&d'd(ej@dd)f
d*d+ZAd,e8d-e;fd.d/ZB		0d7d,eejCjDe
ejCjD f d1eEd2eEd3eeE d4eFdeedf fd5d6ZGdS )8    N)	dataclassfield)
MethodType)TYPE_CHECKINGCallableDictListOptionalTupleUnion)parallel_state)StrictHandlingparse_strict_flag)get_tensor_shapes)TransformerLayer)get_model_configget_model_typeget_model_xattn)	lightning)llm)logging)safe_importsafe_import_from   )HiddenStateCosineLoss"LogitsAndIntermediatesLossBalancerLogitsKLLossProjectionLayer)ShardedStateDict)GPTModel)TransformerConfig)TokenizerSpeczmodelopt.torch.optzmodelopt.torch.distillDistillationModel)altDistillationLossBalancerc                   @   s   e Zd ZU dZeedZeee	e	f  e
d< dZee	e	f e
d< dZee
d< dZee
d	< d
Zeeee	e	f ejjf  e
d< d
Zee e
d< dd Zd
S )DistillationConfiga  Knowledge-Distillation config.

    Args:
        intermediate_layer_pairs: List of tuples of intermediate layer names.
        logit_layers: Tuple of logit layer names.
        skip_lm_loss: Whether to skip computing the standard language model loss (default: ``True``).
        kd_loss_scale: Relative scaling factor for the distillation loss if ``skip_lm_loss`` is ``False``.
    )default_factoryintermediate_layer_pairs)output_layerr(   logit_layersTskip_lm_lossg      ?kd_loss_scaleN	criterionloss_balancerc                 C   s`   t | jdksJ d| jtdd | jD s!J d| j| jdks.J d| jd S )N   zself.logit_layers=c                 s   s    | ]	}t |d kV  qdS )r.   N)len).0pair r2   _/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/modelopt/distill/utils.py	<genexpr>C   s    z3DistillationConfig.__post_init__.<locals>.<genexpr>zself.intermediate_layer_pairs=r   zself.kd_loss_scale=)r/   r)   allr'   r+   selfr2   r2   r3   __post_init__A   s   $z DistillationConfig.__post_init__)__name__
__module____qualname____doc__r   listr'   r   r
   str__annotations__r)   r*   boolr+   floatr,   r	   r   torchnnModuler-   r$   r8   r2   r2   r2   r3   r%   /   s   
 	$r%   config_pathstudent_cfgr    teacher_cfgreturnc           
      C   s  | r#t | }t|}W d   n1 sw   Y  tdi |}ntd t }i }|jdks6t ryt	||t
|j< t||}|jD ]1\}}t dkrdtd| d|j d| d|j d		 t||}t||}t||d
|||f< qGt|j|jd}	||_|	|_|S )a  Read the distillation yaml config file specified by ``args.export_kd_cfg``.

    Args:
        config_path: Path to user-defined distillation settings yaml file.
            If `None`, uses default logits-only distillation mode for GPT models.
        student_cfg: Model config for student model.
        teacher_cfg: Model config for teacher model.

    WARNING: Assumes intermediate hidden sizes are always that found in the model config's ``hidden_size`` attribute.
    Nz0Distillation config not provided. Using default.r   r   z0Distillation: Adding intermediate loss between `z` of student (hidden size z) and `z` of teacher (hidden size z).)projection_layer)r+   skip_original_lossr2   )openyaml	safe_loadr%   r   warningpipeline_model_parallel_sizer   is_pipeline_last_stager   tupler)   r   r'   $get_tensor_and_context_parallel_rankprinthidden_size_adjust_layer_index_for_ppr   r   r+   r*   r,   r-   )
rE   rF   rG   fcfgr,   rI   student_layerteacher_layerr-   r2   r2   r3   load_distillation_configG   sF   




rZ   c                 C   s   t d| }|s
| S t|}t|d| }|dk r$td|  d| |dt|}t	
 dkr@td|  d| d |S )z[Adjust any sequence-based layer indices found in a submodule name for Pipeline Parallelism.z(?<=\.)\d+(?=\.)r   zLayer z  does not fall on final PP rank.zDistillation: Renamed layer "z" on final PP rank to "")researchr   _get_layer_offsetintgroup
ValueErrorreplacer>   r   rR   rS   )submodule_name	model_cfgmatchoffsetnew_layer_idxnew_submodule_namer2   r2   r3   rU   y   s   
rU   config	ckpt_path	tokenizerr!   trainerMCoreGPTModelc           
      C   s   t d | |}d|jddi}|jj}|jjj|||d}dd |d  D }|durDt	|t
sDt|}tjtjtjg}	||	v }|j||d tj  t d	 |S )
z?Teacher model factory (must be a non-local function to pickle).z(Distillation: Loading teacher weights...
state_dictmodule.)prefix)strictc                 S   s   i | ]\}}| d d|qS )ro    )rb   )r0   kvr2   r2   r3   
<dictcomp>   s    z$teacher_provider.<locals>.<dictcomp>Nz%Distillation: teacher weights loaded.)r   infoconfigure_modelsharded_state_dictstrategyckpt_load_strictnesscheckpoint_ioload_checkpointitems
isinstancer@   r   r   ASSUME_OK_UNEXPECTEDRAISE_UNEXPECTED	RAISE_ALLload_state_dictrB   cudaempty_cache)
ri   rj   rk   rl   modelrx   rq   
checkpointrn   strict_optionsr2   r2   r3   teacher_provider   s"   



r   r   distill_cfgc                    s   t |  }t|dkr|d d dksJ d|t j|  ddd}t|| | _dtjf fd	d
}t|| | _	dtjfdd
}t|| j
| j
_	dttt  fdd}dttj fdd}t|| | _t|| | _dd }t|| | _dS )zLExtra modifications to ``mtd.DistillationModel`` required for Megatron-Core.r   r   kd_losszexisting_state=rH   r   c                 _   sH   |    t| j| g|R i |W  d    S 1 sw   Y  d S )N)hide_teacher_modeltyperx   )r7   argskwargsr2   r2   r3   _sharded_state_dict   s   
$z@adjust_distillation_model_for_mcore.<locals>._sharded_state_dictc                    s.    j r| jrtj||jdS t| | ||S N)dtype)r*   trainingrB   
zeros_liker   r   compute_language_model_lossr7   labelslogitsr   r2   r3   _compute_language_model_loss   s   zIadjust_distillation_model_for_mcore.<locals>._compute_language_model_lossc                 S   s   t j||jdS r   )rB   r   r   r   r2   r2   r3   r      s   shapesc                 S   s   |d d | _ d S )Nr   _tensor_split_idx)r7   r   r2   r2   r3   _set_student_input_tensor_shape   s   zLadjust_distillation_model_for_mcore.<locals>._set_student_input_tensor_shapeinput_tensorsc                    sJ    fdd|D } fdd|D }t   j| t   | d S )Nc                    s*   g | ]}|d ur|d j d f n|qS N.r   r0   tr6   r2   r3   
<listcomp>      * zRadjust_distillation_model_for_mcore.<locals>._set_input_tensor.<locals>.<listcomp>c                    s*   g | ]}|d ur|dd  j f n|qS r   r   r   r6   r2   r3   r      r   )r   set_input_tensorteacher_model)r7   r   teacher_inputsstudent_inputsr2   r6   r3   _set_input_tensor   s   z>adjust_distillation_model_for_mcore.<locals>._set_input_tensorc                 _   s   | j s%|   t| j| g|R i |W  d    S 1 s w   Y  t  | j  | j|i |}W d    n1 sAw   Y  |   t| j| g|R i |}W d    n1 scw   Y  t	 sutj
||gddS |S )Nr   )dim)r   only_student_forwardr   forwardrB   no_grad_teacher_modelevalr   rP   cat)r7   r   r   teacher_outputstudent_outputr2   r2   r3   _forward   s   
 


z5adjust_distillation_model_for_mcore.<locals>._forwardN)rH   r   )mtoModeloptStateManagerrn   r/   remove_stater   rx   rB   Tensorr   r   r   r
   r_   set_student_input_tensor_shaper   r   )r   r   existing_stater   r   r   r   r   r2   r   r3   #adjust_distillation_model_for_mcore   s   *
r   F
seq_lengthmicro_batch_sizedecoder_seq_lengthforward_onlyc                    s   t sdS |st dkst durdS ttrd tdr*jtds"tts1dS dt	t
tdf  dt	t
tdf  f fdd	}|S )
a  
    Return the function to adjust tensor shapes for Distillation in Megatron-Core's forward pass.

    Currently only used during non-interleaved pipelining for Distillation.
    Concatenates sizes of student and teacher output tensors for inter-process communication.
    Nr   r   modulerecv_tensor_shapes.send_tensor_shapesc           
   	      s   t  }tj}tj}tj}t|d | ||d}t|| ||d}|  t| D ]\}}	t	|	}	|	d  |d d 7  < t
|	| |< q4t|D ]\}}	t	|	}	|	d  |d d 7  < t
|	||< qS| |fS )Nr   )rank
model_typer   r   r   ri   encoder_decoder_xattnr   r   )r    get_pipeline_model_parallel_rankr   r   r   r   r   r   	enumerater=   rQ   )
r   r   r   teacher_configteacher_model_typeteacher_encoder_decoder_xattnteacher_recv_tensor_shapesteacher_send_tensor_shapesishaper   r   r   r   r2   r3   adjust_tensor_shapes  s@   


	
	zJget_tensor_shapes_adjust_fn_for_distillation.<locals>.adjust_tensor_shapes)HAVE_MODELOPTr   &get_pipeline_model_parallel_world_size.get_virtual_pipeline_model_parallel_world_sizer~   r=   hasattrr   r"   r   r
   r_   )r   r   r   r   r   r   r2   r   r3   ,get_tensor_shapes_adjust_fn_for_distillation   s    



4%r   )NF)Hr\   dataclassesr   r   typesr   typingr   r   r   r   r	   r
   r   rB   rL   megatron.corer   +megatron.core.dist_checkpointing.validationr   r   )megatron.core.pipeline_parallel.schedulesr   megatron.core.transformerr   megatron.core.utilsr   r   r   nemor   nlnemo.collectionsr   
nemo.utilsr   nemo.utils.import_utilsr   r   lossr   r   r   r   (megatron.core.dist_checkpointing.mappingr   "megatron.core.models.gpt.gpt_modelr   rm   ,megatron.core.transformer.transformer_configr    1nemo.collections.common.tokenizers.tokenizer_specr!   r   r   objectr"   _r$   r%   r>   rZ   rU   	GPTConfigTrainerr   r   rC   rD   r_   r@   r   r2   r2   r2   r3   <module>   s~   $
2
B
