o
    wio                     @   s  d dl Z d dlZd dlmZ d dlmZ d dlmZmZm	Z	m
Z
mZmZmZ d dlmZ d dlZd dlZd dlm  mZ d dlmZ d dlmZ d dlmZ d dlmZ d d	l m!Z! d d
lmZ d dl"m#Z$ d dl"m%Z% d dl&m'Z' d dl(m)Z)m*Z*m+Z+ d dl,m-Z- d dl.m/Z/m0Z0 d dl1m2Z2 e2d\Z3Z4erd dl5m6Z6 d dl7m8Z8 de
e9ej:f fddZ;dej:fddZ<dddddefddZ=dddddefdd Z>dddddefd!d"Z?eG d#d de!e*j@Z#eG d$d% d%e#ZAeG d&d' d'e#ZBeG d(d) d)e#ZCG d*d+ d+ejDe*j@e*jEe'jFZe*Ged,G d-d. d.e*jHd/ef ZIe*jJd0d1d2d3e*jKfd4d5ZLe*jJd6d7d2d3e*jKfd8d9ZMe*jJd:d;d2d3e*jKfd<d=ZNe*jJd>d?d2d@dA ZOe*jJdBdCd2dDdE ZPe*Qed,G dFdG dGe*jHed/f ZRe*jJd1d0d2d3e*jKfdHdIZSe*jJd7d6d2d3e*jKfdJdKZTe*jJd;d:d2d3e*jKfdLdMZUe*jJd?d>d2dNdO ZVe*jJdCdBd2dPdQ ZWg dRZXdS )S    N)	dataclass)Path)TYPE_CHECKINGAnyCallableDictLiteralOptionalUnion)InferenceWrapperConfig)T5Model)OptimizerConfig)
ModuleSpec)TransformerConfig)nnT5ConfigT5ForConditionalGeneration)fn)get_vocab_sizeioteardown)MaskedTokenLossReduction)MegatronOptimizerModuleOptimizerModule)safe_importtransformer_engineAutoTokenizer)TokenizerSpecreturnc           	         sH  ddl m} t| }t|trt|dkr|d }n|}|d dk }|d dk }|dd}|dd}||f}d}||d< ||d< ||d	< | D ],}|d	krk|| d jd
d|| d jd
df||< qN|dkrpqN|| jd
d||< qNt	   
g d | r 
d | r 
d  fdd| D }|S )z(Processing data for one step of T5 modelr   parallel_state   enc_maskg      ?dec_mask   Nenc_dec_maskT)non_blocking)r%   r&   r(   )text_enctext_dec)labels	loss_maskc                    s"   i | ]\}}|| v r|nd qS N ).0keyvalrequired_keysr/   ]/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/llm/t5/model/t5.py
<dictcomp>^   s   " z t5_data_step.<locals>.<dictcomp>)megatron.corer#   next
isinstancetuplelen	unsqueezekeyscudasetupdateis_pipeline_first_stageis_pipeline_last_stageitems)	dataloader_iterr#   batch_batchr%   r&   r(   r1   outputr/   r3   r5   t5_data_step/   s<   
.

rH   c                 C   s8   |d |d |d |d |d |d d}| d	i |S )
z&Processing a forward step for T5 modelr*   r+   r%   r&   r(   r,   )encoder_input_idsdecoder_input_idsencoder_attn_maskdecoder_attn_maskencoder_decoder_attn_mask	lm_labelsNr/   r/   )modelrE   forward_argsr/   r/   r5   t5_forward_stepc   s   	rQ   encoder_configr   decoder_configc                 C   ,   ddl m}m} || j}||j}||gS )z>Spec for T5 when using transformer_engine mcore implementationr   )1get_t5_decoder_with_transformer_engine_block_spec1get_t5_encoder_with_transformer_engine_block_spec)megatron.core.models.T5.t5_specrU   rV   
num_layers)rR   rS   rU   rV   en_block_specde_block_specr/   r/   r5   transformer_engine_layer_specq      

r[   c                 C   rT   )z1Spec for T5 when using local mcore implementationr   )$get_t5_decoder_with_local_block_spec$get_t5_encoder_with_local_block_spec)rW   r]   r^   rX   )rR   rS   r]   r^   rY   rZ   r/   r/   r5   local_layer_spec~   r\   r_   c                 C   s   t rt| |S t| |S )zFSet layer spec conditioning on whether transformer_engine is available)HAVE_TEr[   r_   rR   rS   r/   r/   r5   default_layer_spec   s   

rb   c                   @   s  e Zd ZU dZdZeed< dZeed< dZ	eed< dZ
eed< d	Zeed
< dZed ed< dZeed< dZeed< dZeed< d	Zeed< dZeed< dZee ed< dZeed< d	Zeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed < dZeed!< dZ ee ed"< d#Z!eed$< dZ"eed%< dZ#eed&< e$Z%e&e'e(d ge'f f ed'< e)Z*e(ed(< e+Z,e(ed)< dZ-ee ed*< dZ.ee&e/e0e/e1f f  ed+< d1d,ee d-d.fd/d0Z2dS )2r   zPModel config for T5 model. Adpated from megatron.core.models.t5.t5_model.T5ModelNencoder_num_layersFfp16_lm_cross_entropyTparallel_output#share_embeddings_and_output_weights   make_vocab_size_divisible_bylearned_absolute)ri   ropeposition_embedding_typeapply_rope_fusioni   max_position_embeddings    relative_attention_num_bucketsrelative_attention_max_distance      ?rotary_percentseq_len_interpolation_factor
seq_lengthseq_length_decr   $encoder_pipeline_model_parallel_sizeattention_softmax_in_fp32bias_activation_fusionmasked_softmax_fusionpersist_layer_normbias_dropout_fusiondeallocate_pipeline_outputs"pipeline_model_parallel_split_ranknum_moe_expertsr'   recompute_num_layersdistribute_saved_activationsenable_autocasttransformer_layer_specforward_step_fndata_step_fn
vocab_sizetp_comm_overlap_cfgvp_stager!   MCoreT5Modelc           
      C   s8  | j du r	|du sJ d| j }|r"| j}| j| | dks"J dddlm} t| }| j|_| jdkrC| jdks?J d| j|_| j	}t
|tsQ||| d}| jdurq| j}|durptd	| d
|j d||j  d nt| |j| j}t| ||d |d || j| j| j| j| j| j| j| | d}	|	S ).Setup the T5 Model based on config definition.NzpVirtual pipeline model parallelism is temporarily unsupported in T5 due to upstream MCore T5Model API dependencyr   zLMake sure the number of model chunks is the same across all pipeline stages.r"   r'   z0Need to know how to shard the encoder & decoder.ra   zUse preset vocab_size: z, original vocab_size: z, dummy tokens: .)configrR   transformer_encoder_layer_spectransformer_decoder_layer_specr   max_sequence_lengthrd   re   rf   rk   rr   rs   pre_processpost_process)$virtual_pipeline_model_parallel_sizepipeline_model_parallel_sizerX   r7   r#   copydeepcopyrc   rv   r   r9   r   r   logginginfor   rh   r   rm   rd   re   rf   rk   rr   rs   rA   rB   )
self	tokenizerr   vp_sizep_sizer#   rR   r   r   rO   r/   r/   r5   configure_model   sb   



zT5Config.configure_modelr.   )3__name__
__module____qualname____doc__rc   int__annotations__rd   boolre   rf   rh   rk   r   rl   rm   ro   rp   rr   floatrs   r	   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   r~   r   r   r   rb   r   r
   r   r   rQ   r   rH   r   r   r   strdictr   r   r/   r/   r/   r5   r      sD   
  c                   @   sN   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed< dZ
eed	< d
S )T5Config220Mz
    NeMo's T5 model variant
    https://github.com/NVIDIA/NeMo-Framework-Launcher/blob/main/launcher_scripts/conf/training/t5/220m.yaml
       rX   rc   i   hidden_sizei   ffn_hidden_sizenum_attention_headsNr   r   r   r   rX   r   r   rc   r   r   r   r/   r/   r/   r5   r      s   
 r   c                   @   N   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed< d	Z
eed
< dS )
T5Config3BzConfig for 3B T5 model   rX   rc   i   r   i   r   rn   r   Nr   r/   r/   r/   r5   r         
 r   c                   @   r   )T5Config11BzConfig for 11B T5 modelr   rX   rc   i   r   i (  r   @   r   Nr   r/   r/   r/   r5   r   
  r   r   c                       s  e Zd ZdZ			d#dedee ded deeej	gej	f  f fdd	Z
d$ddZ		d%dejdejdejdejdejdeej d
ejfddZd
eeejf fddZd
ejfddZd&d
ejfddZd&d
ejfddZd
ejfddZed
efdd Zed
efd!d"Z  ZS )'r   zT5 Lightning ModuleNr   optimr   r    model_transformc                    sP   t    || _|| _|pttdddd| _| j|  || _d | _	d | _
d S )Ng-C6?T)lruse_distributed_optimizerr   )super__init__r   r   r   r   r   connectr   _training_loss_reduction_validation_loss_reduction)r   r   r   r   r   	__class__r/   r5   r     s   

zT5Model.__init__r!   c                 C   s"   t | ds| j| j| _dS dS )r   moduleN)hasattrr   r   r   r   r   r/   r/   r5   r   )  s   
zT5Model.configure_modelrI   rJ   rK   rL   rM   rN   c           	   	   C   s   | j |||||||d}|S )zPCall the forward method of the underlying model, and return whatever it outputs.)rI   rJ   rK   rL   rM   rN   inference_params)r   )	r   rI   rJ   rK   rL   rM   rN   r   output_tensorr/   r/   r5   forward.  s   
zT5Model.forwardc                 C   s   | j |S r.   )r   r   )r   rD   r/   r/   r5   	data_stepF  s   zT5Model.data_stepc                 C   s   | j | |S r.   )r   r   )r   rE   r/   r/   r5   forward_stepI  s   zT5Model.forward_stepc                 C   
   |  |S r.   r   r   rE   	batch_idxr/   r/   r5   training_stepL  s   
zT5Model.training_stepc                 C   r   r.   r   r   r/   r/   r5   validation_stepP  s   
zT5Model.validation_stepc                 C   sv   | j }|rt|tu rnt|dd}|s|du st|tur"tdt|jj||| jj	d}ddl
m} |||}|S )z=This is to get the MCore model required in T5InferenceWrapperr   Nz=Exact MCoreT5Model instance not found in the model structure.)r   params_dtype&inference_batch_times_seqlen_thresholdpadded_vocab_sizer   )T5InferenceWrapper)r   typer   getattr
ValueErrorr   r   r   r   r   Hmegatron.core.inference.model_inference_wrappers.t5.t5_inference_wrapperr   )r   r   r   mcore_modelinference_wrapper_configr   model_inference_wrapperr/   r/   r5   get_inference_wrapperU  s"   
zT5Model.get_inference_wrapperc                 C   s   | j st | _ | j S r.   )r   r   r   r/   r/   r5   training_loss_reductionj  s   zT5Model.training_loss_reductionc                 C   s   | j s	tdd| _ | j S )NT)r   )r   r   r   r/   r/   r5   validation_loss_reductionq  s   z!T5Model.validation_loss_reduction)NNN)r!   N)NNr.   )r   r   r   r   r   r	   r   r   r   Moduler   r   torchTensorr   r   r   r   r   r   r   r   propertyr   r   r   __classcell__r/   r/   r   r5   r     sR    
	
r   hfc                   @   sX   e Zd ZdZdefddZdedefddZdd	 Ze	dddZ
e	defddZdS )HFT5Importerz<Importer Connector for converting HF Google T5 Model to NeMor!   c                 C   s   t | j| jdS )N)r   )r   r   r   r   r/   r/   r5   init}  s   zHFT5Importer.initoutput_pathc                 C   sr   ddl m} |jt| dd}|  }| |}| || | || td| d|j	 d t
|| ~~|S )Nr   r   auto)torch_dtypez+Converted T5 model to Nemo, model saved to z in r   )transformersr   from_pretrainedr   r   
nemo_setupconvert_state	nemo_saveprintdtyper   )r   r   r   sourcetargettrainerr/   r/   r5   apply  s   

zHFT5Importer.applyc              	   C   s   i ddddddddd	d
dddddddddddddddddddddd d!d"}t |jd#d$r?|d= tj|||tttttgd%gd&S )'z,Converting HF state dict to NeMo state dict.shared.weight embedding.word_embeddings.weightlm_head.weightlm_head.output_layer.weightDencoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight7encoder_relative_pos_emb.relative_attention_bias.weightDdecoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight7decoder_relative_pos_emb.relative_attention_bias.weight)encoder.block.*.layer.0.layer_norm.weight<encoder.layers.*.self_attention.linear_qkv.layer_norm_weight.encoder.block.*.layer.0.SelfAttention.o.weight2encoder.layers.*.self_attention.linear_proj.weight)encoder.block.*.layer.1.layer_norm.weight1encoder.layers.*.mlp.linear_fc1.layer_norm_weight0encoder.block.*.layer.1.DenseReluDense.wo.weight&encoder.layers.*.mlp.linear_fc2.weightencoder.final_layer_norm.weightencoder.final_layernorm.weight)decoder.block.*.layer.0.layer_norm.weight<decoder.layers.*.self_attention.linear_qkv.layer_norm_weight.decoder.block.*.layer.0.SelfAttention.o.weight2decoder.layers.*.self_attention.linear_proj.weight)decoder.block.*.layer.1.layer_norm.weight0decoder.layers.*.pre_cross_attn_layernorm.weight0decoder.block.*.layer.1.EncDecAttention.q.weight0decoder.layers.*.cross_attention.linear_q.weight0decoder.block.*.layer.1.EncDecAttention.o.weight3decoder.layers.*.cross_attention.linear_proj.weight)decoder.block.*.layer.2.layer_norm.weight1decoder.layers.*.mlp.linear_fc1.layer_norm_weight0decoder.block.*.layer.2.DenseReluDense.wo.weight&decoder.layers.*.mlp.linear_fc2.weightdecoder.final_layer_norm.weightdecoder.final_layernorm.weighttie_word_embeddingsFzoutput_layer.weightmapping
transformsstate_dict_ignored_entries)	r   r   r   apply_transforms_import_encoder_qkv_import_encoder_linear_fc1_import_decoder_qkv_import_decoder_kv_import_decoder_linear_fc1)r   r   r   r  r/   r/   r5   r     sj   	zHFT5Importer.convert_stater   c                 C   s&   ddl m} d}|| t| |dS )Retrieve Tokenizer from HFr   r   z<pad>)	bos_token)=nemo.collections.common.tokenizers.huggingface.auto_tokenizerr   save_hf_tokenizer_assetsr   )r   r   r  r/   r/   r5   r     s   zHFT5Importer.tokenizerc                 C   s   ddl m} |t| }dd }td!i d|jd|jd|jd|jd	|jd
|j	ddd|j
d|jdtjddd|jddd|jddd||jdt|dddddddtjdd}|S )"'Generate NeMo Config based on HF configr   r   c                 S   s(   d}| | dkr|d }| | dks|S )Nrg   r      r/   )r   baser/   r/   r5   rh     s
   z9HFT5Importer.config.<locals>.make_vocab_size_divisible_byrX   rc   r   r   kv_channelsr   rk   relativero   rp   activation_funcadd_bias_linearFinit_method_stdnormalizationRMSNormlayernorm_epsilongated_linear_unitTrh   rf   r  fp16bf16r   softmax_scalerq   Nr/   )r   r   r   r   rX   num_decoder_layersd_modeld_ffd_kv	num_headsro   rp   Fgeluinitializer_factorlayer_norm_epsilonr   r   r   float32)r   
HFT5Configr   rh   rG   r/   r/   r5   r     s^   	

zHFT5Importer.configN)r!   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r/   r/   r/   r5   r   y  s    .	r   r   )z.encoder.block.*.layer.0.SelfAttention.q.weightz.encoder.block.*.layer.0.SelfAttention.k.weightz.encoder.block.*.layer.0.SelfAttention.v.weightz1encoder.layers.*.self_attention.linear_qkv.weight)
source_key
target_keyctxc              	   C     | j j}|j}|j}|j}| }||f|dd   }	|j|	 }|j|	 }|j|	 }td|f|dd   }
t	|D ]>}t
|
|||d d d d d f f}
t
|
|||d d d d d f f}
t
|
|||d d d d d f f}
q;|
|d|  |g}
|
S Nr'   r   r$   r   r   r   r   r$  sizeviewr   emptyrangecatreshaper=  qkvmegatron_confighead_numr   	head_sizeold_tensor_shapenew_q_tensor_shapeqkv_weightsir/   r/   r5   r        



((*r  )z.decoder.block.*.layer.0.SelfAttention.q.weightz.decoder.block.*.layer.0.SelfAttention.k.weightz.decoder.block.*.layer.0.SelfAttention.v.weightz1decoder.layers.*.self_attention.linear_qkv.weightc              	   C   r>  r?  r@  rG  r/   r/   r5   r    rR  r  )z0decoder.block.*.layer.1.EncDecAttention.k.weightz0decoder.block.*.layer.1.EncDecAttention.v.weightz1decoder.layers.*.cross_attention.linear_kv.weightc              	   C   s   | j j}|j}|j}|j}| }||f|dd   }|j| }|j| }td|f|dd   }	t	|D ]*}
t
|	||
|
d d d d d f f}	t
|	||
|
d d d d d f f}	q6|	|d|  |g}	|	S )Nr'   r   r"  r@  )r=  rI  rJ  rK  rL  r   rM  rN  new_k_tensor_shape
kv_weightsrQ  r/   r/   r5   r  3  s   	

(*r  )z2encoder.block.*.layer.1.DenseReluDense.wi_0.weightz2encoder.block.*.layer.1.DenseReluDense.wi_1.weight&encoder.layers.*.mlp.linear_fc1.weightc                 C      t j| |fddS Nr   )axisr   rE  downgater/   r/   r5   r  Q     r  )z2decoder.block.*.layer.2.DenseReluDense.wi_0.weightz2decoder.block.*.layer.2.DenseReluDense.wi_1.weight&decoder.layers.*.mlp.linear_fc1.weightc                 C   rV  rW  rY  rZ  r/   r/   r5   r  \  r]  r  c                   @   sN   e Zd ZdZdddZdedefddZd	d
 Zedd Z	edddZ
dS )HFT5Exporterz5Exporter Connector for converting NeMo T5 Model to HFr!   r   c                 C   sB   ddl m} |  t| jdW  d    S 1 sw   Y  d S )Nr   )no_init_weightsr   )transformers.modeling_utilsr`  r   r   )r   r`  r/   r/   r5   r   k  s   
$zHFT5Exporter.initr   c                 C   sH   |  t| \}}|  }| ||}| }|| | j| |S r.   )	nemo_loadr   r   r   cpusave_pretrainedr   )r   r   r   _r   r/   r/   r5   r   q  s   
zHFT5Exporter.applyc                 C   s   i ddddddddd	d
dddddddddddddddddddddd d!d"}|j jr<|d= |j jd#krH|d= |d= tttg}|j jr\|t |t	 nd$|d%< d&|d'< t
j||||d(d)gd*S )+z#Convert NeMo state dict to HF styler   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r   r  r  r  r  r  r  r	  r  r  r
  r  r  r  r  r  r  r%  z0encoder.block.*.layer.1.DenseReluDense.wi.weightrU  z0decoder.block.*.layer.2.DenseReluDense.wi.weightr^  zencoder.embed_tokens.weightzdecoder.embed_tokens.weightr  )r   rf   rk   _export_encoder_qkv_export_decoder_qkv_export_decoder_kvr,  append_export_encoder_linear_fc1_export_decoder_linear_fc1r   r  )r   r   r   r  r  r/   r/   r5   r   |  sr   	

zHFT5Exporter.convert_statec                 C   s(   t t| jj}|j| _|j| _|jS )r  )r   load_contextr   rO   r   bos_idpad_id)r   nemo_tokenizerr/   r/   r5   r     s   zHFT5Exporter.tokenizerr:  c                 C   s   t jt| dd}ddlm} t t| jj}|j}|j}|j	}dd }|di d|j
d|jd	|jd
|jd|jd|jd|jd|jd|jd|jd|| jj|jd|jr_dndd|jd|d|d|S d|jd|d|d|S )r!  zmodel.config)subpathr   r   c                 S   s.   dd l }|dkrtdt|| | | S )Nr   zDivisor cannot be zero.)mathr   r   ceil)numberdivisorrq  r/   r/   r5   round_up_to_divisible  s   z2HFT5Exporter.config.<locals>.round_up_to_divisiblerX   r0  r1  r2  r3  r4  ro   rp   r7  r8  r   feed_forward_projz
gated-gelur6  r  decoder_start_token_idpad_token_ideos_token_idNr/   )r   rl  r   r   r   rO   r   rm  rn  eos_idrX   rc   r   r   r$  r   ro   rp   r(  r+  r   rh   r,  rf   )r   r   r:  ro  rm  rn  rz  ru  r/   r/   r5   r     sd   	
zHFT5Exporter.configN)r!   r   )r!   r:  )r   r   r   r   r   r   r   r   r   r   r   r/   r/   r/   r5   r_  g  s    
4
	r_  c                       | j j}|j}|j}||  |j}|j}|d|  }||||g}t fddt	|D }t
 | d }	t
 d | d }
|| d| }||	 d| }||
 d| }|||fS )Nr"  c                    ,   g | ]}t  d  |  d  |   qS r"  r   aranger0   rQ  heads_per_groupr/   r5   
<listcomp>      z'_export_encoder_qkv.<locals>.<listcomp>r'   r   r   r   num_query_groupsr   r$  rF  r   rE  rD  r  rc  r=  
linear_qkvrK  rL  r  r   rM  qkv_total_dimq_slicek_slicev_sliceq_projk_projv_projr/   r  r5   rf    &   	

rf  c                    r{  )Nr"  c                    r|  r}  r~  r  r  r/   r5   r    r  z'_export_decoder_qkv.<locals>.<listcomp>r'   r  r  r  r/   r  r5   rg    r  rg  c                 C   s~   | j j}|j}|j}|j}d| }||||g}td|d}td|d}|| d| }	|| d| }
|	|
fS )Nr"  r   r'   r  )	r   r   r  r   r$  rF  r   r  rc  )r=  	linear_kvrK  r  r   rM  kv_total_dimr  r  r  r  r/   r/   r5   rh  '  s   rh  c                 C      t j| ddd\}}||fS Nr"  r   )dimr   chunk
linear_fc1	gate_projup_projr/   r/   r5   rj  @     rj  c                 C   r  r  r  r  r/   r/   r5   rk  M  r  rk  )r   r   rH   rQ   r[   r_   )Yr   r   dataclassesr   pathlibr   typingr   r   r   r   r   r	   r
   lightning.pytorchpytorchLr   torch.distributedtorch.nn.functionalr   
functionalr5  Imegatron.core.inference.model_inference_wrappers.inference_wrapper_configr    megatron.core.models.T5.t5_modelr   r   megatron.core.optimizerr   $megatron.core.transformer.spec_utilsr   ,megatron.core.transformer.transformer_configr   r   r   r:  r   nemo.collections.llmr   nemo.lightningr   r   r    nemo.lightning.megatron_parallelr   nemo.lightning.pytorch.optimr   r   nemo.utils.import_utilsr   re  r`   r  r   1nemo.collections.common.tokenizers.tokenizer_specr    r   r   rH   rQ   r[   r_   rb   IOMixinr   r   r   LightningModuleConnectorMixinFNMixinmodel_importerModelConnectorr   state_transformTransformCTXr  r  r  r  r  model_exporterr_  rf  rg  rh  rj  rk  __all__r/   r/   r/   r5   <module>   s   $4]


dw


y

