o
    }oihE                  	   @   s   d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	 zddl
mZmZmZ dZW n eefy;   eZd	ZY nw ejejjfZejejjfZejejjfZG d
d dejjZdd Zdd Zdd ZG dd deZdS )zMegatron Module    N)Variable)	Parameter)ApexGuardDefaults)logging)ModelParallelConfigparallel_statetensor_parallelTFc                       s   e Zd ZdZd!def fddZdd Zd	d
 Zdd Zdd Z	dd Z
dd Zdd Zdd Zd"ddZdd Zdd Zdd  Z  ZS )#MegatronModulezMMegatron specific extensions of torch Module with support
    for pipelining.NTconfigc                    s*   t stdtt|   || _|| _d S )Nzmegatron-core was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt.)HAVE_MEGATRON_COREImportErrorsuperr	   __init__r
   share_token_embeddings)selfr
   r   	__class__ g/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/nlp/modules/common/megatron/module.pyr   0   s   
zMegatronModule.__init__c                 C   sb   | j r&t| dr| jjjjS t| dr| jjjS t| dr"| jjjS td| j	s-t
d| jjS Nlanguage_modelencoder_embeddingdecoder_embeddingPre_process is True, but no embedding is found on this rank. Looked for language_model.embedding, encoder_embedding, and decoder_embeddingzSword_embeddings_weight() called for last stage, but share_token_embeddings is false)pre_processhasattrr   	embeddingword_embeddingsweightr   r   
ValueErrorr   	Exceptionr   r   r   r   word_embeddings_weight9   s   




z%MegatronModule.word_embeddings_weightc                 C   sT   | j r&t| dr| jjjjS t| dr| jjjS t| dr"| jjjS tdtd)Nr   r   r   r   BPre_process is False, there is no position embedding on this rank.)	r   r   r   r   position_embeddingsr   r   r   r   r!   r   r   r   position_embeddings_weightM   s   




z)MegatronModule.position_embeddings_weightc                 C      t | dr
| jjjS tdN#encoder_relative_position_embeddingzNo encoder_relative_position_embedding found on this rank. Looking for encoder_relative_position_embedding.relative_position_embedding.weight)r   r(   relative_position_embeddingr   r   r!   r   r   r   +encoder_relative_position_embeddings_weight]   
   

z:MegatronModule.encoder_relative_position_embeddings_weightc                 C   r&   N#decoder_relative_position_embeddingzNo decoder_relative_position_embedding found on this rank. Looking for decoder_relative_position_embedding.relative_position_embedding.weight)r   r-   r)   r   r   r!   r   r   r   +decoder_relative_position_embeddings_weighte   r+   z:MegatronModule.decoder_relative_position_embeddings_weightc                 C   r&   N3decoder_cross_attention_relative_position_embeddingzNo decoder_cross_attention_relative_position_embedding found on this rank. Looking for decoder_cross_attention_relative_position_embedding.relative_position_embedding.weight)r   r0   r)   r   r   r!   r   r   r   ;decoder_cross_attention_relative_position_embeddings_weightm   r+   zJMegatronModule.decoder_cross_attention_relative_position_embeddings_weightc                 C   s   | j stdt dkrd S t r7| js7t rJ d| _tj	|||| j
d| _| jjjd d| jj_t sY| jr[t| drK| jj  d S t| dsRJ | j  d S d S d S )	NzKinitialize_word_embeddings() was called but share_token_embeddings is false   word_embeddings_for_head)init_methodr
   r   Tr   r   )r   r    r   &get_pipeline_model_parallel_world_sizeis_pipeline_last_stager   is_pipeline_first_stage_word_embeddings_for_head_keyr   VocabParallelEmbeddingr
   r   r   datafill_sharedr   r   r   zero_parametersr   )r   r4   
vocab_sizehidden_sizer   r   r   initialize_word_embeddingsu   s*   

z)MegatronModule.initialize_word_embeddingsc                 C   sL   t j rt r| jrt jj|  jt	 d d S d S d S t
d d S )NgroupzWARNING! Distributed processes aren't initialized, so word embeddings in the last layer are not synchronized. If you are just manipulating a model this is fine, but this needs to be handled manually. If you are training something is definitely wrong.)torchdistributedis_initializedr   is_rank_in_embedding_groupr   
all_reducer"   r:   get_embedding_groupr   warningr!   r   r   r   sync_initial_word_embeddings   s   

z+MegatronModule.sync_initial_word_embeddingsc                 C   s>   t  rt  d ur|  }tjj|jt  d d S d S d S NrA   )	r   #is_rank_in_position_embedding_group&get_pipeline_model_parallel_split_rankr%   rC   rD   rG   r:   get_position_embedding_groupr   r$   r   r   r    sync_initial_position_embeddings   s   z/MegatronModule.sync_initial_position_embeddings Fc                 C   s   |  |||S )zLUse this function to override the state dict for
        saving checkpoints.)
state_dictr   destinationprefix	keep_varsr   r   r   state_dict_for_save_checkpoint   s   z-MegatronModule.state_dict_for_save_checkpointc                 C   .   t  r|  }tjj|jt  d d S d S rK   )r   4is_rank_in_encoder_relative_position_embedding_groupr*   rC   rD   rG   r:   -get_encoder_relative_position_embedding_grouprO   r   r   r   1sync_initial_encoder_relative_position_embeddings   s   

z@MegatronModule.sync_initial_encoder_relative_position_embeddingsc                 C   rX   rK   )r   4is_rank_in_decoder_relative_position_embedding_groupr.   rC   rD   rG   r:   -get_decoder_relative_position_embedding_grouprO   r   r   r   1sync_initial_decoder_relative_position_embeddings      

z@MegatronModule.sync_initial_decoder_relative_position_embeddingsc                 C   rX   rK   )r   r\   r1   rC   rD   rG   r:   r]   rO   r   r   r   Async_initial_decoder_cross_attention_relative_position_embeddings   r_   zPMegatronModule.sync_initial_decoder_cross_attention_relative_position_embeddings)NTNrQ   F)__name__
__module____qualname____doc__r   r   r"   r%   r*   r.   r1   r@   rJ   rP   rW   r[   r^   r`   __classcell__r   r   r   r   r	   ,   s    	1
r	   c                    s>   t | ttfs | S  fdd| D }t | trt|}|S )zeApply conversion to val. Recursively apply conversion if `val`
    #is a nested tuple/list structure.c                    s   g | ]}t | qS r   conversion_helper).0v
conversionr   r   
<listcomp>   s    z%conversion_helper.<locals>.<listcomp>)
isinstancetuplelist)valrl   rtnr   rk   r   rh      s   
rh   c                    s    fdd}t | |S )zConvert fp32 `val` to fp16/bf16c                    s.   | }t |ttfr| j}t |tr | } | S N)rn   r   r   r:   _FLOAT_TYPESrq   val_typecheckfloat16_converterr   r   half_conversion   s   
z(fp32_to_float16.<locals>.half_conversionrg   )rq   rx   ry   r   rw   r   fp32_to_float16   s   
rz   c                 C   s   dd }t | |S )zConvert fp16/bf16 `val` to fp32c                 S   s2   | }t |ttfr| j}t |ttfr|  } | S rs   )rn   r   r   r:   _BF16_TYPES_HALF_TYPESfloatru   r   r   r   float_conversion   s   z)float16_to_fp32.<locals>.float_conversionrg   )rq   r~   r   r   r   float16_to_fp32   s   
r   c                       sp   e Zd Zddef fddZdd Zdd ZdddZdddZdd Z	dd Z
dd Zdd Zdd Z  ZS )Float16ModuleTr
   c                    s   t stdt j||d || _|dv r"| d|  dd }n|dv r3| d|  dd }ntd	| d
|| _	d S )NzMegatron-core was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt.)r
   r   )bf16z
bf16-mixedmodulec                 S      |   S rs   )bfloat16rq   r   r   r   rx        z1Float16Module.__init__.<locals>.float16_converter)   16z16-mixedc                 S   r   rs   )halfr   r   r   r   rx     r   z
precision zO is not supported. Float16Module (megatron_amp_O2) supports only fp16 and bf16.)
r   r   r   r   	precision
add_moduler   r   r    rx   )r   r
   r   r   r   rx   r   r   r   r     s    



zFloat16Module.__init__c                 C   s   | j |S rs   )r   set_input_tensor)r   input_tensorr   r   r   r   #  s   zFloat16Module.set_input_tensorc                 O   s^   t | jddrt|| j}| j|i |}| jdd d u s"J dt r-| jr-t	|}|S )Nr   T$virtual_pipeline_model_parallel_sizezHVirtual pipeline model parallel size is no longer supported for nemo 1.0)
getattrr   rz   rx   r
   getr   r6   trainingr   )r   inputskwargsoutputsr   r   r   forward&  s   zFloat16Module.forwardNrQ   Fc                 C      | j |||S rs   )r   rR   rS   r   r   r   rR   2     zFloat16Module.state_dictc                 C   r   rs   )r   rW   rS   r   r   r   rW   5  r   z,Float16Module.state_dict_for_save_checkpointc                 C   sr   | j jr-t| j dr| j jjjjS t| j dr| j jjjS t| j dr)| j jjjS t	d| j
s4td| j jjS r   )r   r   r   r   r   r   r   r   r   r   r   r    r!   r   r   r   r"   8  s   
z$Float16Module.word_embeddings_weightc                 C   sb   | j jr-t| j dr| j jjjjS t| j dr| j jjjS t| j dr)| j jjjS t	dt	d)Nr   r   r   zPre_process is True, but no embedding is found on this rank. Looked for language_model.position_embeddings, encoder_embedding.position_embedding_weight, and decoder_embedding.position_embedding_weightr#   )
r   r   r   r   r   r$   r   r   r   r   r!   r   r   r   r%   L  s   z(Float16Module.position_embeddings_weightc                 C       t | jdr| jjjjS tdr'   )r   r   r(   r)   r   r   r!   r   r   r   r*   \  
   z9Float16Module.encoder_relative_position_embeddings_weightc                 C   r   r,   )r   r   r-   r)   r   r   r!   r   r   r   r.   d  r   z9Float16Module.decoder_relative_position_embeddings_weightc                 C   r   r/   )r   r   r0   r)   r   r   r!   r   r   r   r1   l  r   zIFloat16Module.decoder_cross_attention_relative_position_embeddings_weight)Tra   )rb   rc   rd   r   r   r   r   rR   rW   r"   r%   r*   r.   r1   rf   r   r   r   r   r     s    

r   )re   rC   torch.autogradr   torch.nn.parameterr   2nemo.collections.nlp.modules.common.megatron.utilsr   
nemo.utilsr   megatron.corer   r   r   r   r   ModuleNotFoundErrorFloatTensorcudart   
HalfTensorr|   BFloat16Tensorr{   nnModuler	   rh   rz   r   r   r   r   r   r   <module>   s,    4