o
    }oiJ                     @   s  d Z ddlZddlZddlmZmZmZmZmZm	Z	 ddl
Z
ddl
mZ ddlmZmZ z"ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ dZW n eefy[   dZY nw zddlmZmZ ddlm Z  dZ!W n eefy{   dZ!Y nw defddZ"G dd de#Z$				dVde
jde
jde%de
jde%de%de%fddZ&dd Z'dd  Z(d!d" Z)d#d$ Z*d%d& Z+d'd( Z,e
j-j.d)d* Z/d+d, Z0ze
j1Z2W n   e
j-j.Z2Y e2d-d. Z3e
j-j.d/d0 Z4d1d2 Z5	dWd3d4Z6d5d6 Z7d7d8 Z8d9d: Z9d;d< Z:d=d> Z;d?d@ Z<dAdB Z=dCdD Z>dEdF Z?dGe	e
j@jAee
j@jA f dHeeBe
j@jCf fdIdJZDdGe	e
j@jAee
j@jA f dHeeeBee
j@jC f  fdKdLZEdWdMee% fdNdOZF	dWdPe	eee
j f dQeGdMee% dHefdRdSZHdTdU ZIdS )XzUtilities for models.    N)DictIteratorListOptionalTupleUnion)Tensor)logginglogging_mode)MixedFusedRMSNorm)FusedLayerNorm)AttnMaskType)FastLayerNorm)listify_modelTF)parallel_statetensor_parallel)1linear_with_grad_accumulation_and_async_allreduceinputc                 C   s   | t d|   S )zr
    Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs
    gZd;?)torchsigmoid)r    r   f/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/nlp/modules/common/megatron/utils.pyApproxGELUActivation1   s   r   c                       s(   e Zd ZdZ fddZdd Z  ZS )ApexGuardDefaultszQ
    This class can be used to replace missing classes when apex is missing.
    c                    s   t    d S N)super__init__)self	__class__r   r   r   =   s   zApexGuardDefaults.__init__c                 C   s   d S r   r   )r   itemr   r   r   __getattr__@   s   zApexGuardDefaults.__getattr__)__name__
__module____qualname____doc__r   r!   __classcell__r   r   r   r   r   8   s    r   input_word_embeddings_weightparallel_outputbias%async_tensor_model_parallel_allreducesequence_parallelgradient_accumulation_fusionc                 C   sX   t  dk}|o|o| }|s|r| }	nt| }	t|	|||||d}
|r'|
S t|
S )a  Language Model logits using word embedding weights.

    Args:
        input_ (torch.Tensor): [b, s, h]
        word_embeddings_weight (torch.Tensor): [(padded) vocab size, h]
        parallel_output (bool): False will gather logits from tensor model parallel region
        bias (torch.Tensor, optional): bias tensor. Defaults to None.
        async_tensor_model_parallel_allreduce (bool, optional): Defaults to False.
        sequence_parallel (bool, optional): If True will use sequence parallelism. Defaults to False.
        gradient_accumulation_fusioa (bool, optional): If True fuse gradient accumulation to WGRAD GEMM

    Returns:
        torch.Tensor: [b, s, (padded) vocab size]
       )r   weightr*   r-   allreduce_dgradr,   )r   $get_tensor_model_parallel_world_sizer   $copy_to_tensor_model_parallel_regionr   (gather_from_tensor_model_parallel_region)r'   r(   r)   r*   r+   r,   r-   tensor_model_parallelr0   input_parallellogits_parallelr   r   r   parallel_lm_logitsD   s    


r7   c                        fdd}|S )z!Init method based on N(0, sigma).c                       t jjj| d dS N        )meanstdr   nninitnormal_tensorsigmar   r   init_|      z!init_method_normal.<locals>.init_r   )rE   rF   r   rD   r   init_method_normaly   s   rH   c                    r8   )Nc                    s   t jjj|  dS )N)a)r   r?   r@   kaiming_uniform_rB   valr   r   rF      s   z*init_method_kaiming_uniform.<locals>.init_r   rL   rF   r   rK   r   init_method_kaiming_uniform      rN   c                    r8   )Nc                    s   t jj|  S r   )r   r?   r@   	constant_rB   rK   r   r   rF      s   z init_method_const.<locals>.init_r   rM   r   rK   r   init_method_const   rO   rQ   c                    s"   | t d|    fdd}|S )z3Init method based on N(0, sigma/sqrt(2*num_layers).g       @c                    r9   r:   r>   rB   r=   r   r   rF      rG   z(scaled_init_method_normal.<locals>.init_)mathsqrt)rE   
num_layersrF   r   rR   r   scaled_init_method_normal   s   rV   c                 C   s   |  |d | S )Ng     )masked_fill_)attention_scoresattention_maskr   r   r   attention_mask_func   s   rZ   c                 C   sP   t j| |}||j t   |j  W d   |S 1 s!w   Y  |S )z/Simple linear layer with weight initialization.N)r   r?   Linearr/   no_gradr*   zero_)rowscolumnsinit_methodlayerr   r   r   get_linear_layer   s   


rb   c                 C   s*   d|  dt d|  dd|  |      S )zOpenAI's gelu implementation.      ?g      ?gQ63E?gHm?)r   tanhxr   r   r   	gelu_impl   s   *rg   c                 C   s   t | S r   )rg   re   r   r   r   openai_gelu   s   rh   c                 C   s   t t jj| dS )N   )r   powr?   
functionalrelure   r   r   r   squared_relu   s   rm   c                 C   s4   | d t | d j| jdt | j| jd  S )Nrc   g:?dtype)r   erftoro   	ones_likere   r   r   r   erf_gelu   s   4rs   c                 C   sB   t dd | D }t jj|t d |t jjt d }|S )z*Reduce a tensor of losses across all GPUs.c                 S   s   g | ]}|   d qS )r.   )clonedetachview).0lossr   r   r   
<listcomp>       z=average_losses_across_data_parallel_group.<locals>.<listcomp>)group)r   catdistributed
all_reducer   get_data_parallel_groupget_world_size)lossesaveraged_lossesr   r   r   )average_losses_across_data_parallel_group   s   r   c              
   C   sf  |   \}}|r|}nd}d}	|r%ttj|||f| jd|d||}	tj|   tj| jd}
|r9d|
| |k< tj|tj| jd}|	d
|d}|rR| }|sV|rt|D ]M}||| | |kf }|rl| }d}t|  d D ]0}|| }|rd|	|d|d dd|d f< |r|||d df  |d | 8  < |d }qvqZ|r|	dk }	|	|
|fS )z4Build masks and position id for left to right model.r.   Ndevicero   r   r;   r   rc   )sizer   trilonesr   rv   floatarangelong	unsqueezerepeatrt   range)data	eod_tokenreset_position_idsreset_attention_maskeod_mask_losscompute_attention_maskmicro_batch_size
seq_lengthatt_mask_batchrY   	loss_maskposition_idsb	eod_index
prev_indexjir   r   r   get_ltor_masks_and_position_ids   sD    $

r   c                 C   s   | d u rd S |  d}|S )Nr.   )r   )	attn_maskextended_attention_maskr   r   r   attn_mask_postprocess  s   
r   c                 C   s   dd | D S )Nc                 S      g | ]}t |qS r   )r   )rw   r   r   r   r   ry         z3enc_dec_extended_attention_mask.<locals>.<listcomp>r   )attention_mask_listr   r   r   enc_dec_extended_attention_mask  s   r   c                 C   s6   |  d}tj|tj| jd}|d|  }|S )Nr.   r   r   )r   r   r   r   r   r   	expand_asrt   )	token_idsr   r   r   r   r   build_position_ids  s   
r   c                 C   s0   |dddddf | dddddf  }|S )|
    Returns a 3-dimensional (3-D) attention mask
    :param source_block: 2-D array
    :param target_block: 2-D array
    Nr   source_masktarget_maskmaskr   r   r   make_attention_mask_3d"  s   ,r   c                 C   s   t | |k||kS )r   r   )source_blocktarget_blockpad_idr   r   r    make_inference_attention_mask_3d,  s   r   c                 C   sH   | j \}}tj|| jd}|d |d d d f kd }||||}|S )Nr   r   )shaper   r   r   expand)blockbatchlengthr   history_maskr   r   r   make_inference_history_mask_3d6  s
   
r   c                 C   s   t | |}|dk S z
    Returns a 3D joint attention mask for Megatron given two 2D masks
    :param source_mask - True for non-masked, else masked [batch, src length]
    :param target_mask - True for non-masked, else masked [batch, tgt length]
    rc   r   r   r   r   r   build_attention_mask_3d_padding>  s   
r   c                 C   s"   t |}t| |}|| }|dk S r   )r   r   )r   r   causal_maskr   r   r   r   build_attention_mask_3d_causalI  s   
r   c                 C   s>   |t jkrt| |}|S |t jkrt| |}|S td| )a  
    Returns a 3D attention mask for Megatron given two 2D masks
    :param source_mask - < 0.5 for non-masked, else masked [batch, src length]
    :param target_mask - < 0.5 for non-masked, else masked [batch, tgt length]
    :param attn_mask_type - AttnMaskType enum
    z,Unsupported attention mask attn_mask_type = )r   paddingr   causalr   
ValueError)r   r   attn_mask_typer   r   r   r   build_attention_mask_3dV  s   



r   modelreturnc              	   C   s   t | }g dd}g dd}g ddd}dd }|D ]U}| D ]N}t|tttfr<|d ttd	d |j	
  q |j	 D ],\}}	|	d
u rJqA|drX|d |	g qA||	re|d |	g qA|d |	g qAq q|||g}
ttdd |
S )zDivide params into with-weight-decay and without-weight-decay groups.

    Layernorms and biases will have no weight decay but the rest will.
    Fparams	is_expertTr;   )r   weight_decayr   c                 S      t | dd S N	allreduceTgetattrparamr   r   r   <lambda>s      z:get_params_for_weight_decay_optimization.<locals>.<lambda>r   c                 S   s   | d uS r   r   )pr   r   r   r   y  s    Nr*   c                 S      t | d dkS Nr   r   lengr   r   r   r         )r   modules
isinstancer   r   r   extendlistfilter_parametersvaluesitemsendswithtuple)r   r   weight_decay_paramsweight_decay_expert_paramsno_weight_decay_paramsr   modulemodule_namer   param_groupsr   r   r   (get_params_for_weight_decay_optimizationg  s.   



r   c              	      s   t | }g dd}g dd}dd  |D ]$}|d  tt fdd| 7  < |d  tt | 7  < q||g}ttdd |S )	z Use all params for weight decay.Fr   Tc                 S   r   r   r   r   r   r   r   r     r   z>get_all_params_for_weight_decay_optimization.<locals>.<lambda>r   c                    s
    |  S r   r   re   r   r   r   r     s   
 c                 S   r   r   r   r   r   r   r   r     r   )r   r   r   
parametersr   )r   r   r   r   r   r   r   r   r   ,get_all_params_for_weight_decay_optimization  s   

& r   enforce_divisible_batchc                    sH   t |  |rt   dksJ d fddtdt  D S )z.
    Split a list into equal sized chunks
    r   $Issue with batch size configuration!c                    s   g | ]
}||   qS r   r   rw   r   
chunk_sizeinputsr   r   ry         zsplit_list.<locals>.<listcomp>)r   r   )r   
num_chunksr   r   r   r   
split_list  s    r   r   num_microbatchesc           
         s  t | trdd |  D }t|dkrtjd| tjd dd |  D } dd |  D }d	d |  D }t| rad d
 j	d  dkrat
dd d
 j	d  d dfddD d d
 j	d  dkrd d j	d fddD t|dkrfddtD }n.t| }fdd|D }dd D dd |D   |  fddtD }dd |D }n| d j	d  dksJ dg | D ]l}t|rtj|dd qt |tr>t |d tjr5fdd|D g }	tD ]|	fddttD  qt|	}	|	 qt| q|du rI| qt
dt| fddtD }t|S )a  
    Split a batch into k microbatches, where the batch size is divisible by k. Batch could be
    a dictionary of tensors or a list of tensors. A dictionary batch could also have items of List type,
    as long as the length of that list is the same as the batch size.
    c                 S   s$   g | ]\}}t |tjtfs|qS r   r   r   r   r   rw   kvr   r   r   ry        $ z(get_iterator_k_split.<locals>.<listcomp>r   zjOnly support splitting torch.Tensor and List[torch.Tensor]. Discarding the following keys from the batch: )modec                 S   s&   i | ]\}}t |tjtfr||qS r   r   r   r   r   r   
<dictcomp>  s   & z(get_iterator_k_split.<locals>.<dictcomp>c                 S   s"   i | ]\}}t |tjr||qS r   )r   r   r   r   r   r   r   r    s   " c                 S   s    i | ]\}}t |tr||qS r   )r   r   r   r   r   r   r         r.   z0Issue with batch size configuration: batch size z is not divisible by !c                    s    g | ]}t j|d   ddqS )r.   r   dimr   tensor_splitrw   r    r   r   r   ry     r  c                    s   g | ]} fd d|D qS )c                    s   g | ]}|d   qS r   r   )rw   r   r   r   r   ry         3get_iterator_k_split.<locals>.<listcomp>.<listcomp>r   r   r  r   r   ry     rz   c                    s*   g | ]  fd dt tD qS )c                    s$   g | ]} | d  |  fqS r   r   r   )r   r   split_batchr   r   ry     r   r  r   r   rw   )r   r  r   r   ry         c                    s   g | ]}t |d   dqS )r.   )r   )r   r	  )r   r   r   r   ry     s    c                 S   s   g | ]}|d  qS r  r   r	  r   r   r   ry     r   c                    s*   g | ]  fd dt tD qS )c                    s    g | ]} | |  fqS r   r   r   )all_keysall_split_batchr   r   r   ry     r  r  r  r  )r  r  r  r   ry     r  c                 S   r   r   )dictrw   elemr   r   r   ry     r   r   r  c                    s   g | ]
}t j| d dqS )r   r  r  r  r
  r   r   ry     r   c                    s   g | ]}|   qS r   r   r   )mbisplit_tensorsr   r   ry     r  NzUnsupported item type: c                    s   g | ]  fd dD qS )c                    s    g | ]}|d ur|  n|qS r   r   r  r   r   r   ry     r  r  r   r  )r  r  r   ry     s    )r   r  r   r   r	   warningr
   ONCEr   r   r   r   r   	is_tensorappendr  r   r   r   type	itertoolschain)
r   r   r   discard_itemstensor_items
list_itemsmicrobatchessplit_list_batchr    split_tupler   )	r  r  r   r   r   r  r   r  r  r   get_iterator_k_split  sn   

&


r*  c                 C   sV   t  r)t| t jr)| jjdkrt  }n| jjdkr t  }nt | j	|dS | S )Ncudacpurn   )
r   is_autocast_enabledr   r   r   r!  get_autocast_gpu_dtypeget_autocast_cpu_dtypeNotImplementedErrorrq   )rC   ro   r   r   r   _cast_if_autocast_enabled  s   

r1  )NFFF)T)Jr%   r"  rS   typingr   r   r   r   r   r   r   r   
nemo.utilsr	   r
   apex.normalizationr   #apex.normalization.fused_layer_normr   apex.transformer.enumsr   "apex.transformer.layers.layer_normr   3apex.transformer.pipeline_parallel.schedules.commonr   	HAVE_APEXImportErrorModuleNotFoundErrormegatron.corer   r   $megatron.core.tensor_parallel.layersr   HAVE_MEGATRON_COREr   objectr   boolr7   rH   rN   rQ   rV   rZ   rb   jitscriptrg   rh   compile	jit_fuserrm   rs   r   r   r   r   r   r   r   r   r   r   r   r?   Modulestr	Parameterr   r   r   intr*  r1  r   r   r   r   <module>   s    
5	
	





=		


$

S