o
    wi|t                     @   s  d dl mZ d dlmZ d dlmZmZmZmZ d dl	Z	d dl
mZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlm Z m!Z! d dl"m#Z# d dl$m%Z%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z,m-Z- d dl.m/Z/ d dl0m1Z1 d dl	m2Z2m3Z3 zd dl4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z: dZ;e9Z<W n e=y   d dl>m?Z? dZ5dZ7dZ8dZ:dZ;e?Z<Y nw eG dd dZ@G dd deZAG dd  d e(ZBG d!d" d"e,ZCG d#d$ d$e#ZDG d%d& d&eZEdS )'    )nullcontext)	dataclass)ListLiteralOptionalUnionN)InferenceParamsparallel_statetensor_parallel)ShardedStateDict)replace_prefix_for_sharding)get_bias_dropout_add)GPTModel)PackedSeqParams)	Attention)AttnMaskType)
IdentityOp)MLPMLPSubmodules)MegatronModule)
ModuleSpecbuild_module)TransformerBlock)TransformerConfig)TransformerLayerTransformerLayerSubmodules)sharded_state_dict_default)make_viewless_tensor)Tensornn)TEColumnParallelLinearTEDelayedScalingTEDotProductAttentionTELayerNormColumnParallelLinearTENormTERowParallelLinearT)WrappedTorchLayerNormFc                   @   s   e Zd ZU dZdZeeef ed< dZ	eeef ed< dZ
eeef ed< dZeeef ed< dZeeef ed< dZeeef ed< dS )	MLlamaCrossAttentionSubmodulesz_
    Defines the submodules required for cross-attention layers in the Llama architecture.
    Nlinear_q	linear_kvcore_attentionlinear_projq_layernormk_layernorm)__name__
__module____qualname____doc__r(   r   r   type__annotations__r)   r*   r+   r,   r-    r4   r4   g/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/vlm/mllama/model/language.pyr'   ?   s   
 r'   c                       s   e Zd ZdZ										d)ded	ed
edededededededed de	dede
e	 de
e f fddZdd Z								d*dededededed ed!e
e d"ed#ed$ed%ed&efd'd(Z  ZS )+CrossAttentionTextModelzV
    GPT-based model with integrated cross-attention layers for multimodal tasks.
    TFlearned_absolute      ?'  Nconfigtransformer_layer_spec
vocab_sizemax_sequence_lengthpre_processpost_processfp16_lm_cross_entropyparallel_output#share_embeddings_and_output_weightsposition_embedding_type)r7   ropenonerotary_percentrotary_baseseq_len_interpolation_factorvp_stagec                    s   t  j|||||||||	|
||||d t| j|| j| jd| _| jrAtjd| jj	| jj
d| jd| _| jjj| _| jd | _d S d S )N)rI   )r:   specr>   r?      F)num_embeddingsembedding_diminit_methodreduce_scatter_embeddingsr:      )super__init__CrossAttentionTransformerBlockr:   r>   r?   decoderr
   VocabParallelEmbeddinghidden_sizerN   learnable_embedding	embeddingword_embeddingsrL   num_frozen_embeddings_thresh)selfr:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rF   rG   rH   rI   	__class__r4   r5   rR   R   sB   z CrossAttentionTextModel.__init__c                 C   s   t j||jd}t j||jd}t |t j| j|jd}t |t j| jd |jd| j }t 	|| jk||
d}t 	|| jk ||
d}| |ddd}| ||}||| |||  S )z1Get word embedding w/ few extra learnable tokens.)devicerP   Nr   )torch
zeros_liker_   	ones_likeminimumtensorr[   maximumrZ   where	unsqueezerX   	transposerW   type_as)r\   xxzozx_origx_new	mask_origmask_newr4   r4   r5   !get_partially_trainable_embedding   s   $z9CrossAttentionTextModel.get_partially_trainable_embedding	input_idsposition_idsattention_maskdecoder_inputcross_attention_masksfull_text_row_masked_out_maskxattn_cacheslabelsinference_paramspacked_seq_paramsextra_block_kwargsreturnc                 C   s   |durn	| j rtdd}d}| jdkr'| jj|	| j|| jdd}| |}|dur/|j}ntj	}|
|t|j }| jd	|||	||
d|||d	|pNi }| jsV|S d}| jr_|  }| j||d\}}|du rt|dd S | ||}|S )
Forward.Nz?Require: decoder_input is not None or self.pre_process is FalserD   )r|   )	hidden_statesru   r{   rotary_pos_embr|   rw   rx   ry   cross_attention_bias)weightr   rP   r4   )r>   
ValueErrorrC   r   get_rotary_seq_lenrT   r:   dtypera   bfloat16tofinfominr?   rB   !shared_embedding_or_output_weightoutput_layerri   
contiguouscompute_language_model_loss)r\   rs   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   r   rotary_seq_lenr   r   r   output_weightlogits_lossr4   r4   r5   forward   sT   


zCrossAttentionTextModel.forward)
TTFTFr7   r8   r9   NNNNNNNNNN)r.   r/   r0   r1   r   r   intboolr   floatr   rR   rr   r   r   r   r   dictr   __classcell__r4   r4   r]   r5   r6   M   s    
	
6	
r6   c                       s   e Zd ZdZ fddZdd Z								ddeded	ee d
ededededede	de
fddZ	ddedededefddZ  ZS )rS   zX
    Transformer block with integrated cross-attention layers for multimodal tasks.
    c                    s   t  j|i |  fdd jjD  _g  _t jD ]E}| jv rXttt	tt
dtjitttttttddtttttttddtdd} jt| j|d	 d
 q jt jd qtj j _t jt jksyJ dd S )Nc                    s:   g | ]}d |      kr jk rn n|    qS )r   )_get_layer_offsetnum_layers_per_pipeline_rank).0rk   r\   r4   r5   
<listcomp>   s
    "
z;CrossAttentionTransformerBlock.__init__.<locals>.<listcomp>attn_mask_type)r(   r)   r*   r+   r,   r-   )moduleparams
submodules)
linear_fc1
linear_fc2)r   r   )cross_attentioncross_attn_bdapre_mlp_layernormmlpmlp_bdarP   )r:   layer_number)r:   z3Check PP implementation for cross attention layers!)rQ   rR   r:   fusion_schedulexattn_layersranger   r   CrossAttentionTransformerLayerr   MLlamaCrossAttentionr   no_maskr'   r#   r    r"   r%   r$   r   r   r   r   appendr   #DummyCrossAttentionTransformerLayerra   r   
ModuleListlenlayers)r\   argskwargsi
layer_specr]   r   r5   rR      sL   

 z'CrossAttentionTransformerBlock.__init__c                 C   s$   t | jdd}t | }|| j S )zAGet correct layer offset when encoder pipeline parallel size > 0.$encoder_pipeline_model_parallel_sizer   )getattrr:   r	    get_pipeline_model_parallel_rankr   )r\   r   $decoder_pipeline_model_parallel_rankr4   r4   r5   r     s   

z0CrossAttentionTransformerBlock._get_layer_offsetNr   ru   ry   rw   rx   r   attention_biasr   r{   r|   c                 C   s  | j s| j}t|ddd}| jjrt  }nt }| jj	reddl
}| jj	dkr0|jjjj}n| jj	dkr=|jjjj}ntdt| j|dd| jj fd	}d}t r[tjdd
}|jjd||d}nt }|ok|w | jjdkrx| jrxttt| j| jD ]Z\}\}}| j7 ||||| |||d|
d\}}||||||	|
d\}}|du s| jj r| jj!dks| jrJ W d   n1 sw   Y  t"# r| jj$r| j%dur| %|}qW d   n1 sw   Y  | j&dur| &|}t|ddd}|S )r   Tinprequires_grad
keep_graphr   Ne4m3hybridz3E4M3 and HYBRID are the only supported FP8 formats.F)r:   
fp8_formatoverride_linear_precision)with_context_parallel)enabled
fp8_recipe	fp8_groupfull)r   rw   xattn_cacherx   r   r   r{   r|   )r   ru   r   r   r{   r|   full_iteration)'r>   input_tensorr   r:   sequence_parallelr
   get_cuda_rng_trackerforkr   fp8transformer_enginecommonrecipeFormatE4M3HYBRIDr   r!   	fp8_wgradr	   model_parallel_is_initializedget_amax_reduction_grouppytorchfp8_autocastrecompute_granularitytrainingNotImplementedError	enumeratezipr   r   offload_contextenable_cuda_graphcuda_graph_scopera   is_grad_enabledcpu_offloading#group_prefetch_offload_commit_asyncfinal_layernorm)r\   r   ru   ry   rw   rx   r   r   r   r{   r|   rng_contextr   r   r   r   fp8_contextl_nolayerxattn_layercontextr4   r4   r5   r     s   







+
z&CrossAttentionTransformerBlock.forward r4   prefixsharded_offsetsmetadatar~   c                 C   sF  i }| d}| j j}| jD ]0}||j }|jd }	| |	|  d}
|}d|	|fg}||
||}t||
| || q| d}| jD ]7}t	|t
rOqG||j }|jd }	| |	|  d}
| |	 d}g }||
||}t||
| || qG|  D ]\}}|| jur|| jur|t|| | d|| q|S )z5Update shareded state dict for cross-attention layerszlayers.rP   .r   zxattn_layers.)r:   
num_layersr   r   r   sharded_state_dictr   updater   
isinstancer   named_childrenr   )r\   r   r   r   r   layer_prefixr   r   offsetglobal_layer_offsetstate_dict_prefixsharded_prefixsharded_pp_offsetlayer_sharded_state_dictxlayer_prefixxlayerxlayer_sharded_state_dictnamer   r4   r4   r5   r     s>   






z1CrossAttentionTransformerBlock.sharded_state_dictr   )r   r4   N)r.   r/   r0   r1   rR   r   r   r   r   r   r   r   strtupler   r   r   r   r4   r4   r]   r5   rS      sV    .	

irS   c                	       s`   e Zd ZdZ		ddedededef fdd	Zd
e	de	fddZ
						dddZ  ZS )r   zA
    Transformer layer with cross-attention for integration.
    rP   Nr:   r   r   hidden_dropoutc                    sL   t  j||||d ttjd| jjd| _ttjd| jjd| _	d S )N)r:   r   r   r  rP   )r   )
rQ   rR   r   	Parameterra   zerosr:   params_dtype	gate_attngate_ffn)r\   r:   r   r   r  r]   r4   r5   rR     s   z'CrossAttentionTransformerLayer.__init__xattn_tokensr~   c                 C   s   | j |S )z!Compute cross-attention kv cahce.)r   _compute_xattn_kv_cacher\   r  r4   r4   r5   compute_xattn_kv_cache  s   z5CrossAttentionTransformerLayer.compute_xattn_kv_cachec	              	      sF  |}	|  |}
| j|
||||||d}| j  t|ts!J dt fdd|D }|   | | j| j	j
||	| j}W d   n1 sIw   Y  |}	| |}| |}| j | t|tsjJ dtfdd|D }|   | | j| j	j
||	| j}W d   n1 sw   Y  t||jdd	}|dfS )
r   )rw   r   rx   r   r   r{   z:`attention_output_with_bias` needs to be tuple for gating.c                 3   $    | ]}|d ur | nd V  qd S Nr4   r   output)
_gate_attnr4   r5   	<genexpr>      
z9CrossAttentionTransformerLayer.forward.<locals>.<genexpr>Nz4`mlp_output_with_bias` needs to be tuple for gating.c                 3   r  r  r4   r  )	_gate_ffnr4   r5   r    r  Tr   )pre_cross_attn_layernormr   r  tanhr   r  bias_dropout_add_exec_handlerr   r   r:   bias_dropout_fusionr  r   r   r  r   r   r   )r\   r   rw   r   rx   r   r   r{   r|   residualpre_cross_attn_layernorm_outputattention_output_with_biaspre_mlp_layernorm_outputmlp_output_with_biasr  r4   )r  r  r5   r     sP   






z&CrossAttentionTransformerLayer.forward)rP   N)NNNNNN)r.   r/   r0   r1   r   r   r   r   rR   r   r  r   r   r4   r4   r]   r5   r     s*    r   c                   @   s4   e Zd ZdZdefddZdedee fddZd	S )
r   zRDummy cross-attention transformer block with tanh-gated attention and feedforward.r   c                 O   s   |d fS r  r4   )r\   r   r   r   r4   r4   r5   __call__  s   z,DummyCrossAttentionTransformerLayer.__call__r  r~   c                 C   s   d S r  r4   r  r4   r4   r5   r     s   z:DummyCrossAttentionTransformerLayer.compute_xattn_kv_cacheN)r.   r/   r0   r1   r   r#  r   r  r4   r4   r4   r5   r     s    
r   c                       sx   e Zd ZdZejfdededef fddZ	dd Z
d	d
 Zdd Z								dddZdedefddZ  ZS )r   z
    Cross-attention layer for Llama multimodal tasks.

    Cross-attention layer takes input with size [s, b, h] and context with size
    [s, b, h] and returns output of the same size.
    r:   r   r   c                    s   t  jd||||dd| | j| j dksJ t|j| jj| j| j| jjd| jj	ddd	| _t|j
| jjd| j | j| jjd| jj	ddd	| _
t|j| j| j| jjd| _t|j| j| j| jjd| _d S )	Ncross)r:   r   r   r   attention_typer   F)r:   rN   gather_outputbiasskip_bias_add	is_expert   )rV   r:   epsr4   )rQ   rR   query_projection_sizekv_projection_sizer   r(   r:   rV   rN   add_bias_linearr)   r,   hidden_size_per_attention_headlayernorm_epsilonr-   )r\   r:   r   r   r   r   r]   r4   r5   rR   -  sX   
zMLlamaCrossAttention.__init__c                 C   s^   |  |\}}| dd | jd| j f }|j| }t|d\}}| | }||fS )zGet key value tensors.Nr`   r*  )	r)   sizenum_query_groups_per_partitionr/  viewr
   split_tensor_along_last_dimr-   r   )r\   key_value_statesmixed_kvr   new_tensor_shapekeyvaluer4   r4   r5   get_key_value_tensorsg  s   
z*MLlamaCrossAttention.get_key_value_tensorsc                 C   sB   |  |\}}| dd | j| jf }|j| }| |}|S )z "Get query tensor.Nr`   )r(   r1  !num_attention_heads_per_partitionr/  r3  r,   )r\   r   queryr   r7  r4   r4   r5   get_query_tensorx  s   

z%MLlamaCrossAttention.get_query_tensorc                 C   s"   |  |}| |\}}|||fS )zGet query key value tensors.)r=  r:  )r\   r   r5  r<  r8  r9  r4   r4   r5   get_query_key_value_tensors  s   

z0MLlamaCrossAttention.get_query_key_value_tensorsNc              	   C   s  | j jrd}n
|du r|du sJ |durt|ts|fd }| |}|\}}| |||||||^}}}}}}|
durL|d}|d}|d}| jr_| jr_| j	||||||	|
d}n| j
||||||	|
d}|
dury||ddd}|| }| |\}}||fS )r   Nr*  rP   )r   r   r|   r   r`   )r:   flash_decoder   r  r=  _adjust_key_value_for_inferencesqueezecheckpoint_core_attentionr   _checkpointed_attention_forwardr*   reshaper1  r+   )r\   r   rw   r   rx   r{   r   rotary_pos_cosrotary_pos_sinr   r|   r<  r8  r9  r   r   core_attn_outr  r'  r4   r4   r5   r     sL   






zMLlamaCrossAttention.forwardr  r~   c                 C   s   |  |\}}t||gS r  )r:  ra   stack)r\   r  r8  r9  r4   r4   r5   r    s   z,MLlamaCrossAttention._compute_xattn_kv_cacher   )r.   r/   r0   r1   r   paddingr   r'   r   rR   r:  r=  r>  r   r   r  r   r4   r4   r]   r5   r   %  s.    :

Wr   )F
contextlibr   dataclassesr   typingr   r   r   r   ra   megatron.corer   r	   r
   (megatron.core.dist_checkpointing.mappingr   &megatron.core.dist_checkpointing.utilsr   (megatron.core.fusions.fused_bias_dropoutr   "megatron.core.models.gpt.gpt_modelr   MCoreGPTModelmegatron.core.packed_seq_paramsr   #megatron.core.transformer.attentionr   megatron.core.transformer.enumsr   %megatron.core.transformer.identity_opr   megatron.core.transformer.mlpr   r    megatron.core.transformer.moduler   $megatron.core.transformer.spec_utilsr   r   +megatron.core.transformer.transformer_blockr   ,megatron.core.transformer.transformer_configr   +megatron.core.transformer.transformer_layerr   r   megatron.core.transformer.utilsr   megatron.core.utilsr   r   r   :megatron.core.transformer.custom_layers.transformer_enginer    r!   r"   r#   r$   r%   HAVE_TELayerNormImplImportError*megatron.core.transformer.torch_layer_normr&   r'   r6   rS   r   r   r   r4   r4   r4   r5   <module>   sX    	  Mi