o
    wi                     @   s  d dl mZ d dlmZ d dlmZmZmZmZ d dl	m
Z d dlZd dlZd dlm  mZ d dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlm Z  d dl!m"Z" d dlmZ d dl#m$Z$ d dl%m&Z& d dl'm(Z(m)Z)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2m3Z3 d dl4m5Z5 d dl6m7Z7 g dZ8d(ddZ9dee:ej;f fddZ<dej;fddZ=eG d d! d!e"e,j>Z?G d"d# d#eZ@G d$d% d%ejAe,j>e,jBe&jCZDed&d' ZEd%d!gZFdS ))    )contextmanager)	dataclass)CallableDictListOptionalN)InferenceParamsdist_checkpointingparallel_state)tensor_parallel)	ModelType)
LLaVAModel)OptimizerConfig)PackedSeqParams)TransformerConfig)nn)TokenizerSpec)fn)
AudioToken
ImageTokenMultiModalSampleConfig)io)ckpt_to_weights_subdir)$MaskedTokenLossReductionWithLossMask)MegatronOptimizerModuleOptimizerModule)logging)AppState)
num_layershidden_sizenum_attention_headsnum_query_groupsffn_hidden_sizekv_channelshidden_dropoutattention_dropoutfp32_residual_connection(apply_residual_connection_post_layernormlayernorm_epsilonlayernorm_zero_centered_gammaadd_bias_linearadd_qkv_biasgated_linear_unitactivation_funcactivation_func_fp8_input_storenum_moe_expertsrotary_interleavedwindow_sizenormalizationqk_layernorm	test_modecalculate_per_token_loss
seq_lengthFc                 C   sr   |dur7t | jddd}tjd|t|dddd|sdd	ini }d
d |d  D }| j||d dS dS )z
    Restores model weights from a checkpoint.

    Args:
        model: The model to restore weights for.
        checkpoint_path: Path to the checkpoint.
        strict: Whether to restore weights even if they are not the same.
    Nmodule.)prefix
state_dictF)	is_savingsharded_state_dictcheckpoint_dirvalidate_access_integritystrictlog_allc                 S   s   i | ]
\}}| d |qS )r8   )removeprefix).0kv rG   ]/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/avlm/model/base.py
<dictcomp>Y   s    z)restore_model_weights.<locals>.<dictcomp>r;   )rA   rG   )dictr>   r	   loadr   itemsload_state_dict)modelcheckpoint_pathrA   r>   loaded_state_dictrG   rG   rH   restore_model_weightsH   s   	
rQ   returnc                    s  ddl m} t| }t|trt|dkr|d }n|}t   d | r- d |	 r6 d |
dd} fd	d
| D }|durddD ]}t||d}|durct|||jdd qM||d< t dkrd}d|v r|d dur|d  }||d< |S )zAVLM Data Stepr   r
      )tokensattention_maskimagesnum_image_tilesimage_sizesaudiosaudio_lengths)position_ids)labels	loss_maskpacked_seq_paramsNc                    s2   i | ]\}}|| v r|d ur|j ddnd qS )NTnon_blocking)cuda)rD   keyvalrequired_keysrG   rH   rI      s     z"avlm_data_step.<locals>.<dictcomp>)cu_seqlens_qcu_seqlens_kvcu_seqlens_q_paddedcu_seqlens_kv_paddedTr_      r]   num_valid_tokens_in_ub)megatron.corer   next
isinstancetuplelensetupdateis_pipeline_first_stageis_pipeline_last_stagegetrL   getattrsetattrra   psget_context_parallel_world_sizesum)dataloader_iterr   batch_batchr^   attrvaluerk   rG   rd   rH   avlm_data_step]   s@   


	r   c                 C   sh   |d |d | dd| dd| dd|d | dd|d	 | d
d| ddd
}| di |S )zAVLM Forward SteprT   r[   r]   NrU   r\   rV   rW   rY   rZ   r^   )
	input_idsr[   r]   rU   r\   rV   rW   rY   rZ   r^   rG   )ru   )rN   r|   forward_argsrG   rG   rH   avlm_forward_step   s   





r   c                   @   sx  e Zd ZU dZdZee ed< dZee ed< dZ	ee ed< dZ
ee ed< dZee ed< dZeed	< d
Zeed< d
Zeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZee ed< dZee ed< dZee ed< dZee ed< dZee ed< dZeed< dZeed< dZeed< dZeed< dZ eed < e!Z"e#ed!< e$Z%e#ed"< d#d$ Z&d)d'd(Z'dS )*
AVLMConfigzAVLM Model Base ConfigNlanguage_transformer_configvision_transformer_configvision_projection_configaudio_transformer_configaudio_projection_configTdrop_vision_class_tokenvision_feature_layeraudio_feature_layerr   $encoder_pipeline_model_parallel_sizerj   "encoder_tensor_model_parallel_sizer      r!   i   r7   language_model_from_pretrainedvision_model_from_pretrained!vision_projection_from_pretrainedaudio_model_from_pretrained audio_projection_from_pretrainedFfreeze_language_modelfreeze_vision_modelfreeze_vision_projectionfreeze_audio_modelfreeze_audio_projectionforward_step_fndata_step_fnc                 C   s>   | j d urtD ]}t| |t| j | q| jdu sJ dd S )NFz$AVLM does not return normalized loss)r   MODEL_CONFIG_ATTRrw   rv   r6   )selfr~   rG   rG   rH   __post_init__   s   
zAVLMConfig.__post_init__rR   MCoreAVLMModelc              	   C   s   d| j _| j| j _| j| j _| jd ur| j| j_| j| j_| jd ur,| j| j_| j| j_| j| j _| j	| j _	| j
dks?J dt| |t t t t pUt | j
k| jd}|S )NFr   zPAVLM `encoder_pipeline_model_parallel_size` has bug for now. Fix will come soon.)config	tokenizerpre_processpost_processadd_encoderadd_decoderr   )r   #scatter_embedding_sequence_paralleltensor_model_parallel_sizesequence_parallelr   r   r   r   pipeline_model_parallel_sizecontext_parallel_sizer   r   rx   rs   rt    get_pipeline_model_parallel_rankr   )r   r   rN   rG   rG   rH   configure_model   s2   









zAVLMConfig.configure_model)rR   r   )(__name__
__module____qualname____doc__r   r   r   __annotations__r   r   r   r   r   boolr   intr   r   r   r   r!   r7   r   strr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rG   rG   rG   rH   r      s:   
 r   c                       sL  e Zd ZdZ						d1dededededed	ed
eddf fddZdd Zdd Z	dd Z
dddddejddejdddfdejdejdeej deej deej deej deee  dee deej deee  dee dee d ee d!ee dejfd"d#Zd$ed%ed&ed'ed(ef
d)d*Zd2d+d,Zd-d. Zd/d0 Z  ZS )3r   zAVLM Model Base Model ClassNTr   r   r   r   r   r   r   rR   c                    sn  t t| j|d |j}|j}	|j}
|j}|j}|| _|| _	|| _
|| _|| _d | _d | _d | _d | _d | _d | _|j| _|j| _|j| _|j| _d| _| jry|j|||d| _| jj| _| jj| _|jdk| _ t!| j|j" t#$d|j"  n|j"d urt%j&t'i d|j"dd | j
r|	d ur|	 | _|
 | _|| _(t!| j|j) t#$d|j)  |d urt* }|j+r|j+,d	rt-|d | | _t!| j|j+ t#$d
|j+  W d    n1 sw   Y  n| | _t!| j|j+ t#$d
|j+  | | _| j.|j/|j0|j1|j2|j3d t4j5| _6t7|	d| _8|	j9| _:|r3|	j;r5|  j:|	j<8  _:d S d S d S )Nr   F)r   r   r   rj   z%Restored language model weights from r:   r=   z#Restored vision model weights from z.nemoz"Restored audio model weights from )r   r   r   r   r   
image_size)=superMCoreLLaVAModel__init__r   r   r   r   r   r   r   r   r   r   encoder_hidden_statevision_modelvision_projectionaudio_modelaudio_projectionlanguage_modelr   sequence_parallel_lmtp_comm_overlaptp_comm_overlap_lmr   context_parallel_lmr   tensor_model_parallel_size_lm#share_embeddings_and_output_weightsr   max_sequence_length_language_max_sequence_lengthr   _language_is_pipeline_parallelrQ   r   r   infor	   rK   rJ   _drop_vision_class_tokenr   r   r   endswithtemporary_model_parallel_sizefreezer   r   r   r   r   r   encoder_or_decoder
model_typehasattrvision_model_from_hfnum_image_embeddings_per_tile_img_seq_lenadd_class_tokenclass_token_len)r   r   r   r   r   r   r   r   r   r   r   r   r   	app_state	__class__rG   rH   r      s   








zMCoreAVLMModel.__init__c	                 C   sh   | j sJ d| js|S |r|S ||k}	||k}
|}|dur&t||	f|}|dur2t||
f|}|S )zk
        Combine image_embeddings, audio_embeddings, and language_embeddings into a single tensor.
        z>input text preprocessing is only needed for the language modelN)r   r   torch	index_put)r   r   image_embeddingsaudio_embeddingslanguage_embeddingsimage_token_indexaudio_token_indexuse_inference_kv_cacher^   image_token_maskaudio_token_maskcombined_embeddingsrG   rG   rH   combine_embeddingsc  s   z!MCoreAVLMModel.combine_embeddingsc                 C   s  | j r| j}n?| jdks| jrA| jr| jr| j}n-| |\}}|dur*|jd }n|dur4|jd }nd}|d | | | }n|||fS | jdkrN| j}|duru|jd |k ru||jd  }tj|d|ft	j
d}tj|d|fdd}|dur|jd |k r||jd  }ddd|f}	tjjj| |	ddd}|duo|jdk}
|
r|jd |kr|jd |jd	  }||jd	  }||ksJ d
||jd< ||jd< t||j|_t||j|_|||fS )z
        Pad the sequence (labels, loss_mask, combined_embeddings, packed_seq_params) to the
        language model's max sequence length.
        combined_embeddings's shape is [batch_size, seq_len, attention head, embed_dim]

        rj   Nr   )r   constant)padmoder   thdr   zW`language_max_sequence_length` needs to increase for sequence packing to work properly.)r   r   r   r   r   _get_shard_factorshapeFr   r   ignore_place_holderr   r   
functional
contiguous
qkv_formatrf   rh   ri   maxmax_seqlen_qmax_seqlen_kv)r   r   r\   r]   r^   max_seq_lenshard_factor_pad_size	pad_shapepacked_sequencelast_seqlenlast_seqlen_paddedrG   rG   rH   pad_sequence  sT   	





zMCoreAVLMModel.pad_sequencec                 C   sN  |duo|j d | jk}|r$|ddd| jf }|ddd| jf }|dur;|j d | jkr;|ddd| jf }|duoC|jdk}|r|jd | jkr|jd | j }|jd |jd  }|jd |jd  }	||	 }
||
8 }| j|jd< | j|jd< |dkr|jd  |8  < |jd  |8  < |jd |jd ksJ d|||fS )z
        Truncate the sequence (labels, loss_mask, combined_embeddings) to the language model's max sequence length.
            combined_embeddings's shape is [batch_size, seq_len, attention head, embed_dim]
        Nrj   r   r   r   r   zLwith packed sequence, the truncation can only truncate on the last sequence.)r   r   r   rh   rf   ri   rg   )r   r   r\   r]   r^   truncate_labelsr   truncate_lenfinal_seq_len_paddedfinal_seq_len_unpaddedfinal_paddingrG   rG   rH   truncate_sequence  s0   
z MCoreAVLMModel.truncate_sequencer   r[   r]   rU   r\   rV   rW   r   rY   rZ   r   inference_paramsruntime_gather_outputr^   c              
   C   s  |duod|j v }|rd}d}n| jr| jdur|duo"|jd dk}|s(d}nm|t| j j}| jrL| j	 | _| j|dd}|d | j
j }n| j|| j
j d d}| jrot| jd	d}|dd|dddf }|ddd
 }| |}|dur||k |j d< |j d |j d< nd}| jdur|	duo|	jd dk}|sd}nQ| j|	|
ddd\}}|d
dd }| |}|dur||k |j d< d|j v r|j d  |j d 7  < n|j d |j d< nd}n| jr| j\}}nd\}}| js||fS d}| jrm| }d||dk < | jj||d}|dd }|dur:|jd }|ddd
d| }|dura|jd |jd }}t|d|j|dk }|| }|}|  ||||||||}| !||||\}}}| "||||\}}}|dur|dd }|dur| }| j#dks| j$r| j#dkr|dd }| %||||\}}}}| jdd||||||d}|du s|du r|S ||fS )a7  Forward function of the LLaVA model.

        Args:
            images (torch.Tensor): input image of shape [num_tiles, img_h, img_w]. num_tiles means the number of
            image tiles in this batch.
            input_ids (torch.Tensor): input text ids [batch, text_seq_len].
            position_ids (torch.Tensor): input text position ids [batch, text_seq_len].
            attention_mask (torch.Tensor): Attention mask for the language model [batch, 1, combined_seq_len,
            combined_seq_len].
            labels (torch.Tensor): Optional target text labels [batch, combined_seq_len].
            loss_mask (torch.Tensor): Text loss mask [batch, text_seq_len].
            inference_params (InferenceParams): Inference-time parameters including KV cache.
            num_image_tiles (list of int): Number of tiles per image. Default 1 tile per image.
            image_token_index (int): ID for input images. Default None means `image_token_index`
                arg in the constructor will be used.
            runtime_gather_output (bool): Gather output at runtime. Default None means
                `parallel_output` arg in the constructor will be used.
            packed_seq_params (PackedSeqParams): Dict with padded token information.
                Required for using SP/CP with padding mask type.

        Returns:
            output (torch.Tensor): Loss of shape [b, s] if labels are provided,
                otherwise logits of shape [b, s, vocab_size].
            loss_mask (torch.Tensor): Loss mask expanded to combined sequence length. Shape [b, s].
        Nmedia_tokens_countr   T)output_hidden_statesr   rj   )num_unused_layersr      image_tokens_count)input_signalinput_signal_lengthprocessed_signalprocessed_signal_lengthaudio_tokens_count)NN)r   r[   )r   r[   rU   decoder_inputr\   r  r  r^   )&key_value_memory_dictr   r   r   torm   
parametersdtyper   evalr   r   r   rv   permuter   r   rz   r   r   r   r   r   cloner   	embedding	transposereshaper   arange	unsqueezedevicer   r   r  r   r   !_process_embedding_token_parallel)r   r   r[   r]   rU   r\   rV   rW   r   rY   rZ   r   r  r  r^   r   r   r   
has_imagesr   
has_audiosaudio_embedding_lensr   input_ids_textr   	embed_dimaudio_embeddings_max_seq_lenr   nonpadded_masknonpadded_audio_embeddingsoutputrG   rG   rH   forward  s   .












zMCoreAVLMModel.forwardr   r   r   r   r   c           	      C   s   g }|r| j dur|| j  |r| jdur|| j |r)| jdur)|| j |r6| jdur6|| j |rC| jdurC|| j |D ]}| D ]}d|_qKqEdS )a  Freeze model modules.

        Make specific modules non-trainable by setting requires_grad to False.

        Args:
            freeze_language_model (bool): Freeze the language model module.
            freeze_vision_model (bool): Freeze the vision model module.
            freeze_vision_projection (bool): Freeze the vision projection module.
            freeze_audio_model (bool): Freeze the audio model module.
            freeze_audio_projection (bool): Freeze the audio projection module.
        NF)r   appendr   r   r   r   r  requires_grad)	r   r   r   r   r   r   modulesmoduleparamrG   rG   rH   r     s    zMCoreAVLMModel.freezec                 C   s   t |ts|g}t|dksJ d| jr"| jr"| j|d  dS | jr/| j|d  dS | jr9|d | _dS | j	|d  dS )zSet model chunk input tensor.rj   z.input_tensor should only be length 1 for llavar   N)
rn   listrp   r   r   r   set_input_tensorr   r   r   )r   input_tensorrG   rG   rH   r/    s   
zMCoreAVLMModel.set_input_tensorc                 C   s   |dur|j dkrd}nd}| jdkr"| jr"| j| j d }||fS | jdkr0| jd }||fS | jr:| j}||fS d}d}||fS )z&Get shard factor of sequence dimensionNr   rj   r   r	  )r   r   r   r   )r   r^   seq_dimr   rG   rG   rH   r     s   	

z MCoreAVLMModel._get_shard_factorc              
   C   s  | j s| js||||fS | j r`| jdkr"| jr"| j| j d }d}n| jdkr/| jd }d}n| jr7| j}d}|j| | dksNJ d| d|j d| | jr`| jr`|j| | jks`J d| jdkrt }| j rr|	d|i | jr}|	||d	 |d
u s|j
dkrddlm} ||}n?zdd
l}	W n ty }
 ztd |
d
}
~
ww t }t }| D ]\}}|	|j|d||}|d|||< q| j r|d }|dd }| jr|d }|d }| jr| j rt|}||||fS )z7Processes the input data for model parallelism support.rj   r	  r   z'Sequence length should be divisible by z2 for                 Sequence/Context parallelism z
 with dim zhTP Comm overlap either requires Vision+Text token length                 == language_max_sequence_lengthr   )
new_labelsnew_loss_maskNsbhd)get_batch_on_this_cp_rankztPlease update Transformer Engine to >= 1.10 to use                             Context Parallel with THD format datar2  r3  )r   r   r   r   r   r   r   r   rJ   rr   r   megatron.training.utilsr5  transformer_engine_torchModuleNotFoundErrorr   errorrx   ry   get_context_parallel_rankrL   thd_get_partitioned_indicesrh   sizeindex_selectr  r   r   #scatter_to_sequence_parallel_region)r   r   r2  r3  r^   r   r1  r|   r5  texecp_sizecp_rankrb   dataindexrG   rG   rH   r  &  s   



z0MCoreAVLMModel._process_embedding_token_parallel)NTTTTTrR   N)r   r   r   r   r   r   r   r   r   r   r  r   token_idr   r   Tensorr   r   r   r   r(  r   r/  r   r  __classcell__rG   rG   r   rH   r      s    	d$D,
	


 l

$r   c                       sz  e Zd ZdZ			d)dedee ded deeej	gej	f  f fdd	Z
d*ddZdddddejddejdddfdejdejdeej deej deej deej deee  dee deej deee  dee dee dee dee d
ejfddZd
eeejf fddZd
ejfdd Zd+d
ejfd!d"Zd+d
ejfd#d$Zed
efd%d&Zed
efd'd(Z   Z!S ),	AVLMModelz Lightning Wrapper for AVLM ModelNr   optimr   r   model_transformc                    sP   t    || _|| _|pttdddd| _| j|  || _d | _	d | _
d S )Ng-C6?T)lruse_distributed_optimizerr   )r   r   r   r   r   r   rJ  connectrK  _training_loss_reduction_validation_loss_reduction)r   r   rJ  r   rK  r   rG   rH   r   y  s   

zAVLMModel.__init__rR   c                 C   s"   t | ds| j| j| _d S d S )Nr,  )r   r   r   r   r,  r   rG   rG   rH   r     s   
zAVLMModel.configure_modelr   r[   r]   rU   r\   rV   rW   r   rY   rZ   r   r  r  r^   c                 C   s*   | j |||||||||	|
||||d}|S )N)r   r[   r]   rU   r\   rV   rW   r   rY   rZ   r   r  r  r^   )r,  )r   r   r[   r]   rU   r\   rV   rW   r   rY   rZ   r   r  r  r^   output_tensorrG   rG   rH   r(    s"   zAVLMModel.forwardc                 C   s   | j |S N)r   r   )r   r{   rG   rG   rH   	data_step  s   zAVLMModel.data_stepc                 C   s   | j | |S rS  )r   r   )r   r|   rG   rG   rH   forward_step  s   zAVLMModel.forward_stepc                 C   
   |  |S rS  rU  r   r|   	batch_idxrG   rG   rH   training_step  s   
zAVLMModel.training_stepc                 C   rV  rS  rW  rX  rG   rG   rH   validation_step  s   
zAVLMModel.validation_stepc                 C   s   | j st | _ | j S rS  )rO  r   rQ  rG   rG   rH   training_loss_reduction  s   z!AVLMModel.training_loss_reductionc                 C   s   | j s	tdd| _ | j S )NT)r[  )rP  r   rQ  rG   rG   rH   validation_loss_reduction  s   z#AVLMModel.validation_loss_reduction)NNNrE  rS  )"r   r   r   r   r   r   r   r   r   Moduler   r   r   rF  r   r   rG  r   r   r   r   r   r(  r   r   rT  rU  rZ  r[  propertyr   r\  r]  rH  rG   rG   r   rH   rI  v  s    
	
	

(rI  c                 c   s*    | j }|| _ z	dV  W || _ dS || _ w )zE
    Context manager to temporarily set the model parallel size.
    N)model_parallel_size)r   
temp_valueoriginal_valuerG   rG   rH   r     s   r   )F)G
contextlibr   dataclassesr   typingr   r   r   r   lightning.pytorchpytorchLr   torch.distributedtorch.nn.functionalr   r   r   rl   r   r	   r   rx   r   megatron.core.enumsr   +megatron.core.models.multimodal.llava_modelr   r   megatron.core.optimizerr   megatron.core.packed_seq_paramsr   ,megatron.core.transformer.transformer_configr   1nemo.collections.common.tokenizers.tokenizer_specr   nemo.collections.llmr   /nemo.collections.multimodal.data.energon.configr   r   r   nemo.lightningr   nemo.lightning.io.plr    nemo.lightning.megatron_parallelr   nemo.lightning.pytorch.optimr   r   
nemo.utilsr   nemo.utils.app_stater   r   rQ   r   rG  r   r   IOMixinr   r   LightningModuleConnectorMixinFNMixinrI  r   __all__rG   rG   rG   rH   <module>   sT   
?K    ~d
