o
    }oi                     @   s"  d dl mZ d dlmZmZmZmZ d dlmZ	 d dl
Z
d dlZ
d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlm Z  d dl
m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z/m0Z0 d dl1m2Z2m3Z3m4Z4 d dl5m6Z6 d dl7m8Z8 d dl9m:Z: d dl;m<Z< d dl=m>Z> d dl?m@Z@mAZA d dlBmCZC deeDe
jEf fdd ZFde
jEfd!d"ZGd#d$ ZHeG d%d& d&ee<jIZJeG d'd( d(ee<jIZKG d)d* d*eZLG d+d, d,e	jMe<jIe<jNe%jOZPg d-ZQdS ).    )	dataclass)CallableDictOptionalTupleN)dist_checkpointingparallel_state)	ModelType)InferenceParams)
LLaVAModel)OptimizerConfig)#scatter_to_sequence_parallel_region)
ModuleSpec)TransformerConfig)get_batch_on_this_cp_rank)nn)TokenizerSpec)fn)
quick_gelu)get_packed_seq_params)Qwen2Config)get_layer_spec_te)MODEL_CONFIG_ATTRrestore_model_weights)IGNORE_INDEXIMAGE_TOKEN_INDEXVIDEO_TOKEN_INDEX)Qwen2VisionModel)MultimodalProjectorConfig)get_image_sequence_length)io)$MaskedTokenLossReductionWithLossMask)MegatronOptimizerModuleOptimizerModule)loggingreturnc                    s   ddl m} t| }t|trt|dkr|d }n|}t   d | r- d |	 r6 d  fdd|
 D }t|}|S )	zQwen2VL Data Stepr   r      )	input_idspixel_valuesimage_grid_thwpixel_values_videosvideo_grid_thw)position_ids)labels	loss_maskc                    s2   i | ]\}}|| v r|d ur|j ddnd qS )NT)non_blocking)cuda).0keyvalrequired_keys [/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/vlm/qwen2vl/model/base.py
<dictcomp>K   s     z%qwen2vl_data_step.<locals>.<dictcomp>)megatron.corer	   next
isinstancetuplelensetupdateis_pipeline_first_stageis_pipeline_last_stageitemsr   )dataloader_iterr	   batch_batchoutputr7   r5   r8   qwen2vl_data_step2   s$   



rH   c              
   C   sj   |d | dd | dd | dd | dd | dd | dd d}d	|v r.t||d
< | di |S )Nr(   r)   r*   r+   r,   r/   r.   )r(   r)   r*   r+   r,   r/   r.   
cu_seqlenspacked_seq_paramsr7   )getr   )modelrE   forward_argsr7   r7   r8   qwen2vl_forward_stepU   s   






rN   c                 C   s   d S Nr7   )selftensorr7   r7   r8   set_input_tensorg   s   rR   c                   @   s  e Zd ZU dZdZeed< dZeed< dZ	eed< dZ
eed	< dZeed
< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< eZeed< d Zeed!< dZeed"< dZeed#< dZ eed$< dZ!eed%< dZ"eed&< dZ#eed'< d(Z$e%ed)< dZ&eed*< d+Z'eed,< d-Z(e)ed.< d3d1d2Z*d-S )4Qwen2VLVisionConfigzQwen2VL Vision Model ConfigFadd_class_token   class_token_len   	patch_dimiP  img_himg_w    
num_layers   num_attention_headsTadd_bias_linearadd_qkv_biasi   	embed_dimhidden_size   spatial_merge_sizespatial_patch_sizetemporal_patch_sizeg        hidden_dropoutattention_dropouti   ffn_hidden_sizegated_linear_unitactivation_funcP   kv_channelsnum_query_groupslayernorm_zero_centered_gammaapply_query_key_layer_scalingbias_activation_fusionbias_dropout_fusionattention_softmax_in_fp32	LayerNormnormalizationapply_rope_fusiongư>layernorm_epsilonNtransformer_layer_specr&   r   c                 C   sJ   | j }t|tstdd}t| || j| j| j| j| j	| j
| j| jd
}|S )NT)is_vit)rT   rV   rX   rf   rd   re   rY   rZ   )rx   r<   r   r   r   rT   rV   rX   rf   rd   re   rY   rZ   )rP   rx   rL   r7   r7   r8   configure_model   s    

z#Qwen2VLVisionConfig.configure_model)r&   r   )+__name__
__module____qualname____doc__rT   bool__annotations__rV   intrX   rY   rZ   r\   r^   r_   r`   ra   rb   rd   re   rf   rg   floatrh   ri   rj   r   rk   r   rm   rn   ro   rp   rq   rr   rs   ru   strrv   rw   rx   r   rz   r7   r7   r7   r8   rS   l   sB   
 rS   c                   @   s"  e Zd ZU dZdZee ed< dZee	 ed< dZ
ee ed< dZeed< dZeed	< d
Zeed< dZeed< dZeed< dZeed< dZeed< dZee ed< dZee ed< dZee ed< dZeed< dZeed< dZeed< eZeed< eZ eed< dd Z!d"dee ddfd d!Z"dS )#Qwen2VLConfigzQwen2VL Model Base ConfigNlanguage_transformer_configvision_transformer_configvision_projection_configFdrop_vision_class_tokenvision_feature_layerr   $encoder_pipeline_model_parallel_sizerU   "encoder_tensor_model_parallel_sizer\      r^   i   
seq_lengthlanguage_model_from_pretrainedvision_model_from_pretrained!vision_projection_from_pretrainedfreeze_language_modelfreeze_vision_modelfreeze_vision_projectionforward_step_fndata_step_fnc                 C   sD   | j d ur tD ]}t| |t| j | qd| j _g d| j _d S d S )Nmrope)r]      r   )r   r   setattrgetattrposition_embedding_typemrope_section)rP   attrr7   r7   r8   __post_init__   s   
zQwen2VLConfig.__post_init__vp_stager&   MCoreQwen2VLModelc              
   C   s   d| j _| j| j _| j| j _| j| j_| j| j_| j| j _| jdkrI| jdks+J d| j| j_| j| j_| j| j _| jdkrI| j| j_| j| j_|pLd}t	| |t
jd|dp]t
 | jkt
jd|dt
jd|dt
jd|dpvt
 | jk| j|d}|S )NFr   rU   z&ViT can only live on 1 pipeline stage.)ignore_virtualr   )config	tokenizerpre_processpost_processadd_encoderadd_decoderr   r   )r   #scatter_embedding_sequence_paralleltensor_model_parallel_sizesequence_parallelr   r   pipeline_model_parallel_sizer   r   r   psrA    get_pipeline_model_parallel_rankrB   r   )rP   r   r   rL   r7   r7   r8   rz      s:   










zQwen2VLConfig.configure_modelrO   )#r{   r|   r}   r~   r   r   r   r   r   rS   r   r   r   r   r   r   r   r   r\   r^   r   r   r   r   r   r   r   r   rN   r   r   rH   r   r   rz   r7   r7   r7   r8   r      s,   
 r   c                       s  e Zd ZdZ							d'dedededed	ed
ededee ddf fddZ			d(de	j
dee	j
 dee	j
 dee	j dee	je	jf f
ddZ										d)de	jdee	j dee	j
 dee	j dee	j dee dee	j dee	j dee	j
 dee	j
 dee de	jfddZ								d*de	jdee	j dee	j dee	j d ee	j d!ee	j dee	j d"ee dee	j fd#d$Zd+d%d&Z  ZS ),r   zQwen2VL Model Base Model ClassNTFr   r   r   r   r   r   r   r   r&   c	                    sn  t t| j|d |j}	|j}
|j}|| _|| _|| _|| _	|| _
d | _d | _d | _d | _|	j| _|	j| _d| _| j	rf|	j||||d| _| jj| _| jj| _|	jdk| _t| j|j td|j  n|jd urwtjti d|jdd | jr|
 | _| | _|| _ t| j|j! td|j!  | j"|j#|j$|j%d	 t&j'| _(t)|
j*|
j+|
j,| |
j-d
| _.d S )Nr   F)r   r   r   r   rU   z%Restored language model weights from )
state_dict)sharded_state_dictcheckpoint_dirvalidate_access_integrityz#Restored vision model weights from )r   r   r   )rY   rZ   rX   rT   rV   )/superMCoreLLaVAModel__init__r   r   r   r   r   r   r   r   encoder_hidden_statevision_modelvision_projectionlanguage_modelr   sequence_parallel_lmtp_comm_overlaptp_comm_overlap_lm#share_embeddings_and_output_weightsrz   max_sequence_length_language_max_sequence_lengthr   _language_is_pipeline_parallelr   r   r%   infor   loaddict_drop_vision_class_tokenr   freezer   r   r   r
   encoder_or_decoder
model_typer    rY   rZ   rX   rV   _img_seq_len)rP   r   r   r   r   r   r   r   r   r   r   r   	__class__r7   r8   r      sj   




zMCoreQwen2VLModel.__init__r(   r*   r,   attention_maskc           +   	   C   sx  d}t }t}d}g }	|dus|dur| }
|du r t|
}tjd|jd |jd |j|jd}d\}}t	|
D ]\}}||| dk }t
||kd}||d  }||k }||k }| }g }d}||}}t|| D ]}||v r|dkr|||}nt|d }||v r|dkr|||}nt|d }||k r|| d || d || d }}}|d7 }|d8 }|} n|| d || d || d }}}|d7 }|d8 }|} | | | | | }!}"}#| | }$t|dkr|d	  d nd}%|t|$dd	dd	|%  t|!d	dd	|"|#  }&t|"dd	d|!d	|# }'t|#ddd	|!|"d	 }(|t|&|'|(g|$ |%  | |!|" |#  }qv|t|k rt|dkr}|d	  d nd}%t|| }$|t|$dd	dd	|%  tj|dd
dd	})|)|j|d||| dkf< |	|) d t|
|   q:tj|	|jdd}	||	fS |dur| d	d }| |dkd |ddd	d	|j}|jdddd jd	ddd }*|*d |jd	  }	||	fS tj|jd |jdddd	d|jd d	}tj!|jd dg|j|jd}	||	fS )a$
  
        Calculate the 3D rope index based on image and video's temporal, height and width in LLM.

        Explanation:
            Each embedding sequence contains vision embedding and text embedding or just contains text embedding.

            For pure text embedding sequence, the rotary position embedding has no difference with mordern LLMs.
            Examples:
                input_ids: [T T T T T], here T is for text.
                temporal position_ids: [0, 1, 2, 3, 4]
                height position_ids: [0, 1, 2, 3, 4]
                width position_ids: [0, 1, 2, 3, 4]

            For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
            and 1D rotary position embeddin for text part.
            Examples:
                Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
                input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
                vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
                vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
                vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
                text temporal position_ids: [3, 4, 5, 6, 7]
                text height position_ids: [3, 4, 5, 6, 7]
                text width position_ids: [3, 4, 5, 6, 7]
                Here we calculate the text start position_ids as the max vision position_ids plus 1.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should
                you provide it.
            image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
                The temporal, height and width of feature shape of each image in LLM.
            video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
                The temporal, height and width of feature shape of each video in LLM.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

        Returns:
            position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`)
            mrope_position_deltas (`torch.Tensor` of shape `(batch_size)`)
        rc   idP Nr'   r   rU   dtypedevice)r   r   )dim.)r   F)keepdimT)r   r   )"r   r   clonetorch	ones_likeonesshaper   r   	enumerateargwheresqueezesumtolistrangeindexr>   itemmaxappendarangeviewexpandflattenstackcatreshapetorQ   	unsqueezelongcumsummasked_fill_zeros)+rP   r(   r*   r,   r   rd   image_token_idvideo_token_idvision_start_token_idmrope_position_deltastotal_input_idsr-   image_indexvideo_indexiinput_ids_item
_input_idsvision_start_indicesvision_tokens
image_nums
video_numsinput_tokensllm_pos_ids_liststremain_imagesremain_videos_ed_imageed_videothwed
llm_grid_t
llm_grid_h
llm_grid_wtext_lenst_idxt_indexh_indexw_indexllm_positionsmax_position_idsr7   r7   r8   get_rope_indexI  s   3












"$""""$ 
 
z MCoreQwen2VLModel.get_rope_indexr-   r/   r.   inference_paramsr)   r+   runtime_gather_outputc              	   C   s*  |duod|j v }|du}|du}d}|rd}nP| jr |s d}nH| jre|re|t| j j}| j||	d}| jrNt| jdd}|dd|dddf }| 	|}|durd|j
d |j
d  |j d< n| j}d}| jr|r|t| j j}| j||
d}| 	|}| js|S d}|j
d }|| jkr|ddd| jf }|dur|ddddd| jf }|dur|j
d | jkr|ddd| jf }|ddd| jf }| jr|| jk r| j| }tjj|d|f}|durtjj|d|f}|du r|dur| ||	|
|\}}| jrg| jrMt }t|j
d | d | | |j
d  }|dkrMtjj|d|f}|durMtjj|d|f}| }d||dk < | jj|dd	}|dd }| j|||||||d
\}}}}| jd||||||d}|du s|du r|S || fS )ar  Forward function of the Qwen2VL model.

        Args:
            input_ids (torch.Tensor): input text ids [batch, decoder_seq_len].
            attention_mask (torch.Tensor): Attention mask for the language model [batch, 1, combined_seq_len,
            combined_seq_len].
            position_ids (torch.Tensor): input text position ids [batch, decoder_seq_len].
            loss_mask (torch.Tensor): Text loss mask [batch, decoder_seq_len].
            labels (torch.Tensor): Optional target text labels [batch, combined_seq_len].
            inference_params (InferenceParams): Inference-time parameters including KV cache.
            pixel_values (torch.Tensor): input image of shape [images_total_patches,
            num_channels * temporal_size * patch_size * patch_size].
            pixel_values_videos (torch.Tensor): input video of shape [videos_total_patches,
            num_channels * temporal_size * patch_size * patch_size].
            image_grid_thw (torch.Tensor): The temporal, height and width of feature shape of each image.
            Shape [num_images, 3].
            video_grid_thw (torch.Tensor): The temporal, height and width of feature shape of each video.
            Shape [num_videos, 3].
            runtime_gather_output (bool): Gather output at runtime. Default None means
                `parallel_output` arg in the constructor will be used.
        Returns:
            output (torch.Tensor): Loss of shape [b, s] if labels are provided,
                otherwise logits of shape [b, s, vocab_size].
            loss_mask (torch.Tensor): Loss mask expanded to combined sequence length. Shape [b, s].
        Nimage_tokens_count)grid_thwrV   rU   rc   media_tokens_countr   )r(   r-   )r/   r.   language_embeddingsimage_embeddingsvideo_embeddingsr   )r(   r-   r   decoder_inputr.   r  r  )key_value_memory_dictr   r   r;   r   
parametersr   r   r   r   r   r   r   r   r   r   r   
functionalpadr  r   r   r   $get_tensor_model_parallel_world_sizer   r   r   	embedding	transpose
contiguous_preprocess_data)rP   r(   r   r-   r/   r.   r  r)   r+   r*   r,   r  use_inference_kv_cache
has_images
has_videosr  rV   r  r  language_seq_lenpadded_seq_lenr  tp_world_sizeinput_ids_textcombined_embeddingsfinal_labelsfinal_loss_maskfinal_attention_maskrG   r7   r7   r8   forward  s   (







&



zMCoreQwen2VLModel.forwardr  r  r  r%  c
                 C   s(  | j sJ d| js| jsdS |r||||	fS |j\}
}|du}|r4|j|jks4J d|j d|j |du}|du}d}| jr|}|r||tk  }|jd }||kr`td| d| |tkd		|
|j}|
|j|j}|||}|r|tk  }|jd }||krtd
| d| |tkd		|
|j}|
|j|j}|||}d\}}| jr|r| jr|jd | jk r| j}tj|
|ft|j|jd}tj|
|fd|j|jd}|ddddf |ddd|jd f< |ddddf |ddd|jd f< n||}}|dur?|dur?|jdd |j  kr:|jks?J d J d|durh|jd | jkrX|ddd| jf }|dd }| jrht|}|duot|jd | jk}|r|ddd| jf }|ddd| jf }||||	fS )a  
        MCoreQwen2VLModel uses its own version of _preprocess_data instead of MCoreLLaVAModel's (in
        megatron-lm/megatron/core/models/multimodal/llava_model.py)

        This function handles several data preprocess requirements:
            - merge image and/or video embeddings into language embedding
            - padding inputs variables (e.g. labels/loss masks) for pipeline_parallel case
            - truncate inputs variables (e.g. labels/loss masks) if exceeding max seq length

        This function won't shift labels as forward() and _preprocess_data() in MCoreQwen2VLModel
        expect labels from input arguments already handle this shift.

        About merging image/video embeddings: language_embeddings may include num of imgage_token
        placeholders, and this function will put each imgage_token from image_embeddings into
        placeholder within language_embeddings(1:1 mapping), when image_embeddings/video_embeddings
        is available and it's the 1st pipeline_parallel stage
        z>input text preprocessing is only needed for the language model)NNNNNzmismatching labels shape z and loss mask shape r   z6Image features and image tokens do not match: tokens: z, features r   z6Video features and video tokens do not match: tokens: )NNrU   r   rc   z*unexpected shapes after data preprocessing)r   r   r   r   r   r   r   
ValueErrorr   	expand_asr   r   r   masked_scatterr   r   r   r   fullr   r"  r#  r   r   )rP   r(   r/   r.   r  r  r  r-   r%  r   
batch_sizer(  
has_labelsr&  r'  final_embeddingn_image_tokensn_image_features
image_maskn_video_tokensn_video_features
video_maskr-  r.  max_seq_lentruncate_labelsr7   r7   r8   r$    s   


*,
&
z"MCoreQwen2VLModel._preprocess_datac                 C   s   t |ts|g}t|dksJ d| jr"| jr"| j|d  dS | jr/| j|d  dS | jr9|d | _dS | j	|d  dS )zSet model chunk input tensor.rU   z.input_tensor should only be length 1 for llavar   N)
r<   listr>   r   r   r   rR   r   r   r   )rP   input_tensorr7   r7   r8   rR     s   
z"MCoreQwen2VLModel.set_input_tensor)NTTTTFNNNN)
NNNNNNNNNN)NNNNNNFN)r&   N)r{   r|   r}   r~   r   r   r   r   r   r   
LongTensorTensorr   r  r   FloatTensorr0  r$  rR   __classcell__r7   r7   r   r8   r      s    	
S
 	

 &	

 r   c                       s^  e Zd ZdZ			d&dedee ded deeej	gej	f  f fdd	Z
d'd
ee ddfddZ									d(dejdeej deej deej deej dee deej deej deej deej dejfddZdeeejf fddZdejfddZd'dejfddZd'dejfd d!Zedefd"d#Zedefd$d%Z  ZS ))Qwen2VLModelz#Lightning Wrapper for Qwen2VL ModelNr   optimr   r   model_transformc                    sP   t    || _|| _|pttdddd| _| j|  || _d | _	d | _
d S )Ng-C6?T)lruse_distributed_optimizerr   )r   r   r   r   r#   r   rH  connectrI  _training_loss_reduction_validation_loss_reduction)rP   r   rH  r   rI  r   r7   r8   r   '  s   

zQwen2VLModel.__init__r   r&   c                 C   s&   t | ds| jj| j|d| _d S d S )Nmodule)r   )hasattrr   rz   r   rO  )rP   r   r7   r7   r8   rz   8  s   
zQwen2VLModel.configure_modelr(   r   r-   r/   r.   r  r)   r+   r*   r,   c                 C   s"   | j |||||||||	|
d
}|S )N)
r(   r   r-   r/   r.   r  r)   r+   r*   r,   )rO  )rP   r(   r   r-   r/   r.   r  r)   r+   r*   r,   output_tensorr7   r7   r8   r0  =  s   zQwen2VLModel.forwardc                 C   s   | j |S rO   )r   r   )rP   rD   r7   r7   r8   	data_stepZ  s   zQwen2VLModel.data_stepc                 C   s   | j | |S rO   )r   r   )rP   rE   r7   r7   r8   forward_step^  s   zQwen2VLModel.forward_stepc                 C   
   |  |S rO   rS  rP   rE   	batch_idxr7   r7   r8   training_stepb  s   
zQwen2VLModel.training_stepc                 C   rT  rO   rU  rV  r7   r7   r8   validation_stepg  s   
zQwen2VLModel.validation_stepc                 C   s   | j st | _ | j S rO   )rM  r"   rP   r7   r7   r8   training_loss_reductionm  s   z$Qwen2VLModel.training_loss_reductionc                 C   s   | j s	tdd| _ | j S )NT)rY  )rN  r"   rZ  r7   r7   r8   validation_loss_reductionu  s   z&Qwen2VLModel.validation_loss_reductionrB  rO   )	NNNNNNNNN)r{   r|   r}   r~   r   r   r$   r   r   Moduler   r   rz   r   rD  rC  r   rE  r0  r   r   rR  rS  rX  rY  propertyr"   r[  r\  rF  r7   r7   r   r8   rG  $  sn    	

rG  )rG  r   rH   rN   )Rdataclassesr   typingr   r   r   r   lightning.pytorchpytorchLr   torch.distributedr:   r   r	   r   megatron.core.enumsr
   megatron.core.inference_paramsr   +megatron.core.models.multimodal.llava_modelr   r   megatron.core.optimizerr   megatron.core.tensor_parallelr   $megatron.core.transformer.spec_utilsr   ,megatron.core.transformer.transformer_configr   megatron.core.utilsr   r   1nemo.collections.common.tokenizers.tokenizer_specr   nemo.collections.llmr   "nemo.collections.llm.fn.activationr   #nemo.collections.llm.gpt.model.baser   $nemo.collections.llm.gpt.model.qwen2r    nemo.collections.vlm.layer_specsr   $nemo.collections.vlm.neva.model.baser   r   3nemo.collections.vlm.qwen2vl.data.multimodal_tokensr   r   r   )nemo.collections.vlm.qwen2vl.model.visionr   nemo.collections.vlm.visionr    nemo.collections.vlm.vision.baser    nemo.lightningr!    nemo.lightning.megatron_parallelr"   nemo.lightning.pytorch.optimr#   r$   
nemo.utilsr%   r   rD  rH   rN   rR   IOMixinrS   r   r   LightningModuleConnectorMixinFNMixinrG  __all__r7   r7   r7   r8   <module>   sZ   #8P    2Z