o
    wi                     @   sF  d dl mZmZ d dlmZmZmZmZmZ d dl	m
Z d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlm Z  d dl!m"Z" d dlm#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z/ d dl0m1Z1m2Z2 d dl3m4Z4m5Z5m6Z6 d dl7m8Z8m9Z9 d dl:m;Z; d dl<m=Z= d dl>m?Z? d dl@mAZA d dlBmCZCmDZD d dlEmFZF deeGejHf fdd ZIdejHfd!d"ZJd#d$ ZKeG d%d& d&e e?jLZMeG d'd( d(e e?jLZNeG d)d* d*e e?jLZOG d+d, d,eZPG d-d. d.ejQe?jLe?jRe'jSZTg d/ZUdS )0    )	dataclassfield)CallableDictListOptionalTupleN)dist_checkpointingparallel_state)	ModelType)InferenceParams)
LLaVAModel)OptimizerConfig)#scatter_to_sequence_parallel_region)
ModuleSpec)TransformerConfig)get_batch_on_this_cp_rank)nn)TokenizerSpec)fn)
quick_gelu)get_packed_seq_params)Qwen2Config)get_layer_spec_te)MODEL_CONFIG_ATTRrestore_model_weights)IGNORE_INDEXIMAGE_TOKEN_INDEXVIDEO_TOKEN_INDEX)Qwen2VisionModelQwen25VisionModel)MultimodalProjectorConfig)get_image_sequence_length)io)$MaskedTokenLossReductionWithLossMask)MegatronOptimizerModuleOptimizerModule)loggingreturnc                    s   ddl m} t| }t|trt|dkr|d }n|}t  |dkr) d n	|dkr2 d | r; d |	 rD d	  fd
d|
 D }t|}|S )zQwen2VL Data Stepr   r
      qwen2-vl)	input_idspixel_valuesimage_grid_thwpixel_values_videosvideo_grid_thw	qwen25-vl)r,   r-   r.   r/   r0   second_per_grid_ts)position_ids)labels	loss_maskc                    s2   i | ]\}}|| v r|d ur|j ddnd qS )NT)non_blocking)cuda).0keyvalrequired_keys d/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/vlm/qwen2vl/model/base.py
<dictcomp>W   s     z%qwen2vl_data_step.<locals>.<dictcomp>)megatron.corer   next
isinstancetuplelensetupdateis_pipeline_first_stageis_pipeline_last_stageitemsr   )dataloader_itermodel_versionr   batch_batchoutputr=   r;   r>   qwen2vl_data_step2   s.   



rO   c                 C   st   |d | dd | dd | dd | dd | dd | dd | dd d	}d
|v r3t||d< | di |S )Nr,   r-   r.   r/   r0   r2   r5   r4   )r,   r-   r.   r/   r0   r2   r5   r4   
cu_seqlenspacked_seq_paramsr=   )getr   )modelrL   forward_argsr=   r=   r>   qwen2vl_forward_step`   s   







rU   c                 C   s   d S Nr=   )selftensorr=   r=   r>   set_input_tensorq   s   rY   c                   @   s  e Zd ZU dZdZeed< dZeed< dZ	eed< dZ
eed	< dZeed
< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< eZeed< d Zeed!< dZeed"< dZeed#< dZ eed$< dZ!eed%< dZ"eed&< dZ#eed'< d(Z$e%ed)< dZ&eed*< d+Z'eed,< d-Z(e)ed.< d/Z*e%ed0< d5d3d4Z+d-S )6Qwen2VLVisionConfigzQwen2VL Vision Model ConfigFadd_class_token   class_token_len   	patch_dimP  img_himg_w    
num_layers   num_attention_headsTadd_bias_linearadd_qkv_bias   	embed_dimhidden_size   spatial_merge_sizespatial_patch_sizetemporal_patch_size        hidden_dropoutattention_dropouti   ffn_hidden_sizegated_linear_unitactivation_funcP   kv_channelsnum_query_groupslayernorm_zero_centered_gammaapply_query_key_layer_scalingbias_activation_fusionbias_dropout_fusionattention_softmax_in_fp32	LayerNormnormalizationapply_rope_fusionư>layernorm_epsilonNtransformer_layer_specr+   rK   r)   r    c                 C   J   | j }t|tstdd}t| || j| j| j| j| j	| j
| j| jd
}|S NT)is_vit)r[   r]   r_   ro   rm   rn   ra   rb   )r   rB   r   r   r    r[   r]   r_   ro   rm   rn   ra   rb   rW   r   rS   r=   r=   r>   configure_model       

z#Qwen2VLVisionConfig.configure_model)r)   r    ),__name__
__module____qualname____doc__r[   bool__annotations__r]   intr_   ra   rb   rd   rf   rg   rh   rj   rk   rm   rn   ro   rq   floatrr   rs   rt   r   ru   r   rw   rx   ry   rz   r{   r|   r}   r   strr   r   r   r   rK   r   r=   r=   r=   r>   rZ   v   sD   
 rZ   c                   @   s  e Zd ZU dZdZeed< dZeed< dZ	eed< dZ
eed	< dZeed
< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< ejjjZeed< d Z eed!< dZ!eed"< dZ"eed#< dZ#eed$< dZ$eed%< dZ%eed&< dZ&eed'< d(Z'e(ed)< dZ)eed*< d+Z*eed,< d-Z+e,ed.< e-d/d0 d1Z.e/e ed2< d3Z0e(ed4< d9d7d8Z1d-S ):Qwen25VLVisionConfigzQwen2.5VL Vision Model ConfigFr[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   Trg   rh   ri   rj   rk   rl   rm   rn   ro   rp   rq   rr   i\  rs   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   RMSNormr   r   r   r   Nr   c                   C   s   g dS )N)            r=   r=   r=   r=   r>   <lambda>   s    zQwen25VLVisionConfig.<lambda>)default_factoryfullatt_block_indexesr1   rK   r)   r!   c                 C   r   r   )r   rB   r   r   r!   r[   r]   r_   ro   rm   rn   ra   rb   r   r=   r=   r>   r      r   z$Qwen25VLVisionConfig.configure_model)r)   r!   )2r   r   r   r   r[   r   r   r]   r   r_   ra   rb   rd   rf   rg   rh   rj   rk   rm   rn   ro   rq   r   rr   rs   rt   torchr   
functionalsiluru   r   rw   rx   ry   rz   r{   r|   r}   r   r   r   r   r   r   r   r   r   rK   r   r=   r=   r=   r>   r      sF   
 r   c                   @   s&  e Zd ZU dZdZee ed< dZee	e
B  ed< dZee ed< dZeed< dZeed	< d
Zeed< dZeed< dZeed< dZeed< dZeed< dZee ed< dZee ed< dZee ed< dZeed< dZeed< dZeed< eZeed< e Z!eed< dd Z"d"dee ddfd d!Z#dS )#Qwen2VLConfigzQwen2VL Model Base ConfigNlanguage_transformer_configvision_transformer_configvision_projection_configFdrop_vision_class_tokenvision_feature_layerr   $encoder_pipeline_model_parallel_sizer\   "encoder_tensor_model_parallel_sizerd      rf   i   
seq_lengthlanguage_model_from_pretrainedvision_model_from_pretrained!vision_projection_from_pretrainedfreeze_language_modelfreeze_vision_modelfreeze_vision_projectionforward_step_fndata_step_fnc                 C   sD   | j d ur tD ]}t| |t| j | qd| j _g d| j _d S d S )Nmrope)re      r   )r   r   setattrgetattrposition_embedding_typemrope_section)rW   attrr=   r=   r>   __post_init__  s   
zQwen2VLConfig.__post_init__vp_stager)   MCoreQwen2VLModelc              
   C   sZ  d| j _| j| j _| j| j _| j| j _| j| j_| j| j_| j| j _| jdkrN| jdks0J d| j| j_| j| j_| j| j _| j	dkrN| j	| j_| j	| j_g d}| j | j| jfD ]}|D ]}t
||t| | q^qZ| j| j _d| j_d| j_|p{d}t| |tjd|dpt | jktjd|dtjd|dtjd|dpt | jk| j|d}|S )NFr   r\   z&ViT can only live on 1 pipeline stage.)cross_entropy_loss_fusionenable_cuda_graphuse_te_rng_trackergradient_accumulation_fusionr{   r|   masked_softmax_fusionr}   r   overlap_p2p_commbatch_p2p_comm)ignore_virtualr   )config	tokenizerpre_processpost_processadd_encoderadd_decoderr   r   )r   #scatter_embedding_sequence_paralleltensor_model_parallel_sizesequence_parallelcontext_parallel_sizer   r   pipeline_model_parallel_sizer   r   r   r   tp_comm_overlapr   psrG    get_pipeline_model_parallel_rankrH   r   )rW   r   r   config_attrsr   r   rS   r=   r=   r>   r     sR   












zQwen2VLConfig.configure_modelrV   )$r   r   r   r   r   r   r   r   r   rZ   r   r   r"   r   r   r   r   r   r   rd   rf   r   r   r   r   r   r   r   r   rU   r   r   rO   r   r   r   r=   r=   r=   r>   r      s,   
 r   c                       s  e Zd ZdZ							d(dedededed	ed
ededee ddf fddZ					d)dee	j
 dee	j
 dee	j
 dee	j dee	j dee	je	jf fddZ											d*de	jdee	j dee	j
 dee	j dee	j dee dee	j dee	j dee	j
 dee	j
 dee dee	j de	jfddZ								d+de	jdee	j dee	j d ee	j d!ee	j d"ee	j dee	j d#ee dee	j fd$d%Zd,d&d'Z  ZS )-r   zQwen2VL Model Base Model ClassNTFr   r   r   r   r   r   r   r   r)   c	                    s  t t| j|d |j}	|j}
|j}|
j| _| jd usJ || _|| _|| _	|| _
|| _|| _d | _d | _d | _d | _|	j| _|	j| _|	j| _d| _| jrx|	j||||d| _| jj| _| jj| _|	jdk| _t| j|j t d|j  n|jd urt!j"t#i d|jdd | j
r|
 | _| | _|| _$t| j|j% t d|j%  | j&|j'|j(|j)d	 t*j+| _,t-|
j.|
j/|
j0| |
j1d
| _2d S )Nr   F)r   r   r   r   r\   z%Restored language model weights from )
state_dict)sharded_state_dictcheckpoint_dirvalidate_access_integrityz#Restored vision model weights from )r   r   r   )ra   rb   r_   r[   r]   )3superMCoreLLaVAModel__init__r   r   r   rK   r   r   r   r   r   r   encoder_hidden_statevision_modelvision_projectionlanguage_modelr   sequence_parallel_lmr   tp_comm_overlap_lmr   context_parallel_lm#share_embeddings_and_output_weightsr   max_sequence_length_language_max_sequence_lengthr   _language_is_pipeline_parallelr   r   r(   infor	   loaddict_drop_vision_class_tokenr   freezer   r   r   r   encoder_or_decoder
model_typer#   ra   rb   r_   r]   _img_seq_len)rW   r   r   r   r   r   r   r   r   r   r   r   	__class__r=   r>   r   ]  sr   




zMCoreQwen2VLModel.__init__r,   r.   r0   r2   attention_maskc           2   	   C   s   d}t }t}d}	d}
|dur| }g }|dus|dur$| }|du r*t|}tjd|jd |jd |j|j	d}d\}}|
|j	}t|D ]\}}||| dk }d\}}t||	kd}||d  }||k }||k }| }g }d}||}}t|| D ]/}||v r|dkr|||}nt|d }||v r|dkr|||}nt|d }||k r|| d || d || d }} }!d}"|d7 }|d8 }|}#n/|| d || d || d }} }!| jd	kr|dur|| }"nd
}"|d7 }|d8 }|}#| |  | |! | }$}%}&|#| }'t|dkr0|d  d nd}(|t|'dddd|(  | jdkr\t|$ddd|%|&  })n%| jd	krt|$dd}*|*d|%|& }+|+|" |
 },|, }-|- })t|%ddd|$d|& }.t|&ddd|$|%d }/|t|)|.|/g|' |(  |#|$|% |&  }q|t|k rt|dkr|d  d nd}(t|| }'|t|'dddd|(  tj|dddd}0|0
|j	|d||| dkf< ||0 d t||   qJtj||j	d d}||fS |dure| !dd }|"|dkd | dddd
|j	}|jdddd jdddd }1|1d |jd  }||fS tj|jd |j	ddddd|jd d}tj#|jd dg|j	|jd}||fS )aX  
        Calculate the 3D rope index based on image and video's temporal, height and width in LLM.

        Explanation:
            Each embedding sequence contains vision embedding and text embedding or just contains text embedding.

            For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs.
            Examples:
                input_ids: [T T T T T], here T is for text.
                temporal position_ids: [0, 1, 2, 3, 4]
                height position_ids: [0, 1, 2, 3, 4]
                width position_ids: [0, 1, 2, 3, 4]

            For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
            and 1D rotary position embedding for text part.
            Qwen2-VL and Qwen25-VL has differnt type:
            Qwen2-VL Examples:
                Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
                input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
                vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
                vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
                vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
                text temporal position_ids: [3, 4, 5, 6, 7]
                text height position_ids: [3, 4, 5, 6, 7]
                text width position_ids: [3, 4, 5, 6, 7]
            Qwen25-VL Examples:
                Temporal (Time): 3 patches, representing different segments of the video in time.
                Height: 2 patches, dividing each frame vertically.
                Width: 2 patches, dividing each frame horizontally.
                We also have some important parameters:
                fps (Frames Per Second): The video's frame rate, set to 1. This means one frame is processed each
                    second.
                tokens_per_second: This is a crucial parameter. It dictates how many "time-steps" or "temporal tokens"
                    are conceptually packed into a one-second interval of the video. In this case, we have 25 tokens
                    per second. So each second of the video will be represented with 25 separate time points. It
                    essentially defines the temporal granularity.
                temporal_patch_size: The number of frames that compose one temporal patch. Here, it's 2 frames.
                interval: The step size for the temporal position IDs, calculated as
                    tokens_per_second * temporal_patch_size / fps. In this case, 25 * 2 / 1 = 50. This means that each
                    temporal patch will be have a difference of 50 in the temporal position IDs.
                input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
                vision temporal position_ids: [0, 0, 0, 0, 50, 50, 50, 50, 100, 100, 100, 100]
                vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
                vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
                text temporal position_ids: [101, 102, 103, 104, 105]
                text height position_ids: [101, 102, 103, 104, 105]
                text width position_ids: [101, 102, 103, 104, 105]
                Here we calculate the text start position_ids as the max vision position_ids plus 1.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.
            image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
                The temporal, height and width of feature shape of each image in LLM.
            video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
                The temporal, height and width of feature shape of each video in LLM.
            second_per_grid_ts (`torch.Tensor` of shape `(num_videos)`, *optional*):
                The time interval (in seconds) for each grid along the temporal dimension in the 3D position IDs.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

        Returns:
            position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`)
            mrope_position_deltas (`torch.Tensor` of shape `(batch_size)`)
        rl   idP Nr*   r   r\   dtypedevice)r   r   r1   g      ?r+   )dim.)r   F)keepdimT)r   r   )$r   r   cpucloner   	ones_likeonesshaper   r   to	enumerateargwheresqueezesumtolistrangeindexrD   rK   itemmaxappendarangeviewexpandflattenlongstackcatreshaperX   	unsqueezecumsummasked_fill_zeros)2rW   r,   r.   r0   r2   r   rm   image_token_idvideo_token_idvision_start_token_idtokens_per_secondmrope_position_deltastotal_input_idsr3   image_indexvideo_indexiinput_ids_item
_input_ids
image_nums
video_numsvision_start_indicesvision_tokensinput_tokensllm_pos_ids_liststremain_imagesremain_videos_ed_imageed_videothwsecond_per_grid_ted
llm_grid_t
llm_grid_h
llm_grid_wtext_lenst_idxt_indexrange_tensorexpanded_rangetime_tensortime_tensor_longh_indexw_indexllm_positionsmax_position_idsr=   r=   r>   get_rope_index  s   M













"$$"""$ 
 
z MCoreQwen2VLModel.get_rope_indexr3   r5   r4   inference_paramsr-   r/   runtime_gather_outputc              	      s  |duod|j v }|du}|du}d}|rd}n| jr |s d}n| jr|r|t| j j}| jjrPt	
  | j||	d}W d   n1 sJw   Y  n| j||	d}| jdkr`| jjnd}| jrt| jdd |dd dddf }| jdkr fdd|D }nd}| |}| jdkrt	|}||ddf }|dur|jd |jd	  |j d
< n| j}d}| jr|r|t| j j}| jjrt	
  | j||
d}W d   n1 sw   Y  n| j||
d}| |}| js|S d}|jd }|| jkrH|ddd| jf }|dur$|ddddd| jf }|durH|jd | jkrH|ddd| jf }|ddd| jf }| jrp|| jk rp| j| }t	jj|d|f}|durpt	jj|d|f}|du r|dur| ||	|
||\}}| jr| jrt }t|jd | d | | |jd  }|dkrt	jj|d|f}|durt	jj|d|f}| }d||dk < | j j!|dd}|"dd# }| j$|||||||d\}}}}| j d||||||d}|du s|du r|S ||# fS )ar  Forward function of the Qwen2VL model.

        Args:
            input_ids (torch.Tensor): input text ids [batch, decoder_seq_len].
            attention_mask (torch.Tensor): Attention mask for the language model [batch, 1, combined_seq_len,
            combined_seq_len].
            position_ids (torch.Tensor): input text position ids [batch, decoder_seq_len].
            loss_mask (torch.Tensor): Text loss mask [batch, decoder_seq_len].
            labels (torch.Tensor): Optional target text labels [batch, combined_seq_len].
            inference_params (InferenceParams): Inference-time parameters including KV cache.
            pixel_values (torch.Tensor): input image of shape [images_total_patches,
            num_channels * temporal_size * patch_size * patch_size].
            pixel_values_videos (torch.Tensor): input video of shape [videos_total_patches,
            num_channels * temporal_size * patch_size * patch_size].
            image_grid_thw (torch.Tensor): The temporal, height and width of feature shape of each image.
            Shape [num_images, 3].
            video_grid_thw (torch.Tensor): The temporal, height and width of feature shape of each video.
            Shape [num_videos, 3].
            runtime_gather_output (bool): Gather output at runtime. Default None means
                `parallel_output` arg in the constructor will be used.
        Returns:
            output (torch.Tensor): Loss of shape [b, s] if labels are provided,
                otherwise logits of shape [b, s, vocab_size].
            loss_mask (torch.Tensor): Loss mask expanded to combined sequence length. Shape [b, s].
        Nimage_tokens_count)grid_thwr1   r]   r\   c                    s   g | ]
}| kr|  qS r=   r=   )r8   idxr]   r=   r>   
<listcomp>  s    z-MCoreQwen2VLModel.forward.<locals>.<listcomp>rl   media_tokens_countr   )r,   r3   )r5   r4   language_embeddingsimage_embeddingsvideo_embeddingsr   )r,   r3   r   decoder_inputr4   r>  r?  )%key_value_memory_dictr   r   rA   r   
parametersr   r   r   r   no_gradrK   window_indexr   r   r   argsortr   r   r   r   r   r   r   padr=  r   r   r   $get_tensor_model_parallel_world_sizer   r   r   	embedding	transpose
contiguous_preprocess_data)rW   r,   r   r3   r5   r4   r>  r-   r/   r.   r0   r?  r2   use_inference_kv_cache
has_images
has_videosrG  rM  reverse_indicesrH  rF  language_seq_lenpadded_seq_lenr'  tp_world_sizeinput_ids_textcombined_embeddingsfinal_labelsfinal_loss_maskfinal_attention_maskrN   r=   rC  r>   forwardr  s   )














&



zMCoreQwen2VLModel.forwardrF  rG  rH  rU  c
                 C   s(  | j sJ d| js| jsdS |r||||	fS |j\}
}|du}|r4|j|jks4J d|j d|j |du}|du}d}| jr|}|r||tk  }|jd }||kr`td| d| |tkd		|
|j}|
|j|j}|||}|r|tk  }|jd }||krtd
| d| |tkd		|
|j}|
|j|j}|||}d\}}| jr|r| jr|jd | jk r| j}tj|
|ft|j|jd}tj|
|fd|j|jd}|ddddf |ddd|jd f< |ddddf |ddd|jd f< n||}}|dur?|dur?|jdd |j  kr:|jks?J d J d|durh|jd | jkrX|ddd| jf }|dd }| jrht|}|duot|jd | jk}|r|ddd| jf }|ddd| jf }||||	fS )a  
        MCoreQwen2VLModel uses its own version of _preprocess_data instead of MCoreLLaVAModel's (in
        megatron-lm/megatron/core/models/multimodal/llava_model.py)

        This function handles several data preprocess requirements:
            - merge image and/or video embeddings into language embedding
            - padding inputs variables (e.g. labels/loss masks) for pipeline_parallel case
            - truncate inputs variables (e.g. labels/loss masks) if exceeding max seq length

        This function won't shift labels as forward() and _preprocess_data() in MCoreQwen2VLModel
        expect labels from input arguments already handle this shift.

        About merging image/video embeddings: language_embeddings may include num of imgage_token
        placeholders, and this function will put each imgage_token from image_embeddings into
        placeholder within language_embeddings(1:1 mapping), when image_embeddings/video_embeddings
        is available and it's the 1st pipeline_parallel stage
        z>input text preprocessing is only needed for the language model)NNNNNzmismatching labels shape z and loss mask shape r   z6Image features and image tokens do not match: tokens: z, features r   z6Video features and video tokens do not match: tokens: )NNr\   r   rl   z*unexpected shapes after data preprocessing)r   r   r   r   r   r   r  
ValueErrorr  	expand_asr   r   r   masked_scatterr   r   r   r   fullr   rR  rS  r   r   )rW   r,   r5   r4   rF  rG  rH  r3   rU  r   
batch_sizerY  
has_labelsrV  rW  final_embeddingn_image_tokensn_image_features
image_maskn_video_tokensn_video_features
video_maskr^  r_  max_seq_lentruncate_labelsr=   r=   r>   rT  -  s   


*,
&
z"MCoreQwen2VLModel._preprocess_datac                 C   s   t |ts|g}t|dksJ d| jr"| jr"| j|d  dS | jr/| j|d  dS | jr9|d | _dS | j	|d  dS )zSet model chunk input tensor.r\   z.input_tensor should only be length 1 for llavar   N)
rB   listrD   r   r   r   rY   r   r   r   )rW   input_tensorr=   r=   r>   rY     s   
z"MCoreQwen2VLModel.set_input_tensor)NTTTTFN)NNNNN)NNNNNNNNNNN)NNNNNNFN)r)   N)r   r   r   r   r   r   r   r   r   r   
LongTensorTensorr   r=  r   FloatTensorra  rT  rY   __classcell__r=   r=   r   r>   r   Z  s    	
V
 E	

 ?	

 r   c                       sn  e Zd ZdZ			d(dededee ded deee	j
ge	j
f  f
 fd	d
Zd)dee ddfddZ										d*dejdeej deej deej deej dee deej deej deej deej deej dejfddZdeeejf fddZdejfddZd)dejfd d!Zd)dejfd"d#Zedefd$d%Zedefd&d'Z  ZS )+Qwen2VLModelz#Lightning Wrapper for Qwen2VL ModelNr   rK   optimr   r   model_transformc                    sh   t    || _|| _|pttdddd| _| j|  || _d | _	d | _
|| _| jdv s2J dd S )Ng-C6?T)lruse_distributed_optimizerr   )r+   r1   z3model_version only supports qwen2-vl and qwen25-vl.)r   r   r   r   r&   r   rx  connectry  _training_loss_reduction_validation_loss_reductionrK   )rW   r   rK   rx  r   ry  r   r=   r>   r     s   
	zQwen2VLModel.__init__r   r)   c                 C   s&   t | ds| jj| j|d| _d S d S )Nmodule)r   )hasattrr   r   r   r  )rW   r   r=   r=   r>   r     s   
zQwen2VLModel.configure_modelr,   r   r3   r5   r4   r>  r-   r/   r.   r0   r2   c                 C   s$   | j |||||||||	|
|d}|S )N)r,   r   r3   r5   r4   r>  r-   r/   r.   r0   r2   )r  )rW   r,   r   r3   r5   r4   r>  r-   r/   r.   r0   r2   output_tensorr=   r=   r>   ra    s   zQwen2VLModel.forwardc                 C   s   | j || jS rV   )r   r   rK   )rW   rJ   r=   r=   r>   	data_step  s   zQwen2VLModel.data_stepc                 C   s   | j | |S rV   )r   r   )rW   rL   r=   r=   r>   forward_step  s   zQwen2VLModel.forward_stepc                 C   
   |  |S rV   r  rW   rL   	batch_idxr=   r=   r>   training_step  s   
zQwen2VLModel.training_stepc                 C   r  rV   r  r  r=   r=   r>   validation_step  s   
zQwen2VLModel.validation_stepc                 C   s   | j st | _ | j S rV   )r}  r%   rW   r=   r=   r>   training_loss_reduction  s   z$Qwen2VLModel.training_loss_reductionc                 C   s   | j s	tdd| _ | j S )NT)r  )r~  r%   r  r=   r=   r>   validation_loss_reduction'  s   z&Qwen2VLModel.validation_loss_reduction)NNNrV   )
NNNNNNNNNN)r   r   r   r   r   r   r   r'   r   r   Moduler   r   r   r   rt  rs  r   ru  ra  r   r  r  r  r  propertyr%   r  r  rv  r=   r=   r   r>   rw    sx    	

rw  )rw  r   rO   rU   )Vdataclassesr   r   typingr   r   r   r   r   lightning.pytorchpytorchLr   torch.distributedr@   r	   r   r   megatron.core.enumsr   megatron.core.inference_paramsr   +megatron.core.models.multimodal.llava_modelr   r   megatron.core.optimizerr   megatron.core.tensor_parallelr   $megatron.core.transformer.spec_utilsr   ,megatron.core.transformer.transformer_configr   megatron.core.utilsr   r   1nemo.collections.common.tokenizers.tokenizer_specr   nemo.collections.llmr   "nemo.collections.llm.fn.activationr   #nemo.collections.llm.gpt.model.baser   $nemo.collections.llm.gpt.model.qwen2r    nemo.collections.vlm.layer_specsr   $nemo.collections.vlm.neva.model.baser   r   3nemo.collections.vlm.qwen2vl.data.multimodal_tokensr   r   r   )nemo.collections.vlm.qwen2vl.model.visionr    r!   nemo.collections.vlm.visionr"    nemo.collections.vlm.vision.baser#   nemo.lightningr$    nemo.lightning.megatron_parallelr%   nemo.lightning.pytorch.optimr&   r'   
nemo.utilsr(   r   rt  rO   rU   rY   IOMixinrZ   r   r   r   LightningModuleConnectorMixinFNMixinrw  __all__r=   r=   r=   r>   <module>   s^   .9:n    {_