o
    wiz                     @   sr  d dl mZ d dlmZmZ d dlZd dlm  mZ	 d dl
mZmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dl m!Z! d dl"m#Z#m$Z$ d dl%m&Z& d dl'm(Z(m)Z)m*Z* d dlm+Z+ z
d dl,m-Z- dZ.W n e/y   dZ.Y nw G dd de#Z0G dd dejj1Z2G dd deZ3G dd deZ4dS )    )nullcontext)OptionalUnionN)parallel_statetensor_parallel)has_config_logger_enabledlog_config_to_disk)	Fp8Recipe)get_fp8_context)BaseInferenceContext)	jit_fuser)VisionModulePackedSeqParams)ModelCommProcessGroups)	ModelType)
ModuleSpec)TransformerBlockTransformerBlockSubmodules)TransformerConfig)WrappedTensordeprecate_inference_paramsmake_viewless_tensor)Tensor)te_checkpointTFc                       s  e Zd ZdZ					d dedeeef dededed	e	d
e
e f fddZ										d!dddeeef de
e de
e de
e de
e de
e de
e de
e de
e de
e de
e de
e de
e fddZdededededededededefddZ  ZS )"Qwen25VLVisionTransformerBlockz
    Qwen25-VL Vision Transformer block, with either window attention or full attention in each layer.
    Window attention is achieved by specifying packed_seq_params.
    TNconfigspecpost_layer_normpre_processpost_processmodel_comm_pgsvp_stagec              	      s   t  ||||||| d S N)super__init__)selfr   r   r   r   r    r!   r"   	__class__ f/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/vlm/qwen2vl/model/vision.pyr%   1   s   
z'Qwen25VLVisionTransformerBlock.__init__)inference_paramshidden_statesattention_maskcontextcontext_maskrotary_pos_embrotary_pos_cosrotary_pos_sinattention_biasinference_contextpacked_seq_paramssequence_len_offsetpacked_seq_params_fullr+   c                C   s  t |	|}	t|tr| }| js| j}t|ddd}| jjr&t	
  }nt }| jjo3| jjtjk}| jjo>| jjtjk}|rFt| jnt }| | | jjdkrg| jrg| j|||||||
||d	}nqt| jD ]k\}}| jjdur|| jjv r|}n|
}|rt| j|jd nt }| j+ | ||||||||||	||d\}}W d   n1 sw   Y  W d   n1 sw   Y  t r| jjr| jdur| |}qlW d   n1 sw   Y  W d   n1 sw   Y  | jdur| |}t|ddd}|S )a  
        Perform the forward pass through the transformer block.

        This method handles the core computation of the transformer, including
        self-attention, optional cross-attention, and feed-forward operations.

        Args:
            hidden_states (Union[Tensor, WrappedTensor]): Input tensor of shape [s, b, h]
                where s is the sequence length, b is the batch size, and h is the hidden size.
                Can be passed as a WrappedTensor during inference to avoid an obsolete
                reference in the calling function.
            attention_mask (Tensor): Boolean tensor of shape [1, 1, s, s] for masking
                self-attention.
            context (Tensor, optional): Context tensor for cross-attention.
            context_mask (Tensor, optional): Mask for cross-attention context
            rotary_pos_emb (Tensor, optional): Rotary positional embeddings.
            attention_bias (Tensor): Bias tensor for Q * K.T of shape in shape broadcastable
                to [b, num_head, sq, skv], e.g. [1, 1, sq, skv].
                Used as an alternative to apply attention mask for TE cuDNN attention.
            inference_context (BaseInferenceContext, optional): Parameters for inference-time
                optimizations.
            packed_seq_params (PackedSeqParams, optional): Parameters for packed sequence
                processing.
            packed_seq_params_full (PackedSeqParams, optional): Parameters for packed sequence
                processing for full attention.

        Returns:
            Union[Tensor, Tuple[Tensor, Tensor]]: The output hidden states tensor of shape
            [s, b, h], and optionally the updated context tensor if cross-attention is used.
        T)inprequires_grad
keep_graphfull)	r,   r-   r.   r/   r0   r3   r5   use_inner_fp8_contextr7   N   )r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   )r   
isinstancer   unwrapr   input_tensorr   r   sequence_parallelr   get_cuda_rng_trackerforkr   fp8
fp8_reciper	   delayedr
   recompute_granularitytraining_checkpointed_forward	enumeratelayersfullatt_block_indexeslayer_numberoffload_contexttorchis_grad_enabledcpu_offloading#group_prefetch_offload_commit_asyncfinal_layernorm)r&   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r+   rng_contextuse_outer_fp8_contextr<   outer_fp8_contextl_nolayerpacked_seq_params_nowinner_fp8_contextr)   r)   r*   forwardE   s   
/
 

 0
z&Qwen25VLVisionTransformerBlock.forwardr<   c
                    s  dt dt f 	fdd}
fdd}jjdkrAd}|jk r?||
||jj \|jj7 }|jk s'S jjd	krd}tjD ]4}jjr[js[|d
7 }||krs|jj| k rs||
||d
 \qN|
||d
 \qNS td)z-Forward method with activation checkpointing.startendc                    s    fdd}|S )Nc           	         s   t D ]F}jjd ur|jjv r}n}|}r(tj|jd nt }| || |||| d |d\} }W d    n1 sFw   Y  q| |fS )Nr=   )r,   r-   r.   r/   r0   r3   r4   r5   )ranger   rL   
_get_layerr
   rM   r   )	r,   r-   r.   r/   r0   indexrY   rX   rZ   )r3   r]   r5   r7   r&   r\   r<   r)   r*   custom_forward   s.   
z\Qwen25VLVisionTransformerBlock._checkpointed_forward.<locals>.custom.<locals>.custom_forwardr)   )r\   r]   ra   )r3   r5   r7   r&   r<   )r]   r\   r*   custom   s   zDQwen25VLVisionTransformerBlock._checkpointed_forward.<locals>.customc              
      sF   j jrt| j jtjjt  	S t	| j j S )zMDetermines whether to use the `te_checkpoint` or `tensor_parallel.checkpoint`)
r   rD   r   distribute_saved_activationsr   randomrB   r   get_tensor_model_parallel_group
checkpoint)forward_func)r-   r.   r/   r,   r0   r&   r)   r*   checkpoint_handler   s*   zPQwen25VLVisionTransformerBlock._checkpointed_forward.<locals>.checkpoint_handleruniformr   blockr=   z$Invalid activation recompute method.)	intr   recompute_methodnum_layers_per_pipeline_rankrecompute_num_layersr^   rD   r9   
ValueError)r&   r,   r-   r.   r/   r0   r3   r5   r<   r7   rb   rh   	layer_idxrecompute_skip_num_layersr)   )
r3   r-   r.   r/   r,   r5   r7   r0   r&   r<   r*   rI      s0   



z4Qwen25VLVisionTransformerBlock._checkpointed_forward)TTTNN)
NNNNNNNNNN)__name__
__module____qualname____doc__r   r   r   r   boolr   r   rk   r%   r   r   r   r   r[   rI   __classcell__r)   r)   r'   r*   r   +   s    	

	

 	
r   c                       sD   e Zd Zddededdf fddZededejfd	d
Z	  Z
S )VisionRotaryEmbedding     @dimthetareturnNc                    s>   t    d|tjd|dtjd|   }| jd|dd d S )Ng      ?r      )dtypeinv_freqF)
persistent)r$   r%   rO   arangefloatregister_buffer)r&   rz   r{   r   r'   r)   r*   r%   B  s   
 zVisionRotaryEmbedding.__init__seqlenc                 C   s*   t j|| jj| jjd}t || j}|S )Ndevicer~   )rO   r   r   r   r~   outer)r&   r   seqfreqsr)   r)   r*   r[   G  s   zVisionRotaryEmbedding.forward)ry   )rr   rs   rt   rk   r   r%   r   rO   r   r[   rw   r)   r)   r'   r*   rx   ?  s    rx   c                       s   e Zd ZdZ								d!deded	ed
edededededededdf fddZde	j
ddfddZdd Zdd Z	d"de	j
de	j
dee	j
 de	j
fdd Z  ZS )#Qwen2VisionModelzQwen2-VL vision model.Fr=      r}   P  transformer_configtransformer_layer_specadd_class_tokenclass_token_len	patch_dimtemporal_patch_sizespatial_merge_sizespatial_patch_sizeimg_himg_wr|   Nc                    sp  t  j|d t|rt|t t| jd || _|j| _	|| _
|| _|| _|| _| j	|d  | _|	| _|
| _d| _| j| j
 dksDJ | j| j
 dksNJ | j| j
 | _| j| j
 | _| j| j | _|| _|| _| j| jrq| jnd | _|||g}tjj| j| j	||dd| _|j|j }t|d | _|| _| jrtjtd| j| j	| _ t!j"| _#t$||d	d	d
| _%d S N)r   )prefixr}      r   F)in_channelsout_channelskernel_sizestridebiasr=   T)r   r   r   r    )&r$   r%   r   r   localstyperr   r   	embed_dimvisual_hidden_sizer   r   r   r   merge_hidden_sizer   r   r   num_patches_per_dim_hnum_patches_per_dim_wnum_patchesr   
seq_lengthrO   nnConv3dconv1num_attention_headsrx   r0   	Parameterrandnclass_tokenr   encoder_or_decoder
model_typer   decoder)r&   r   r   r   r   r   r   r   r   r   r   r   head_dimr'   r)   r*   r%   R  sR   
zQwen2VisionModel.__init__r@   c                 C      | j | dS zSets input tensor to the model.

        Args:
            input_tensor (Tensor): Sets the input tensor for the model.
        Nr   set_input_tensorr&   r@   r)   r)   r*   r        z!Qwen2VisionModel.set_input_tensorc                 C     g }|D ]e\}}}t |dd|}||| j | j|| j | j}|dddd}| }t |d|d}||| j | j|| j | j}|dddd}| }|t j	||gdd
|d qt j|dd}|d d dd f  }| |}	|	| d}
|
S Nr=   r   r}   r   rz   rO   r   	unsqueezeexpandreshaper   permuteflattenappendstackrepeatcatmaxr0   r&   grid_thwpos_idsthwhpos_idswpos_idsmax_grid_sizerotary_pos_emb_fullr0   r)   r)   r*   rot_pos_emb  s4   "
zQwen2VisionModel.rot_pos_embc                 C   s   ddl m} t|d d df |d d df  |d d df jdtjd}tj|ddd}| }|dd  |d d  	 
 }|||||d	d
S Nr   r   r=   r}   )rz   r~   )r=   r   )valuer   thd)cu_seqlens_qcu_seqlens_kvmax_seqlen_qmax_seqlen_kv
qkv_formatmegatron.core.packed_seq_paramsr   rO   repeat_interleavecumsumint32Fpadsqueezer   item)r&   r   r   
cu_seqlens
max_seqlenr)   r)   r*   get_packed_seq_params  s   4 z&Qwen2VisionModel.get_packed_seq_paramsxr   r-   c                 C   s   | d| j| j| j| j}| | d| j}|d}| |}tj	||fdd}|ddddddf }| 
|}| j||||d}|d d| j}|S )a  Forward function of the Qwen2 Vision Model. This function passes the input tensors
        through the embedding layer and then the transformer.

        Args:
            x (torch.Tensor): input data of shape [batch, img_h, img_w]
            grid_thw (torch.Tensor): The temporal, height and width of feature shape of each image/frame.
            attention_mask (torch.Tensor with dtype=bool): Attention mask to use.

        Returns:
            x (torch.Tensor): output after final transformer block.
        r   r=   r   N)r0   r5   )viewr   r   r   r   r   r   r   rO   r   r   r   r   r   )r&   r   r   r-   r0   r5   r)   r)   r*   r[     s   


zQwen2VisionModel.forward)Fr=   r   r}   r}   r   r   r   r#   )rr   rs   rt   ru   r   r   rv   rk   r%   rO   r   r   r   r   r   r[   rw   r)   r)   r'   r*   r   O  s\    	
Fr   c                       s   e Zd ZdZ									d&ded	ed
edededededededededdf fddZde	j
ddfddZedd Ze	d'dee	j
 dee	j
 fddZed d! Z	d'd"e	j
de	j
d#ee	j
 de	j
fd$d%Z  ZS )(Qwen25VisionModelzQwen2.5-VL vision model.Fr=   r   r}   r   p   r   r   r   r   r   r   r   r   r   r   window_sizer|   Nc                    s  t  j|d t|rt|t t| jd || _|j| _	|| _
|| _|| _|| _|| _| j	|d  | _| j| j | _|	| _|
| _d| _| j| j
 dksNJ | j| j
 dksXJ | j| j
 | _| j| j
 | _| j| j | _|| _|| _| j| jr{| jnd | _|||g}tjj| j| j	||dd| _|j|j }t|d | _|| _| jrtj t!d| j| j	| _"t#j$| _%t&||d	d	d
| _'d | _(d S r   ))r$   r%   r   r   r   r   rr   r   r   r   r   r   r   r   r   r   spatial_merge_unitr   r   r   r   r   r   r   r   rO   r   r   r   r   rx   r0   r   r   r   r   r   r   r   r   window_index)r&   r   r   r   r   r   r   r   r   r   r   r   r   r   r'   r)   r*   r%     sX   

zQwen25VisionModel.__init__r@   c                 C   r   r   r   r   r)   r)   r*   r   E  r   z"Qwen25VisionModel.set_input_tensorc                 C   r   r   r   r   r)   r)   r*   r   M  s4   "
zQwen25VisionModel.rot_pos_embr   r   c                 C   s   ddl m} |d ur9t|d d df |d d df  |d d df }|jdtjd}tj|ddd}| }n| }|dd  |d d  }|	 
 }|||||d	d
S r   r   )r&   r   r   r   seqlensr   r)   r)   r*   r   l  s    4
z'Qwen25VisionModel.get_packed_seq_paramsc                 C   sj  g }dg}d}| j | j | j }|D ]\}}}|| j || j }	}
t||	 |
 ||	|
}||	|  }||
|  }|	| | }|
| | }t|d|d|fdd}||||||}|ddddd||| ||}|dk	ddgd}|d}||dk }|
||  |d| j |d  }||  |||	 |
  7 }qtj|dd	}||fS )
Nr   constantir=   r   r}      r   r   )r   r   r   rO   r   r   r   r   r   sumr   r   r   extendtolistr   r   )r&   r   r   cu_window_seqlenswindow_index_idvit_merger_window_sizegrid_tgrid_hgrid_w
llm_grid_h
llm_grid_wr`   pad_hpad_wnum_windows_hnum_windows_windex_paddedr   	index_newcu_seqlens_tmpr)   r)   r*   get_window_index  sH   
z"Qwen25VisionModel.get_window_indexr   r-   c                 C   s\  | d| j| j| j| j}| | d| j}| |\}}tj||j	tj
d}t|}| \}}||| j | jd}||ddddf }||d}|d}| |}||| j | jd}||ddddf }||d}tj||fdd}|ddddddf }| |}	| d|}
| j||||
|	d}|d d| j}|| _|S )a  Forward function of the Qwen2.5 Vision Model. This function passes the input tensors
        through the embedding layer and then the transformer.

        Args:
            x (torch.Tensor): input data of shape [batch, img_h, img_w]
            grid_thw (torch.Tensor): The temporal, height and width of feature shape of each image/frame.
            attention_mask (torch.Tensor with dtype=bool): Attention mask to use.

        Returns:
            x (torch.Tensor): output after final transformer block.
        r   r   Nr=   r   )r0   r5   r7   )r   r   r   r   r   r   r  rO   tensorr   r   unique_consecutivesizer   r   r   r   r   r   r   r   r   r   )r&   r   r   r-   r   r   seq_len_r0   r7   r5   r)   r)   r*   r[     s@   



zQwen25VisionModel.forward)	Fr=   r   r}   r}   r   r   r   r   r#   )rr   rs   rt   ru   r   r   rv   rk   r%   rO   r   r   r   r   r   r   r  r[   rw   r)   r)   r'   r*   r     st    	
K

+r   )5
contextlibr   typingr   r   rO   torch.nn.functionalr   
functionalr   megatron.corer   r   megatron.core.config_loggerr   r   megatron.core.enumsr	   megatron.core.fp8_utilsr
    megatron.core.inference.contextsr   megatron.core.jitr   7megatron.core.models.common.vision_module.vision_moduler   r   r   #megatron.core.process_groups_configr   megatron.core.transformer.enumsr   $megatron.core.transformer.spec_utilsr   +megatron.core.transformer.transformer_blockr   r   ,megatron.core.transformer.transformer_configr   megatron.core.utilsr   r   r   r   +megatron.core.extensions.transformer_enginer   HAVE_TEImportErrorr   Modulerx   r   r   r)   r)   r)   r*   <module>   s@      )