o
    }oi!                     @   s   d dl mZ d dlZd dlm  mZ d dlmZm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZ G d	d
 d
ejjZG dd deZdS )    )OptionalN)has_config_logger_enabledlog_config_to_disk)VisionModule)	ModelType)
ModuleSpec)TransformerBlock)TransformerConfigc                       s@   e Zd Zddededdf fddZdedejfd	d
Z  Z	S )VisionRotaryEmbedding     @dimthetareturnNc                    s>   t    d|tjd|dtjd|   }| jd|dd d S )Ng      ?r      )dtypeinv_freqF)
persistent)super__init__torcharangefloatregister_buffer)selfr   r   r   	__class__ ]/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/vlm/qwen2vl/model/vision.pyr      s   
 zVisionRotaryEmbedding.__init__seqlenc                 C   s*   t j|| jj| jjd}t || j}|S )N)devicer   )r   r   r   r   r   outer)r   r   seqfreqsr   r   r   forward$   s   zVisionRotaryEmbedding.forward)r   )
__name__
__module____qualname__intr   r   r   Tensorr#   __classcell__r   r   r   r   r
      s    r
   c                       s   e Zd ZdZ								d!deded	ed
edededededededdf fddZde	j
ddfddZdd Zdd Z	d"de	j
de	j
dee	j
 de	j
fdd Z  ZS )#Qwen2VisionModelzQwen2-VL vision model.F      r   P  transformer_configtransformer_layer_specadd_class_tokenclass_token_len	patch_dimtemporal_patch_sizespatial_merge_sizespatial_patch_sizeimg_himg_wr   Nc                    sp  t  j|d t|rt|t t| jd || _|j| _	|| _
|| _|| _|| _| j	|d  | _|	| _|
| _d| _| j| j
 dksDJ | j| j
 dksNJ | j| j
 | _| j| j
 | _| j| j | _|| _|| _| j| jrq| jnd | _|||g}tjj| j| j	||dd| _|j|j }t|d | _|| _| jrtjtd| j| j	| _ t!j"| _#t$||d	d	d
| _%d S )N)config)prefixr      r   F)in_channelsout_channelskernel_sizestridebiasr+   T)r8   specpre_processpost_process)&r   r   r   r   localstyper$   r1   	embed_dimvisual_hidden_sizer2   r3   r4   r5   merge_hidden_sizer6   r7   r;   num_patches_per_dim_hnum_patches_per_dim_wnum_patchesr0   
seq_lengthr   nnConv3dconv1num_attention_headsr
   rotary_pos_emb	Parameterrandnclass_tokenr   encoder_or_decoder
model_typer   decoder)r   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r=   head_dimr   r   r   r   .   sR   
zQwen2VisionModel.__init__input_tensorc                 C   s   | j | dS )zSets input tensor to the model.

        Args:
            input_tensor (Tensor): Sets the input tensor for the model.
        N)rV   set_input_tensor)r   rX   r   r   r   rY   t   s   z!Qwen2VisionModel.set_input_tensorc                 C   s  g }|D ]e\}}}t |dd|}||| j | j|| j | j}|dddd}| }t |d|d}||| j | j|| j | j}|dddd}| }|t j	||gdd
|d qt j|dd}|d d dd f  }| |}	|	| d}
|
S )Nr+   r   r   r:   r   )r   r   	unsqueezeexpandreshaper4   permuteflattenappendstackrepeatcatmaxrP   )r   grid_thwpos_idsthwhpos_idswpos_idsmax_grid_sizerotary_pos_emb_fullrP   r   r   r   rot_pos_emb|   s4   "
zQwen2VisionModel.rot_pos_embc                 C   s   ddl m} t|d d df |d d df  |d d df jdtjd}tj|ddd}| }|dd  |d d  	 
 }|||||d	d
S )Nr   )PackedSeqParamsr+   r   )r   r   )r+   r   )valuerZ   thd)cu_seqlens_qcu_seqlens_kvmax_seqlen_qmax_seqlen_kv
qkv_format)megatron.core.packed_seq_paramsrp   r   repeat_interleavecumsumint32Fpadsqueezere   item)r   rf   rp   
cu_seqlens
max_seqlenr   r   r   get_packed_seq_params   s   4 z&Qwen2VisionModel.get_packed_seq_paramsxrf   attention_maskc                 C   s   | d| j| j| j| j}| | d| j}|d}| |}tj	||fdd}|ddddddf }| 
|}| j||||d}|d d| j}|S )a  Forward function of the Qwen2 Vision Model. This function passes the input tensors
        through the embedding layer and then the transformer.

        Args:
            x (torch.Tensor): input data of shape [batch, img_h, img_w]
            grid_thw (torch.Tensor): The temporal, height and width of feature shape of each image/frame.
            attention_mask (torch.Tensor with dtype=bool): Attention mask to use.

        Returns:
            x (torch.Tensor): output after final transformer block.
        rZ   r+   r[   N)rP   packed_seq_params)viewr;   r3   r2   rN   rF   r\   ro   r   rd   r   rV   r~   rG   )r   r   rf   r   rP   r   r   r   r   r#      s   


zQwen2VisionModel.forward)Fr+   r,   r   r   r,   r-   r-   )N)r$   r%   r&   __doc__r	   r   boolr'   r   r   r(   rY   ro   r   r   r#   r)   r   r   r   r   r*   +   s\    	
Fr*   )typingr   r   torch.nn.functionalrL   
functionalr|   megatron.core.config_loggerr   r   7megatron.core.models.common.vision_module.vision_moduler   megatron.core.transformer.enumsr   $megatron.core.transformer.spec_utilsr   +megatron.core.transformer.transformer_blockr   ,megatron.core.transformer.transformer_configr	   Moduler
   r*   r   r   r   r   <module>   s   