o
    
۾iS                  
   @   s  d dl mZ d dlmZ d dlmZ d dlZd dlmZ d dl	m  m
Z d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZmZmZ d dlmZ d dlmZ d dlm Z  d dl!m"Z" dd Z#dej$dej$dej$de%ej$ej$f fddZ&G dd dej'Z(G dd dej'Z)G dd dej'Z*G dd dej'Z+G dd  d ej'Z,G d!d" d"ej'Z-	#d+d$ej$d%ej$d&e.e/e/f de.ej$ fd'd(Z0G d)d* d*eZ1dS ),    )Sequence)deepcopy)cached_propertyN)ACT2FN)PreTrainedModel)divide$get_tensor_model_parallel_world_size)MMEncoderAttention)Conv2dLayer)ColumnParallelLinearQKVParallelLinearRowParallelLinear)maybe_prefix)is_vit_use_data_parallel)current_platform)MoonViTConfigc                 C   s   | j |j d ksJ | j|jf| jd d |jd d ks&J | j|jf| jd d|jd  ks:J | j|jf|jtjksEJ |jd S )N      )ndimshapedtypetorch	complex64)x	freqs_cis r   V/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/moonvit.py_apply_rope_input_validationE   s    ,(r   xqxkr   returnc                 C   s   t | | t || |d}t|  jg | jdd ddR  }t| jg | jdd ddR  }t|| d}t|| d}|	| |	|fS )a  
    Args: (The leading dimensions of all inputs should be the same)
        xq: query, tensor of shape (..., num_heads, head_dim)
        xk: key, tensor of shape (..., num_heads, head_dim)
        freqs_cis: tensor of shape (..., head_dim/2), dtype=torch.complex64. It contains the precomputed cis(freqs) for each position in the 2D grid.
    Returns:
        xq_out, xk_out: tensors of shape (..., num_heads, head_dim)
    r   Nr   r   )
r   	unsqueezer   view_as_complexfloatviewr   view_as_realflattentype_as)r    r!   r   xq_xk_xq_outxk_outr   r   r   
apply_ropeL   s   


,,r.   c                       sZ   e Zd Z	ddededededdf
 fdd	Zd
d ZdejdejdejfddZ	  Z
S )Learnable2DInterpPosEmbbicubicheightwidthdiminterpolation_moder"   Nc                    s>   t    || _|| _|| _tt|||| _	| 
  d S N)super__init__r1   r2   r4   nn	Parameterr   emptyweightreset_parameters)selfr1   r2   r3   r4   	__class__r   r   r7   d   s   
z Learnable2DInterpPosEmb.__init__c                 C   s   t j| j d S r5   )r8   initnormal_r;   r=   r   r   r   r<   n   s   z(Learnable2DInterpPosEmb.reset_parametersr   grid_hwsc                 C   s   g }|  D ]4}|| jjd d kr|| jjdd q|tj| jdd|| j	d
ddjdd q|t| }|S )Nr   r   )end_dim)r   r   r   r   )sizemode)r   r   r   )tolistr;   r   appendr(   Finterpolatepermuter#   r4   squeezer   cat)r=   r   rC   pos_embsr   outr   r   r   forwardq   s"   
zLearnable2DInterpPosEmb.forward)r0   )__name__
__module____qualname__intstrr7   r<   r   TensorrP   __classcell__r   r   r>   r   r/   c   s    
$r/   c                       sd   e Zd Z				ddededeeeef B dedef
 fd	d
ZdejdejdejfddZ  Z	S )MoonVisionPatchEmbed      r[   r[   out_dimin_dim
patch_sizepos_emb_heightpos_emb_widthc                    s   t    t|ttfsJ dt| t|tr||f}t|dks+J d| || _t||||d| _	t
|||d| _d S )NzInvalid patch_size type: r   z,Expected patch_size to be a tuple of 2, got )kernel_sizestride)r1   r2   r3   )r6   r7   
isinstancerT   r   typelenr^   r
   projr/   pos_emb)r=   r\   r]   r^   r_   r`   r>   r   r   r7      s    

zMoonVisionPatchEmbed.__init__r   grid_hwr"   c                 C   s(   |  ||dd}| ||}|S )z
        Args:
            x (L, Channels): input tensor
            grid_hw (N, 2): grid height and width

        Returns:
            (L, Cout) tensor
        r   r   )rf   r&   rE   rg   )r=   r   rh   r   r   r   rP      s   	zMoonVisionPatchEmbed.forward)rY   rZ   r[   r[   )
rQ   rR   rS   rT   tupler7   r   rV   rP   rW   r   r   r>   r   rX      s"    $rX   c                       s   e Zd ZdZdejfdededef fddZdd	 Ze	d
e
jfddZde
jd
e
jfddZde
jde
jd
e
jfddZ  ZS )Rope2DPosEmbaR  2D rotary position embedding with multi-resolution support.

    This class is intended to be used in the following way:
    1. Before training, create an instance of Rope2DPosEmb. This instance will hold the precomputed cis.
    2. Before each forward pass, call `get_freqs_cis_by_*` to get the `freqs_cis` tensor for this iteration.
    3. During the forward pass, pass the `freqs_cis` tensor to each attention layer, and call `apply` just before each attention operation.
        The rope is shared across all attention layers and all heads.

    Refs:
    - RoFormer: https://arxiv.org/abs/2104.09864
    - VisionLLaMA: https://arxiv.org/abs/2403.00522
    - https://github.com/Meituan-AutoML/VisionLLaMA/blob/main/dit/models.py

    Args:
        dim (int): usually the multi-head attention dimension, should be divisible by 4 (TODO: relax this constraint if needed)
        max_height (int): the maximum height of the 2D grid
        max_width (int): the maximum width of the 2D grid
        theta_base (float): the base of the theta
        device (str): the device to store the precomputed cis
    i'  r3   
max_height	max_widthc                    sB   t    || _| jd dksJ d|| _|| _|| _|| _d S )N   r   zdim must be divisible by 4)r6   r7   r3   rk   rl   
theta_basedevice)r=   r3   rk   rl   rn   ro   r>   r   r   r7      s   

zRope2DPosEmb.__init__c                 C   s$   d| j  d| j d| j d| j S )Nzdim=z, max_height=z, max_width=z, theta_base=)r3   rk   rl   rn   rB   r   r   r   
extra_repr   s   $zRope2DPosEmb.extra_reprr"   c                 C   s   | j | j }td| | j}|| j }|| j }td| jdd| jd   | j}d| j|| j   }t	|| }t	|| }t
t||}	t
t||}
tj|	jdd|
jddgdd}|| j | jd}|S )a  Calculate the cis(freqs) for each position in the 2D grid.

        Return: complex tensor of shape (max_height, max_width, dim//2) and value:
            height axis: ret[h, w, 2*i] = cis(h * theta_base**(-4*i/dim))
            weight axis: ret[h, w, 2*i+1] = cis(w * theta_base**(-4*i/dim))   with (i in [0, dim//4))
            note: `cis` is a mathematical notation defined by cis x = cos x + i sin x,
        r   rm   Ng      ?r   r3   )rk   rl   r   aranger%   toro   r3   rn   outerpolar	ones_likerM   r#   reshape)r=   Nflat_posx_posy_pos	dim_rangefreqsx_freqsy_freqsx_cisy_cisr   r   r   r   precomputed_freqs_cis   s    	

(z"Rope2DPosEmb.precomputed_freqs_cisrC   c                    sP   |  }t fdd|D sJ | j jftj fdd|D dd}|S )z
        Args:
            grid_hws (torch.Tensor): containing list of (height, width) or (t, height, width) tuples.
        Returns:
            freqs_cis: tensor of shape (sum(t * height * width), dim//2)
        c                 3   sH    | ]\}}d |  ko j kn  od |  ko jkn  V  qdS )r   N)rk   rl   .0hwrB   r   r   	<genexpr>   s    8
z8Rope2DPosEmb.get_freqs_cis_by_seqlens.<locals>.<genexpr>c                    s4   g | ]\}} j d |d |f d jd qS )Nr   r   )r   rw   r3   r   rB   r   r   
<listcomp>  s    "z9Rope2DPosEmb.get_freqs_cis_by_seqlens.<locals>.<listcomp>r   rq   )rG   allrk   rl   r   rM   )r=   rC   shapesr   r   rB   r   get_freqs_cis_by_seqlens   s    

z%Rope2DPosEmb.get_freqs_cis_by_seqlenspos_idxpos_idx_maskc                 C   s   |j dd |j kr|j d dkr|j|jd ks!J |j |j f|jtjks,J |j|j | jd f }tj|tj| jd}| j	|d | |d | f ||< |S )a  
        Args:
            pos_idx: tensor of shape (..., 2), It contains the (h, w) position indices of each 2D token.
            pos_idx_mask: a mask of shape (...), the leading dimensions should be the same as pos_idx.
                Rope will only be applied to the tokens with True mask. `freqs_cis` for the tokens with False mask with be ones.
        Return:
            freqs_cis: tensor of shape (..., dim//2)
        Nr   r   r   )r   ro   ).r   ).r   )
r   r   r   r   boolr3   onesr   ro   r   )r=   r   r   shpr   r   r   r   get_freqs_cis_by_idx  s   

z!Rope2DPosEmb.get_freqs_cis_by_idx)rQ   rR   rS   __doc__r   device_typerT   r7   rp   r   r   rV   r   r   r   rW   r   r   r>   r   rj      s,    rj   c                       sN   e Zd ZdZ		ddee dedef fddZd	e	j
d
e	j
fddZ  ZS )MLP2zn
    Args:
        dims: [in_dim, hidden_dim, out_dim]
        bias: whether to use bias in linear layer.
    T dimsbiasprefixc                    st   t    t|dksJ t | _t|d |d |t|d| jd| _t|d |d |t|d| jd| _	|| _
d S )NrY   r   r   fc0r   r   
disable_tpr   fc1)r6   r7   re   r   use_data_parallelr   r   r   r   r   
activation)r=   r   r   r   r   r>   r   r   r7   1  s$   

zMLP2.__init__r   r"   c                 C   s*   |  |\}}| |}| |\}}|S r5   )r   r   r   )r=   r   _r   r   r   rP   K  s   
zMLP2.forward)Tr   )rQ   rR   rS   r   listrT   r   rU   r7   r   rV   rP   rW   r   r   r>   r   r   *  s    
r   c                       s   e Zd Z	dejdddededededef
 fd	d
Z	dde	j
de	j
de	j
dB fddZ	dde	j
de	j
de	j
dB de	j
fddZ  ZS )MoonVitEncoderLayerr   F)r   	attn_bias	num_heads
hidden_dimmlp_dimr   r   c             	      s   t    t | _|| _|| _| j| j | _| jrdnt | _t	|| j| _
t|| _t|| _t|||g|| dd| _t|| j|||| d| jd| _t|||| d| jd| _t| j
| j| jd | d	d
| _d S )Nr   z.mlp)r   z.wqkv)hidden_size	head_sizetotal_num_headstotal_num_kv_headsr   r   r   z.wor   g      z.attn)r   r   scaler   )r6   r7   r   r   r   r   hidden_size_per_attention_headr   tp_sizer   !num_attention_heads_per_partitionr8   	LayerNormnorm0norm1r   mlpr   wqkvr   wor	   attn)r=   r   r   r   r   r   r   r>   r   r   r7   S  sJ   

	zMoonVitEncoderLayer.__init__Nr   
cu_seqlensrope_freqs_cisc                 C   s   | d}| |\}}|  dd d| j| jf }|j| }tj|dd\}}	}
t||	|\}}	|dd |dd   }| j	|
d|	
d|

d||d}||| j| j }| |\}}|S )	zq
        Args:
            x (torch.Tensor): (seqlen, hidden_dim)
            cu_seqlens (torch.Tensor):
        r   Nr   rY   rq   r   )r   
max_seqlen)rE   r   r   r   r&   r   unbindr.   maxr   r#   rw   r   )r=   r   r   r   
seq_lengthxqkvr   	qkv_shaper    r!   xvr   attn_outr   r   r   attention_qkvpacked  s4   

z'MoonVitEncoderLayer.attention_qkvpackedhidden_statesr"   c                 C   sF   |}|  |}| j|||d}|| }|}| | |}|| }|S )a  
        Args:
            hidden_states: non-packed (B, N, D) or packed (L, D). if non-packed, seqlens should be None, if packed, seqlens should be set

        Returns:
            output: same shape of input, non-packed (B, N, D) for non-packed input, (L, D) for packed input
        r   )r   r   r   r   )r=   r   r   r   residualr   r   r   r   rP     s   
zMoonVitEncoderLayer.forwardr   r5   )rQ   rR   rS   rI   gelurT   rU   r   r7   r   rV   r   rP   rW   r   r   r>   r   r   R  sD    7
-r   c                       sR   e Zd Z	ddededededdf
 fdd	Zd
ejdejdejfddZ	  Z
S )MoonVitEncoderr   r   
num_layers	block_cfgr   r"   Nc                    sT   t    t d  d  dd| _t fddt|D | _t|| _	d S )Nr   r   i   c                    s(   g | ]}t dd  d| i qS )r   z.blocks.r   )r   )r   	layer_idxr   r   r   r   r     s    z+MoonVitEncoder.__init__.<locals>.<listcomp>)
r6   r7   rj   rope_2dr8   
ModuleListrangeblocksr   final_layernorm)r=   r   r   r   r   r>   r   r   r7     s   
	zMoonVitEncoder.__init__r   rh   c                 C   s   | j j|d}ttjd|j|jd|d d df |d d df  |jf}|jdtj	d}t
| jD ]\}}||||d}q5| |}|S )N)rC   r   )ro   r   r   )r3   r   r   )r   r   r   rM   zerosro   r   rs   cumsumint32	enumerater   r   )r=   r   rh   r   lengthsr   r   blockr   r   r   rP     s   &
zMoonVitEncoder.forwardr   )rQ   rR   rS   rT   dictrU   r7   r   rV   rP   rW   r   r   r>   r   r     s(    r   r   r   r   rh   merge_kernel_sizec                 C   s   |  d}g }d}| D ]L}|d |d }}| ||||   }	|\}
}||
 || }}|	||
|||}|ddddd }||| |
| d}|| ||| 7 }q|S )Nr   r   r   r   rY   rm   )rE   rG   r&   rK   
contiguousrH   )r   rh   r   d_modeloutputspre_sumx_shaper1   r2   seqkernel_heightkernel_width
new_height	new_widthreshaped_seq
padded_seqr   r   r   patch_merger  s$   


r   c                       s\   e Zd ZeZdZdgZdZdZ	ddede	f fddZ
d	ejd
ejdejfddZ  ZS )MoonVitPretrainedModelmoonvitPackingTransformerTr   configr   c              	      s   t  j|g|R i | t|}|j| _|j| _|j| _d| _t|j|j|j|j	d| _
t|j|j|j|j|jtd dd| dd| _d S )Nr   )r\   r^   r_   r`   gelu_pytorch_tanhT)r   r   r   r   r   z.encoder)r   r   r   r   )r6   r7   r   r   r   r^   vit_processing_typerX   init_pos_emb_heightinit_pos_emb_widthpatch_embedr   num_hidden_layersnum_attention_headsintermediate_sizer   encoder)r=   r   r   inputskwargsr>   r   r   r7     s.   zMoonVitPretrainedModel.__init__pixel_valuesrh   r"   c                 C   s,   |  ||}| ||}t||| jd}|S )z
        Args:
            pixel_values (torch.Tensor): The input pixel values.
            grid_hw (torch.Tensor): The grid height and width.

        Returns:
            torch.Tensor: The output tokens.
        )r   )r   r   r   r   )r=   r   rh   r   r   r   r   rP   =  s   zMoonVitPretrainedModel.forwardr   )rQ   rR   rS   r   config_class
model_type_no_split_modules_supports_flash_attn_2_supports_sdparU   r7   r   rV   rP   rW   r   r   r>   r   r     s&    !r   )r   )2collections.abcr   copyr   	functoolsr   r   torch.nnr8   torch.nn.functional
functionalrI   transformers.activationsr   transformers.modeling_utilsr   vllm.distributedr   r   $vllm.model_executor.layers.attentionr	   vllm.model_executor.layers.convr
   !vllm.model_executor.layers.linearr   r   r    vllm.model_executor.models.utilsr   !vllm.model_executor.models.visionr   vllm.platformsr   'vllm.transformers_utils.configs.moonvitr   r   rV   ri   r.   Moduler/   rX   rj   r   r   r   r   rT   r   r   r   r   r   r   <module>   sV   ,
"+z(w2

