o
    پi]                     @   s   d dl Z d dlmZ d dlmZ d dlmZmZmZm	Z	m
Z
 d dlZd dlmZ d dlm  mZ d dlmZ d dlmZ zd dlmZ W n eyP   dZY nw d dlmZ 		d1d	ejd
ejdejdeej deej f
ddZ		d1d	ejd
ejdejdeej deej dejfddZeedZdd Zdejdejdejdeejejf fddZ G dd dej!Z"G dd dej!Z#G dd  d ej!Z$G d!d" d"ej!Z%G d#d$ d$ej!Z&G d%d& d&ej!Z'	'd2d(ejd)ejd*e(e)e)f deej fd+d,Z*G d-d. d.ej!Z+G d/d0 d0eZ,dS )3    N)deepcopy)cached_property)ListOptionalSequenceTupleUnion)ACT2FN)PreTrainedModel)flash_attn_varlen_func)MoonViTConfigqkvq_cu_seqlensk_cu_seqlensc              
   C   s$  t du rtd|  |   kr |   kr dks%J d J d|d | jd ks2J d|d |jd   krG|jd ksLJ d J d| jtjtjfv s^J d	| j d
|dd |dd   	 }|dd |dd   	 }t | ||||||dd}|j
dd}|S )a  Multi-head attention using flash attention 2.
    This function is used to handle the case where the query, key, and value are packed.
    Args:
        q, k, v: tensor of shape (tot_seqlens, num_heads, head_dim).
        q_cu_seqlens (torch.Tensor): cumulative sequence lengths of q.
            The first element should be 0 and the last element should be q.shape[0].
        k_cu_seqlens (torch.Tensor): cumulative sequence lengths of k.
            The first element should be 0 and the last element should be k.shape[0].

    Returns:
        output: shape (batch_size, seqlen, dim) or (tot_seqlens, dim) if packing,
            where dim = num_heads * head_dim
    NzWflash_attn is not installed, this function needs flash_attn_varlen_func from flash_attn   q, k, v must have 3 dimsr   #q_cu_seqlens must sum to q.shape[0]z#k_cu_seqlens must sum to k.shape[0]zunsupported dtype z for multihead attn   F)causal)	start_dim)r   ImportErrordimshapedtypetorchbfloat16float16maxitemflatten)r   r   r   r   r   max_seqlen_qmax_seqlen_kattn_out r'   U/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/kimi_vl_moonvit.pymultihead_attention?   s>   :$  
r)   returnc           	      C   s  |   |    kr|    krdksJ d J d|d | jd ks*J d| jd }tjd||g| jtjd}tdt|D ]}d|d	||d  || ||d  || f< qC| dd} |dd}|dd}t	j
| |||d
d}|dd}||d}|S )a  Multi-head attention using torch scaled dot product attention.
    This function is used to handle the case where the query, key, and value are packed.
    Args:
        q, k, v: tensor of shape (tot_seqlens, num_heads, head_dim).
        q_cu_seqlens (torch.Tensor): cumulative sequence lengths of q.
            The first element should be 0 and the last element should be q.shape[0].
        k_cu_seqlens (torch.Tensor): cumulative sequence lengths of k.
            The first element should be 0 and the last element should be k.shape[0].

    Returns:
        output: shape (batch_size, seqlen, dim) or (tot_seqlens, dim) if packing,
            where dim = num_heads * head_dim
    r   r   r   r   r   r   devicer   T.g        )	dropout_p)r   r   r   zerosr,   boolrangelen	transposeFscaled_dot_product_attentionreshape)	r   r   r   r   r   
seq_lengthattention_maskiattn_outputr'   r'   r(   sdpa_attentions   s*   :
r:   )flash_attention_2sdpac                 C   s   | j |j d ksJ | j|jf| jd d |jd d ks&J | j|jf| jd d|jd  ks:J | j|jf|jtjksEJ |jd S )Nr   r   r      )ndimr   r   r   	complex64)x	freqs_cisr'   r'   r(   _apply_rope_input_validation   s    ,(rB   xqxkrA   c                 C   s   t | | t || |d}t|  jg | jdd ddR  }t| jg | jdd ddR  }t|| d}t|| d}|	| |	|fS )a  
    Args: (The leading dimensions of all inputs should be the same)
        xq: query, tensor of shape (..., num_heads, head_dim)
        xk: key, tensor of shape (..., num_heads, head_dim)
        freqs_cis: tensor of shape (..., head_dim/2), dtype=torch.complex64. It contains the precomputed cis(freqs) for each position in the 2D grid.
    Returns:
        xq_out, xk_out: tensors of shape (..., num_heads, head_dim)
    r   Nr   r=   )
rB   	unsqueezer   view_as_complexfloatviewr   view_as_realr#   type_as)rC   rD   rA   xq_xk_xq_outxk_outr'   r'   r(   
apply_rope   s   


,,rO   c                       sZ   e Zd Z	ddededededdf
 fdd	Zd
d ZdejdejdejfddZ	  Z
S )Learnable2DInterpPosEmbbicubicheightwidthr   interpolation_moder*   Nc                    s>   t    || _|| _|| _tt|||| _	| 
  d S N)super__init__rR   rS   rT   nn	Parameterr   emptyweightreset_parameters)selfrR   rS   r   rT   	__class__r'   r(   rW      s   
z Learnable2DInterpPosEmb.__init__c                 C   s   t j| j d S rU   )rX   initnormal_r[   r]   r'   r'   r(   r\      s   z(Learnable2DInterpPosEmb.reset_parametersr@   grid_hwsc                 C   s   g }|  D ]4}|| jjd d kr|| jjdd q|tj| jdd|| j	d
ddjdd q|t| }|S )Nr   r   )end_dim)r=   r   r   r   )sizemode)r   r=   r   )tolistr[   r   appendr#   r3   interpolatepermuterE   rT   squeezer   cat)r]   r@   rc   pos_embsr   outr'   r'   r(   forward   s"   
zLearnable2DInterpPosEmb.forward)rQ   )__name__
__module____qualname__intstrrW   r\   r   Tensorro   __classcell__r'   r'   r^   r(   rP      s    
$rP   c                       sh   e Zd Z				ddededeeeeef f dedef
 fd	d
ZdejdejdejfddZ	  Z
S )MoonVisionPatchEmbedr      ry   ry   out_dimin_dim
patch_sizepos_emb_heightpos_emb_widthc                    s   t    t|ttfsJ dt| t|tr||f}t|dks+J d| || _tj	||||d| _
t|||d| _d S )NzInvalid patch_size type: r=   z,Expected patch_size to be a tuple of 2, got )kernel_sizestride)rR   rS   r   )rV   rW   
isinstancers   r   typer1   r|   rX   Conv2dprojrP   pos_emb)r]   rz   r{   r|   r}   r~   r^   r'   r(   rW      s$   

zMoonVisionPatchEmbed.__init__r@   grid_hwr*   c                 C   s(   |  ||dd}| ||}|S )z
        Args:
            x (L, Channels): input tensor
            grid_hw (N, 2): grid height and width

        Returns:
            (L, Cout) tensor
        r   r   )r   rH   re   r   )r]   r@   r   r'   r'   r(   ro     s   	zMoonVisionPatchEmbed.forward)r   rx   ry   ry   )rp   rq   rr   rs   r   r   rW   r   ru   ro   rv   r'   r'   r^   r(   rw      s"    $rw   c                       s   e Zd ZdZ	ddededef fddZd	d
 Zedej	fddZ
dej	dej	fddZdej	dej	dej	fddZ  ZS )Rope2DPosEmbaR  2D rotary position embedding with multi-resolution support.

    This class is intended to be used in the following way:
    1. Before training, create an instance of Rope2DPosEmb. This instance will hold the precomputed cis.
    2. Before each forward pass, call `get_freqs_cis_by_*` to get the `freqs_cis` tensor for this iteration.
    3. During the forward pass, pass the `freqs_cis` tensor to each attention layer, and call `apply` just before each attention operation.
        The rope is shared across all attention layers and all heads.

    Refs:
    - RoFormer: https://arxiv.org/abs/2104.09864
    - VisionLLaMA: https://arxiv.org/abs/2403.00522
    - https://github.com/Meituan-AutoML/VisionLLaMA/blob/main/dit/models.py

    Args:
        dim (int): usually the multi-head attention dimension, should be divisible by 4 (TODO: relax this constraint if needed)
        max_height (int): the maximum height of the 2D grid
        max_width (int): the maximum width of the 2D grid
        theta_base (float): the base of the theta
        device (str): the device to store the precomputed cis
    '  cudar   
max_height	max_widthc                    sB   t    || _| jd dksJ d|| _|| _|| _|| _d S )N   r   zdim must be divisible by 4)rV   rW   r   r   r   
theta_baser,   )r]   r   r   r   r   r,   r^   r'   r(   rW   &  s   

zRope2DPosEmb.__init__c                 C   s$   d| j  d| j d| j d| j S )Nzdim=z, max_height=z, max_width=z, theta_base=)r   r   r   r   rb   r'   r'   r(   
extra_repr1  s   $zRope2DPosEmb.extra_reprr*   c                 C   s   | j | j }td| | j}|| j }|| j }td| jdd| jd   | j}d| j|| j   }t	|| }t	|| }t
t||}	t
t||}
tj|	jdd|
jddgdd}|| j | jd}|S )a  Calculate the cis(freqs) for each position in the 2D grid.

        Return: complex tensor of shape (max_height, max_width, dim//2) and value:
            height axis: ret[h, w, 2*i] = cis(h * theta_base**(-4*i/dim))
            weight axis: ret[h, w, 2*i+1] = cis(w * theta_base**(-4*i/dim))   with (i in [0, dim//4))
            note: `cis` is a mathematical notation defined by cis x = cos x + i sin x,
        r   r   Ng      ?r   r   )r   r   r   arangerG   tor,   r   r   outerpolar	ones_likerl   rE   r5   )r]   Nflat_posx_posy_pos	dim_rangefreqsx_freqsy_freqsx_cisy_cisrA   r'   r'   r(   precomputed_freqs_cis4  s    	

(z"Rope2DPosEmb.precomputed_freqs_cisrc   c                    sP   |  }t fdd|D sJ | j jftj fdd|D dd}|S )z
        Args:
            grid_hws (torch.Tensor): containing list of (height, width) or (t, height, width) tuples.
        Returns:
            freqs_cis: tensor of shape (sum(t * height * width), dim//2)
        c                 3   sH    | ]\}}d |  ko j kn  od |  ko jkn  V  qdS )r   N)r   r   .0hwrb   r'   r(   	<genexpr>Y  s    8
z8Rope2DPosEmb.get_freqs_cis_by_seqlens.<locals>.<genexpr>c                    s4   g | ]\}} j d |d |f d jd qS )Nr   r=   )r   r5   r   r   rb   r'   r(   
<listcomp>a  s    "z9Rope2DPosEmb.get_freqs_cis_by_seqlens.<locals>.<listcomp>r   r   )rg   allr   r   r   rl   )r]   rc   shapesrA   r'   rb   r(   get_freqs_cis_by_seqlensQ  s    

z%Rope2DPosEmb.get_freqs_cis_by_seqlenspos_idxpos_idx_maskc                 C   s   |j dd |j kr|j d dkr|j|jd ks!J |j |j f|jtjks,J |j|j | jd f }tj|tj| jd}| j	|d | |d | f ||< |S )a  
        Args:
            pos_idx: tensor of shape (..., 2), It contains the (h, w) position indices of each 2D token.
            pos_idx_mask: a mask of shape (...), the leading dimensions should be the same as pos_idx.
                Rope will only be applied to the tokens with True mask. `freqs_cis` for the tokens with False mask with be ones.
        Return:
            freqs_cis: tensor of shape (..., dim//2)
        Nr   r=   r   )r   r,   ).r   ).r   )
r   r>   r   r   r/   r   onesr?   r,   r   )r]   r   r   shprA   r'   r'   r(   get_freqs_cis_by_idxi  s   

z!Rope2DPosEmb.get_freqs_cis_by_idx)r   r   )rp   rq   rr   __doc__rs   rW   r   r   r   ru   r   r   r   rv   r'   r'   r^   r(   r     s*    r   c                       sB   e Zd ZdZd
dee f fddZdejdejfdd	Z	  Z
S )MLP2zn
    Args:
        dims: [in_dim, hidden_dim, out_dim]
        bias: whether to use bias in linear layer.
    Tdimsc                    s   t    t|dksJ tj|d |d |d| _tj|d |d |d| _|| _| j| jfD ]}tjj	|j
td|j d |jd urMtj|j q0d S )Nr   r   r   biasr=   )std)rV   rW   r1   rX   Linearfc0fc1
activationr`   trunc_normal_r[   mathsqrtin_featuresr   zeros_)r]   r   r   r   mr^   r'   r(   rW     s   

zMLP2.__init__r@   r*   c                 C   s   |  |}| |}| |S rU   )r   r   r   )r]   r@   r'   r'   r(   ro     s   


zMLP2.forward)T)rp   rq   rr   r   listrs   rW   r   ru   ro   rv   r'   r'   r^   r(   r     s    r   c                       s   e Zd Zdejdddededededef
 fd	d
Z	dde	j
de	j
dee	j
 fddZ	dde	j
de	j
dee	j
df de	j
fddZ  ZS )MoonVitEncoderLayerr;   F)attn_implementationr   	attn_bias	num_heads
hidden_dimmlp_dimr   r   c                   s   t    || _|| _| j| j | _|| _t|| _t|| _	t
|||g|| _tj||d |d| _tj|||d| _d S )Nr   r   )rV   rW   r   r   hidden_size_per_attention_headr   rX   	LayerNormnorm0norm1r   mlpr   wqkvwo)r]   r   r   r   r   r   r   r^   r'   r(   rW     s   

zMoonVitEncoderLayer.__init__Nr@   
cu_seqlensrope_freqs_cisc                 C   s   |  |}| dd d| j| jf }|j| }tj|dd\}}}t|||\}}t| j	 }	|	|||||d}
| 
|
}
|
S )z}
        Args:
            x (torch.Tensor): (batch_size, seqlen, hidden_dim)
            cu_seqlens (torch.Tensor):
        Nr   r   r   )r   r   )r   re   r   r   rH   r   unbindrO   VL_VISION_ATTENTION_FUNCTIONSr   r   )r]   r@   r   r   xqkv	qkv_shaperC   rD   xv	attn_funcr&   r'   r'   r(   attention_qkvpacked  s   




z'MoonVitEncoderLayer.attention_qkvpackedhidden_statesr*   c                 C   sF   |}|  |}| j|||d}|| }|}| | |}|| }|S )a  
        Args:
            hidden_states: non-packed (B, N, D) or packed (L, D). if non-packed, seqlens should be None, if packed, seqlens should be set

        Returns:
            output: same shape of input, non-packed (B, N, D) for non-packed input, (L, D) for packed input
        r   )r   r   r   r   )r]   r   r   r   residualr&   r'   r'   r(   ro     s   
zMoonVitEncoderLayer.forwardrU   )rp   rq   rr   r3   gelurs   rt   r/   rW   r   ru   r   r   r   ro   rv   r'   r'   r^   r(   r     sB    
$r   c                       sJ   e Zd Zdedededdf fddZdejd	ejdejfd
dZ  Z	S )MoonVitEncoderr   
num_layers	block_cfgr*   Nc                    sR   t    t d  d  dd| _t fddt|D | _t|| _	d S )Nr   r   i   c                    s   g | ]	}t d i  qS )r'   )r   )r   _r   r'   r(   r     s    z+MoonVitEncoder.__init__.<locals>.<listcomp>)
rV   rW   r   rope_2drX   
ModuleListr0   blocksr   final_layernorm)r]   r   r   r   r^   r   r(   rW     s   
zMoonVitEncoder.__init__r   r   c                 C   s   | j j|d}ttjd|j|jd|d d df |d d df  f}|jdtjd}t	| j
D ]\}}||||d}q1| |}|S )N)rc   r   r+   r   )r   r   r   )r   r   r   rl   r.   r,   r   cumsumint32	enumerater   r   )r]   r   r   r   lengthsr   r   blockr'   r'   r(   ro     s   
zMoonVitEncoder.forward)
rp   rq   rr   rs   dictrW   r   ru   ro   rv   r'   r'   r^   r(   r     s"    r   r=   r=   r@   r   merge_kernel_sizec                 C   s   |  d}g }d}| D ]L}|d |d }}| ||||   }	|\}
}||
 || }}|	||
|||}|ddddd }||| |
| d}|| ||| 7 }q|S )Nr   r   r   r=   r   r   )re   rg   rH   rj   
contiguousrh   )r@   r   r   d_modeloutputspre_sumx_shaperR   rS   seqkernel_heightkernel_width
new_height	new_widthreshaped_seq
padded_seqr'   r'   r(   patch_merger  s$   


r   c                       sX   e Zd Z			ddedeeef dededef
 fd	d
Zdej	dej	fddZ
  ZS )MoonVitVLProjectorr   h㈵>   in_channelsr   
hidden_actln_epsrz   c                    sj   t    ||d  |d  | _tjj||d| _tj| j| jdd| _t| | _	tj| j|dd| _
d S )Nr   r   )epsTr   )rV   rW   hidden_sizerX   r   pre_normr   linear_1r	   actlinear_2)r]   r   r   r   r   rz   r^   r'   r(   rW   7  s   

zMoonVitVLProjector.__init__r   r*   c                 C   s6   |  |d| j}| |}| |}| |}|S )Nr   )r   rH   r   r  r  r  )r]   r   r'   r'   r(   ro   G  s
   


zMoonVitVLProjector.forward)r   r   r   )rp   rq   rr   rs   r   rt   rG   rW   r   ru   ro   rv   r'   r'   r^   r(   r   5  s     
r   c                       sT   e Zd ZeZdZdgZdZdZdef fddZ	de
jde
jd	e
jfd
dZ  ZS )MoonVitPretrainedModelmoonvitPackingTransformerTconfigc              
      s   ddl m} t j|g|R i | t|}|j| _|j| _t|j|j|j	|j
d| _t|j|j|j|j|j| d|jdd| _d S )Nr   )GELUTanh)rz   r|   r}   r~   T)r   r   r   r   r   r   )r   r   r   )transformers.activationsr  rV   rW   r   r   r|   rw   r   init_pos_emb_heightinit_pos_emb_widthpatch_embedr   num_hidden_layersnum_attention_headsintermediate_size_attn_implementationencoder)r]   r  inputskwargsr  r^   r'   r(   rW   V  s,   zMoonVitPretrainedModel.__init__pixel_valuesr   r*   c                 C   s,   |  ||}| ||}t||| jd}|S )z
        Args:
            pixel_values (torch.Tensor): The input pixel values.
            grid_hw (torch.Tensor): The grid height and width.

        Returns:
            torch.Tensor: The output tokens.
        )r   )r  r  r   r   )r]   r  r   r   r'   r'   r(   ro   q  s   zMoonVitPretrainedModel.forward)rp   rq   rr   r   config_class
model_type_no_split_modules_supports_flash_attn_2_supports_sdparW   r   ru   ro   rv   r'   r'   r^   r(   r  O  s    r  )NN)r   )-r   copyr   	functoolsr   typingr   r   r   r   r   r   torch.nnrX   torch.nn.functional
functionalr3   r	  r	   transformers.modeling_utilsr
   flash_attn.flash_attn_interfacer   r   sglang.srt.configsr   ru   r)   r:   r   rB   tuplerO   ModulerP   rw   r   r   r   r   r   rs   r   r   r  r'   r'   r'   r(   <module>   s   +
8
+
#,uR,

