o
    پi_m                     @   s
  d dl Z d dlmZ d dlmZmZmZmZmZ d dl	Z
d dlZd dlm  mZ d dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZmZ zd d	lmZ W n eyg   d d
lmZ ee_eZY nw d dlmZ d dl m!Z! d dl"m#Z#m$Z$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z/ d dl0m1Z1 d dl2m3Z3 d dl4m5Z5 dZ6e 7e8Z9d dl:m;Z; 	dHdej<dej<dej<de=ej<ej<f fddZ>	dIdej<d ej<d!e=e?e?f de@ej< fd"d#ZAG d$d% d%ejBZCd&d' ZDd(d) ZEeDejFd*d+d,d- ZGdJd/d0ZHG d1d2 d2ejBZIG d3d4 d4ejBZJG d5d6 d6ejBZKG d7d8 d8ejBZLG d9d: d:ejBZMG d;d< d<ejBZNeO d=ejjBdB d>e@ej< fd?d@ZPeO 	dHdAejjBdBej<dCej<d=ejjBdB de@ej< f
dDdEZQG dFdG dGejBZReRgZSdS )K    N)deepcopy)IterableListOptionalSequenceTuple)nn)activations)KimiK25ConfigKimiK25VisionConfig)QuantizationConfig)/MultiModalityDataPaddingPatternMultimodalTokensgeneral_mm_embed_routine)PytorchGELUTanh)GELUTanh)VisionAttention)ReplicatedLinear)ModalityMultimodalDataItemMultimodalInputs)ForwardBatch)default_weight_loader)DeepseekV3ForCausalLM)MLP2)WeightsMapper)!run_dp_sharded_mrope_vision_model)get_global_server_args)
add_prefixi?  )is_dp_attention_enabledxqxk	freqs_cisreturnc                 C   s   | d}t|  jg | jdd ddR  }t| jg | jdd ddR  }t|| d}t|| d}|| ||fS )a  
    Args: (The leading dimensions of all inputs should be the same)
        xq: query, tensor of shape (..., num_heads, head_dim)
        xk: key, tensor of shape (..., num_heads, head_dim)
        freqs_cis: tensor of shape (..., head_dim/2), dtype=torch.complex64. It contains the precomputed cis(freqs) for each position in the 2D grid.
    Returns:
        xq_out, xk_out: tensors of shape (..., num_heads, head_dim)
    N   )		unsqueezetorchview_as_complexfloatviewshapeview_as_realflattentype_as)r   r    r!   x_shapexq_xk_xq_outxk_out r4   N/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/kimi_k25.py
apply_rope0   s   
,,r6   r%   r%   x	grid_thwsmerge_kernel_sizec              	   C   s   |  d}g }d}| D ]P\}}}| |||| |   }	|\}
}||
 || }}|	|||
|||}|dddddd jdd}||| |
| d}|| ||| | 7 }q|S )	Nr$   r         r%         dim)sizetolistr*   permute
contiguousmeanappend)r8   r9   r:   d_modeloutputspre_sumthwseqkernel_heightkernel_width
new_height	new_widthreshaped_seq
padded_seqr4   r4   r5   tpool_patch_mergerE   s$   

rT   c                       sv   e Zd Zejddddddedededed	ee d
e	def fddZ
	ddejdejdedejdB fddZ  ZS )MoonViTEncoderLayerFN )
activation	attn_biasquant_configprefixuse_data_parallel	num_heads
hidden_dimmlp_dimrX   rY   rZ   r[   c          	         s|   t    || _|| _| j| j | _t|| _t|| _t	|||g|| _
t|||d||d|td||tt d| _d S )NTattn)	embed_dimr\   projection_sizeuse_qkv_parallelqkv_bias	proj_biasflatten_batchrY   rZ   r[   %customized_position_embedding_applieruse_dp_attention_reduce)super__init__r\   r]   hidden_size_per_attention_headr   	LayerNormnorm0norm1r   mlpr   r   r6   r   r_   )	selfr\   r]   r^   rW   rX   rY   rZ   r[   	__class__r4   r5   ri   e   s*   
zMoonViTEncoderLayer.__init__hidden_states
cu_seqlens
max_seqlenrope_freqs_cisc                 C   sJ   |}|  |}| j|||d}|| }|}| |}| |}|| }|S )N)rs   position_embeddings)rl   r_   rm   rn   )ro   rr   rs   rt   ru   residualr4   r4   r5   forward   s   


zMoonViTEncoderLayer.forwardN)__name__
__module____qualname__Fgeluintboolr   r   strri   r'   Tensorrx   __classcell__r4   r4   rp   r5   rU   c   s>    	
)rU   c                    s   t    fdd}|S )Nc                    s<   | j t |f}| vr | | |dd}| ||S )N)@   r   r+   )requires_gradr'   is_grad_enabledadd)orginterpolation_moder+   key__get_rope_shape_first_call_flagfuncr4   r5   wrapper   s
   
z(get_rope_shape_decorate.<locals>.wrapper)set)r   r   r4   r   r5   get_rope_shape_decorate   s   r   c                 C   s~   | d dksJ t j| d t jd}|| d  }dd|  }|d}t d||}t |}t |}t j||gd	d
}|S )a-  
    From:
    https://github.com/OpenGVLab/InternVideo/blob/421f6d2361fc8f61a3394244571f2601a4e99e29/InternVideo2/multi_modality/models/backbones/internvideo2/pos_embed.py#L86
    embed_dim: output dimension for each position
    pos: a list of positions to be encoded: size (M,)
    out: (M, D)
    r%   r   dtypeg       @      ?'  r$   zm,d->mdr;   axis)nparangefloat32reshapeeinsumsincosconcatenate)r`   posomegaoutemb_sinemb_cosembr4   r4   r5   !get_1d_sincos_pos_embed_from_grid   s   


r   T)dynamicc                 C   s0   t j| dd||dddjddS )N)r%   r   r;   r   )rA   mode)r;   r%   r   r;   end_dim)r}   interpolaterC   r&   squeezer-   )r   r   r+   r4   r4   r5   get_rope_shape   s   r   Fc                 C   s>   t j|t jd}t| |}|rt jt d| g|gdd}|S )z
    t_size: int of the temporal size
    return:
    pos_embed: [t_size, embed_dim] or [1+t_size, embed_dim] (w/ or w/o cls_token)
    r   r;   r   r   )r   r   r   r   r   zeros)r`   t_size	cls_tokengrid_t	pos_embedr4   r4   r5   get_1d_sincos_pos_embed   s
   
r   c                       s^   e Zd Z	ddedededededdf fd	d
Zdd ZdejdejdejfddZ	  Z
S )$Learnable2DInterpPosEmbDivided_fixedbicubicheightwidth
num_framesr@   r   r"   Nc                    st   t    || _|| _|| _|| _|| _tt	
|||| _| jdt	t| j| j ddd |   d S )Ntime_weightr;   F
persistent)rh   ri   r   r   r   r@   r   r   	Parameterr'   emptyweightregister_buffer
from_numpyr   r)   r&   reset_parameters)ro   r   r   r   r@   r   rp   r4   r5   ri      s   
z-Learnable2DInterpPosEmbDivided_fixed.__init__c                 C   s   t j| j d S ry   )r   initnormal_r   ro   r4   r4   r5   r      s   z5Learnable2DInterpPosEmbDivided_fixed.reset_parametersr8   r9   c           
      C   s   g }|  D ]X\}}}|| jksJ d| d| j ||f| jjd d kr/| jjdd}nt| j| j||fd}|dkrA|}n|d|dd| j	d|  }|
|d|jd  q|t| }	|	S )Nzt:z > self.num_frames:r$   r;   r   )r   r+   r   )rB   r   r   r+   r-   r   r   r&   repeatr   rF   r   r'   cat)
ro   r8   r9   pos_embsrJ   rK   rL   
pos_emb_2d
pos_emb_3dr   r4   r4   r5   rx     s"     z,Learnable2DInterpPosEmbDivided_fixed.forward)r   )rz   r{   r|   r   r   ri   r   r'   r   rx   r   r4   r4   rp   r5   r      s"    $r   c                       sj   e Zd ZdZddededef fddZdd	 Zd
ejdej	fddZ
dej	d
ejdej	fddZ  ZS )Rope2DPosEmbRepeateda  2D rotary position embedding with multi-resolution support.
    This class is intended to be used in the following way:
    1. Before training, create an instance of Rope2DPosEmb. This instance will hold the precomputed cis.
    2. Before each forward pass, call `get_freqs_cis_by_*` to get the `freqs_cis` tensor for this iteration.
    3. During the forward pass, pass the `freqs_cis` tensor to each attention layer, and call `apply` just before each attention operation.
        The rope is shared across all attention layers and all heads.
    Refs:
    - RoFormer: https://arxiv.org/abs/2104.09864
    - VisionLLaMA: https://arxiv.org/abs/2403.00522
    - https://github.com/Meituan-AutoML/VisionLLaMA/blob/main/dit/models.py
    Args:
        dim (int): usually the multi-head attention dimension, should be divisible by 4 (TODO: relax this constraint if needed)
        max_height (int): the maximum height of the 2D grid
        max_width (int): the maximum width of the 2D grid
        theta_base (float): the base of the theta
    r   r@   
max_height	max_widthc                    s<   t    || _| jd dksJ d|| _|| _|| _d S )Nr=   r   zdim must be divisible by 4)rh   ri   r@   r   r   
theta_base)ro   r@   r   r   r   rp   r4   r5   ri   -  s   

zRope2DPosEmbRepeated.__init__c                 C   s$   d| j  d| j d| j d| j S )Nzdim=z, max_height=z, max_width=z, theta_base=)r@   r   r   r   r   r4   r4   r5   
extra_repr5  s   $zRope2DPosEmbRepeated.extra_reprdevicer"   c                 C   s   | j | j }td| |}|| j }|| j }td| jdd| jd   |}d| j|| j   }t|| }t|| }	t	t
||}
t	t
|	|	}tj|
jdd|jddgdd}|| j | jd}|S )a  Calculate the cis(freqs) for each position in the 2D grid.
        Return: complex tensor of shape (max_height, max_width, dim//2) and value:
            height axis: ret[h, w, 2*i] = cis(h * theta_base**(-4*i/dim))
            weight axis: ret[h, w, 2*i+1] = cis(w * theta_base**(-4*i/dim))   with (i in [0, dim//4))
            note: `cis` is a mathematical notation defined by cis x = cos x + i sin x,
        r   r=   Nr   r$   r?   )r   r   r'   r   r)   tor@   r   outerpolar	ones_liker   r&   r   )ro   r   Nflat_posx_posy_pos	dim_rangefreqsx_freqsy_freqsx_cisy_cisr!   r4   r4   r5   _precompute_freqs_cis8  s    

&z*Rope2DPosEmbRepeated._precompute_freqs_cisr9   c                    sp   t  ds jd |dd | }t fdd|D s(J | j jftj fdd|D dd	}|S )
z
        Args:
            grid_thws (torch.Tensor): grid time, height and width
        Returns:
            freqs_cis: tensor of shape (sum(t * height * width), dim//2)
        r!   Fr   c                 3   sJ    | ] \}}}d |  ko j kn  o d |  ko jkn  V  qdS )r;   N)r   r   .0rJ   rK   rL   r   r4   r5   	<genexpr>b  s    :
z5Rope2DPosEmbRepeated.get_freqs_cis.<locals>.<genexpr>c                    s>   g | ]\}}} j d |d |f d jd |dqS )Nr$   r%   r;   )r!   r   r@   r   r   r   r4   r5   
<listcomp>j  s    *z6Rope2DPosEmbRepeated.get_freqs_cis.<locals>.<listcomp>r   r?   )	hasattrr   r   rB   allr   r   r'   r   )ro   r9   r   shapesr!   r4   r   r5   get_freqs_cisS  s(   
	

z"Rope2DPosEmbRepeated.get_freqs_cis)r   )rz   r{   r|   __doc__r   ri   r   r'   r   r   r   r   r   r4   r4   rp   r5   r     s    r   c                       sp   e Zd Z						ddededeeeef B d	ed
ededef fddZdejdejdejfddZ	  Z
S )MoonVision3dPatchEmbedr<      r   r   r=   divided_fixedout_dimin_dim
patch_sizepos_emb_heightpos_emb_widthpos_emb_timepos_emb_typec                    s   t    t|ttB sJ dt| t|tr||f}t|dks+J d| || _tj	||||d| _
|dkrGt||||d| _d S td| )NzInvalid patch_size type: r%   z,Expected patch_size to be a tuple of 2, got )kernel_sizestrider   )r   r   r   r@   zNot support pos_emb_type: )rh   ri   
isinstancer   r   typelenr   r   Conv2dprojr   pos_embNotImplementedError)ro   r   r   r   r   r   r   r   rp   r4   r5   ri   u  s.   


zMoonVision3dPatchEmbed.__init__r8   r9   r"   c                 C   s(   |  ||dd}| ||}|S )z
        Args:
            x (L, Channels): input tensor
            grid_hws (N, 3): temporal, height and width
        Returns:
            (L, Cout) tensor
        r   r$   )r   r*   rA   r   )ro   r8   r9   r4   r4   r5   rx     s   zMoonVision3dPatchEmbed.forward)r<   r   r   r   r=   r   )rz   r{   r|   r   tupler   ri   r'   r   rx   r   r4   r4   rp   r5   r   s  s.    $#r   c                       sR   e Zd Z	ddededededdf
 fdd	Zd
ejdejdejfddZ	  Z
S )MoonViT3dEncoderspatial_temporalr]   
num_layers	block_cfgvideo_attn_typer"   Nc                    sn   t    |dksJ d| || _t d  d  dd| _t fddt|D | _t	|| _
d S )Nr   z0video_attn_type must be "spatial_temporal", got r]   r\   i   c                    s   g | ]	}t d i  qS )r4   )rU   )r   r   r  r4   r5   r     s    z-MoonViT3dEncoder.__init__.<locals>.<listcomp>)rh   ri   r  r   rope_2dr   
ModuleListrangeblocksrk   final_layernorm)ro   r]   r   r  r  rp   r  r5   ri     s   

zMoonViT3dEncoder.__init__rr   r9   c                 C   s   | j j||jd}ttjd|j|jd|d d df |d d df  |d d df  f}| }||jj	dtj
d}| jD ]
}|||||d}qA| |}|S )N)r9   r   r;   )r   r   r   r%   )r@   r   )ru   )r  r   r   r'   r   r   r   maxr   cumsumint32r  r  )ro   rr   r9   ru   lengthsrt   rs   blockr4   r4   r5   rx     s    .

zMoonViT3dEncoder.forward)r   )rz   r{   r|   r   dictr   ri   r'   r   rx   r   r4   r4   rp   r5   r     s(    r   c                       s~   e Zd ZdZdgZdZdZdddef fddZe	d	e
jfd
dZe	d	e
jfddZde
jde
jd	e
jfddZ  ZS )MoonViT3dPretrainedModel	moonvit3dPackingTransformerTFr[   r[   c             
      s   t    t|}|| _|j| _|j| _|j| _t|j|j|j	|j
|j|jd| _t|j|j|j|j|jt d|d|jd| _d S )N)r   r   r   r   r   r   T)r\   r]   r^   rW   rX   r[   )r]   r   r  r  )rh   ri   r   configr:   r   
merge_typer   hidden_sizeinit_pos_emb_heightinit_pos_emb_widthinit_pos_emb_timer   patch_embedr   num_hidden_layersnum_attention_headsintermediate_sizer   r  encoder)ro   r  r[   inputskwargsrp   r4   r5   ri     s4   
	z!MoonViT3dPretrainedModel.__init__r"   c                 C      | j jjjS ry   )r  r   r   r   r   r4   r4   r5   r        zMoonViT3dPretrainedModel.dtypec                 C   r   ry   )r  r   r   r   r   r4   r4   r5   r     r!  zMoonViT3dPretrainedModel.devicepixel_valuesr9   c                 C   sl   |j dksJ d|j  |ddksJ d| | ||}| ||}|d}t||| jd}|S )z
        Args:
            pixel_values (torch.Tensor): The input pixel values.
            grid_thws (torch.Tensor): Temporal, height and width.
        Returns:
            torch.Tensor: The output tokens.
        r%   zgrid_thws should be 2D, got r;   r<   zNo support for _thw: r   )r:   )ndimrA   r  r  r   rT   r:   )ro   r"  r9   rr   r4   r4   r5   rx     s   

z MoonViT3dPretrainedModel.forward)rz   r{   r|   
model_type_no_split_modules_supports_flash_attn_2_supports_sdpar   ri   propertyr'   r   r   r   rx   r   r4   r4   rp   r5   r    s"    r  c                       sD   e Zd ZdZ	ddedef fddZdejdejfd	d
Z	  Z
S )K2VLMultiModalProjectorz3Multi-modal projector with patch merging for K2-VL.rV   r  rZ   c                    s~   t    |j\}}|j| | | _tjj|jdd| _t	| j| jdt
|dd| _t	| j|jdt
|dd| _t | _d S )Ngh㈵>)epsTlinear_1)biasrZ   linear_2)rh   ri   r:   vt_hidden_sizer  r'   r   rk   pre_normr   r   r+  text_hidden_sizer-  GELUact)ro   r  rZ   merge_hmerge_wrp   r4   r5   ri   !  s"   

z K2VLMultiModalProjector.__init__image_featuresr"   c                 C   s>   |  |d| j}| |\}}| |}| |\}}|S )Nr$   )r/  r*   r  r+  r2  r-  )ro   r5  rr   r   r4   r4   r5   rx   ;  s
   
zK2VLMultiModalProjector.forward)rV   )rz   r{   r|   r   r   r   ri   r'   r   rx   r   r4   r4   rp   r5   r)    s    r)  mm_projector	vt_outputc                 C   sZ   | du r|S dd |D }t j|dd}| r| |n|}|d|jd }t ||}|S )z+Apply MM projector to vision tower outputs.Nc                 S   s   g | ]}|j d  qS )r   r   )r   r8   r4   r4   r5   r   K  s    z&mm_projection_auto.<locals>.<listcomp>r   r?   r$   )r'   r   r   r+   split)r6  r7  num_embedding_listbatchedproj_outr4   r4   r5   mm_projection_autoC  s   r<  vision_towerr"  grid_thwc                 C   sF  t |tjsJ dt||jd }|d}t| t}t	
d|t g }d}d}	d}
t|D ]D}||  }|
| |krF|
|7 }
q3|	|k rs||	| }||	|  }||||  }| ||}t||}|| ||7 }|}	|}
q3|	|k r||	| }||	|  }||||  }| ||}t||}|| |S )z"Auto-batched vision tower forward.z*expect pixel_values to be a tensor, get {}r   r$   z8vt max_infer_batch: %s, KIMIV_VT_INFER_MAX_PATCH_NUM: %s)r   r'   r   formatr   r+   prodr	  KIMIV_VT_INFER_MAX_PATCH_NUMloggerdebugr  itemsumr<  extend)r=  r"  r>  r6  nn_patches_each_mediamax_infer_batchtensorsrI   current_group_startcurrent_group_patchesicurrent_media_patchesgroup_grid_thwgroup_n_patchesgroup_inputgroup_outputr;  r4   r4   r5   vision_tower_forward_autoS  sN   








rS  c                	       s   e Zd ZeddidZ		ddedee ded	df fd
dZ	de
e d	ejfddZde
e defddZ	ddejdejdedefddZdeeeejf  fddZ  ZS )KimiK25ForConditionalGenerationzlanguage_model.layers.zlanguage_model.model.layers.)orig_to_new_prefixNrV   r  rY   rZ   r"   c                    s   t    || _|| _t j| _t|j| jd| _	t
|j| _t|j|| _t| jdrB| jj}| j	j|d| _	| jj|d| _d S d S )Nr  r   r   )rh   ri   r  rY   r   mm_enable_dp_encoderr[   r  vision_configr=  r)  r6  r   text_configlanguage_modelr   r   r   )ro   r  rY   rZ   r  target_dtyperp   r4   r5   ri     s   

z(KimiK25ForConditionalGeneration.__init__itemsc                 C   s   t jdd |D dd| jj}t jdd |D dd| jj}| jjj	j
j}||}| jrCt| j|| dd}| |}|S t| j||| jd}t j|dd}|S )	Nc                 S      g | ]}|j qS r4   )featurer   rD  r4   r4   r5   r         zEKimiK25ForConditionalGeneration.get_image_feature.<locals>.<listcomp>r   r?   c                 S   r\  r4   )r9   r^  r4   r4   r5   r     r_  r  )	rope_type)r6  )r'   r   r   r=  r   concatr   r   r  r   r   r[   r   rB   r6  rS  )ro   r[  r"  r9   rZ  image_embedsr5  r4   r4   r5   get_image_feature  s2   

z1KimiK25ForConditionalGeneration.get_image_feature	input_ids	mm_inputsc                 C   s   t  }|||S ry   )r   pad_input_tokens)ro   rd  re  patternr4   r4   r5   pad_input_ids  s   z-KimiK25ForConditionalGeneration.pad_input_idsF	positionsforward_batchget_embeddingc                 C   s    t ||| jtj| ji|d}|S )N)rd  rj  rY  data_embedding_funcsri  )r   rY  r   IMAGErc  )ro   rd  ri  rj  rk  rr   r4   r4   r5   rx     s   
z'KimiK25ForConditionalGeneration.forwardweightsc                 C   s  t | dd}|dur||}g }g }|D ]9\}}d|v s!d|v rA|dd}|dd}|d	d
}|dd}|||f q|dd}|||f qt|}t| jdd}| D ]\}}||vrotd| d|| }	t |	dt}
|
|	| q_|r| j	
| dS dS )zBLoad weights for the model, separating vision and language weightshf_to_sglang_mapperNr=  r6  zwqkv.zattn.qkv_proj.zwo.z
attn.proj.zmm_projector.proj.0zmm_projector.linear_1zmm_projector.proj.2zmm_projector.linear_2zlanguage_model.rV   F)remove_duplicatezWeight z not found in params_dictweight_loader)getattrapplyreplacerF   r  named_parametersr[  
ValueErrorr   rY  load_weights)ro   rn  mappervision_weightslanguage_weightsnameloaded_weightvision_state_dictparams_dictparamrq  r4   r4   r5   rw    s2   
z,KimiK25ForConditionalGeneration.load_weights)NrV   F)rz   r{   r|   r   ro  r
   r   r   r   ri   r   r   r'   r   rc  r   r   rh  r   r   rx   r   r   rw  r   r4   r4   rp   r5   rT    s:    		
$rT  ry   )r7   r  )Tloggingcopyr   typingr   r   r   r   r   numpyr   r'   torch.nn.functionalr   
functionalr}   transformersr	   sglang.srt.configs.kimi_k25r
   r   *sglang.srt.layers.quantization.base_configr   sglang.srt.managers.mm_utilsr   r   transformers.activationsr   ImportErrorr   "sglang.srt.layers.attention.visionr   sglang.srt.layers.linearr   "sglang.srt.managers.schedule_batchr   r   r   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr   sglang.srt.models.deepseek_v2r   !sglang.srt.models.kimi_vl_moonvitr   sglang.srt.models.utilsr   sglang.srt.multimodal.mm_utilsr   sglang.srt.server_argsr   sglang.srt.utilsr   rA  	getLoggerrz   rB  sglang.srt.layers.dp_attentionr   r   r   r6   r   listrT   ModulerU   r   r   compiler   r   r   r   r   r   r  r)  inference_moder<  rS  rT  
EntryClassr4   r4   r4   r5   <module>   s    



@

8X34D%

4 
