o
    پiP                     @   s  d Z ddlmZ ddlmZmZmZmZmZ ddl	Z	ddl
mZmZ ddl	mZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZmZ ddlm Z m!Z!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* eej+ddZ,dd Z-d)ddZ.G dd dej/Z0G dd dej/Z1G dd  d ej/Z2G d!d" d"ej/Z3G d#d$ d$ej/Z4G d%d& d&ej/Z5G d'd( d(ej/Z6e6Z7dS )*zIThis is basically a copy from perception_models/core/vision_encoder/pe.py    )partial)CallableIterableListOptionalTupleN)	rearrangerepeat)nn)
functional)ACT2FN)Step3VLConfig)VisionAttention)ColumnParallelLinearRowParallelLinear)QuantizationConfig)/MultiModalityDataPaddingPatternMultimodalTokensgeneral_mm_embed_routine)ModalityMultimodalDataItemMultimodalInputs)ForwardBatch)default_weight_loader)Qwen3ForCausalLM)
add_prefixh㈵>)epsc                 C   s<   t | ddd} | jdd\}}tj| |fdd} t | dS )Nz... (d r) -> ... d r   rdimz... d r -> ... (d r))r   unbindtorchstack)xx1x2 r)   R/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/step3_vl_10b.pyrotate_half#   s   
r+         ?c                 C   s   |j }|jdkr|j| }| | d  } | jd }|| }||jd ks/J d|jd ||dd |f |d||f |d|d f }	}}
||   | t||   |  }tj|	||
fdd}|	|S )N   r    zPfeature dimension {} is not of sufficient size to rotate in all the positions {}.r!   )
dtypendimshapeformatcosr+   sinr$   cattype)freqststart_indexscaleseq_dimr/   seq_lenrot_dim	end_indext_leftt_rightoutr)   r)   r*   apply_rotary_emb*   s"   



$
rB   c                	       s   e Zd Z					ddededed	ef fd
dZdeeB dedejfddZ	dejdejfddZ
dejfddZdejdejdeeef fddZ  ZS )PerceptionEncoderRope2DF'  
      r,   r"   max_grid_heightmax_grid_widthuse_cls_tokenc	           
         s`   t    || _|| _|| _|| _||||d    | _|| _|| _| 	 }	| j
d|	dd d S )Nr   freqs_cacheF)
persistent)super__init__r"   rG   rH   rI   thetamax_freq	num_freqs_compute_2d_freqsregister_buffer)
selfr"   rG   rH   rI   rN   rO   rP   theta_rescale_factorcache	__class__r)   r*   rM   E   s   
z PerceptionEncoderRope2D.__init__basereturnc                 C   s.   d|t d|dd |d   |   }|S )Nr,   r   r   )r$   arangefloat)rS   rX   r"   r7   r)   r)   r*   _compute_inv_freq[   s   *z)PerceptionEncoderRope2D._compute_inv_freqr8   inv_freqc                 C   s(   t d||j|}t|ddd}|S )Nz..., f -> ... fz... n -> ... (n r)r   r   )r$   einsumr6   r/   r	   )rS   r8   r]   r7   r)   r)   r*   _compute_freqs_   s   z&PerceptionEncoderRope2D._compute_freqsc                 C   s   t j| jt jd}t j| jt jd}| jr|d7 }|d7 }| | j| jd }| 	||d d d f 
| j| jd}| 	||d d d f 
| j| jd}t j||gdd| j| j d}| jrqt jt d|jd |gdd}|d }|S )N)r/   rF   r   r    r!   r   )NN.)r$   rZ   rG   r[   rH   rI   r\   rN   r"   r_   expandr5   reshapezerosr1   )rS   grid_h_rangegrid_w_ranger]   freqs_hfreqs_wr7   r)   r)   r*   rQ   d   s&   

 z)PerceptionEncoderRope2D._compute_2d_freqsqkgrid_hwc                 C   sN  |d | j ks|d | jkrZtj|d |jddd}tj|d |jddd}|| j | dtj}| j	rRtj
tjd|jd|d gdd}|tj}| jd|}n| j}|j}	|\}
}}||
|d| jdddd}||
|d| jdddd}t||}t||}|dddd|	}|dddd|	}||fS )Nr   rF   )devicer    r!   r   r.   )rG   rH   r$   rZ   rj   viewra   tolongrI   r5   rb   rJ   index_selectr1   r"   permuterB   )rS   rg   rh   ri   x_shaperowscols	positionsr7   	ori_shapebsr<   _r)   r)   r*   forwardy   s(   


zPerceptionEncoderRope2D.forward)FrD   rE   rF   r,   )__name__
__module____qualname__intboolrM   r[   r$   Tensorr\   r_   rQ   tuplerw   __classcell__r)   r)   rV   r*   rC   D   s2    
rC   c                       s&   e Zd Zd fdd	Zdd Z  ZS )PerceptionEncoderLayerScaler   Fc                    s*   t    || _t|t| | _d S N)rL   rM   inplacer
   	Parameterr$   onesgamma)rS   r"   init_valuesr   rV   r)   r*   rM      s   
z$PerceptionEncoderLayerScale.__init__c                 C   s   | j r	|| jS || j S r   )r   mul_r   )rS   r&   r)   r)   r*   rw      s   z#PerceptionEncoderLayerScale.forward)r   F)rx   ry   rz   rM   rw   r   r)   r)   rV   r*   r      s    r   c                       s\   e Zd Z		ddededeg ejf dedB def
 fdd	Z	d
e
jde
jfddZ  ZS )PerceptionEncoderMLPN 	input_dim
hidden_dim	act_layerquant_configprefixc                    sH   t    t||d|| dd| _|| _t||d|| dd| _d S )NTz.fc1)biasr   r   z.fc2)rL   rM   r   fc1
activationr   fc2)rS   r   r   r   r   r   rV   r)   r*   rM      s    
zPerceptionEncoderMLP.__init__r&   rY   c                 C   s*   |  |\}}| |}| |\}}|S r   )r   r   r   )rS   r&   rv   r)   r)   r*   rw      s   
zPerceptionEncoderMLP.forwardNr   )rx   ry   rz   r{   r   r
   Moduler   strrM   r$   r}   rw   r   r)   r)   rV   r*   r      s    r   c                       s~   e Zd Zddejejdddfdedededed	ed
ededede	de
dB def fddZdejdeeef fddZ  ZS )PerceptionEncoderVisionBlock      @NFr   d_modeln_headrG   rH   	mlp_ratiols_init_valuer   
norm_layerrI   r   r   c              
      s   t    || | _t| j|||	d| _t|||dd|
td|| jd| _|d ur.t||nt	
 | _|d ur<t||nt	
 | _||| _||| _t|| }t||||
| dd| _d S )N)r"   rG   rH   rI   Tattn)	embed_dim	num_headsprojection_sizeuse_qkv_parallel	proj_biasr   r   %customized_position_embedding_applierz.mlpr   r   )rL   rM   head_dimrC   roper   r   r   r   r
   Identityls_1ls_2ln_1ln_2r{   r   mlp)rS   r   r   rG   rH   r   r   r   r   rI   r   r   r   rV   r)   r*   rM      sH   





z%PerceptionEncoderVisionBlock.__init__r&   ri   c                 C   s<   ||  | j| ||d }|| | | | }|S )N)position_embeddings)r   r   r   r   r   r   )rS   r&   ri   r)   r)   r*   rw      s   z$PerceptionEncoderVisionBlock.forwardrx   ry   rz   r
   GELU	LayerNormr{   r[   r   r|   r   r   rM   r$   r}   r~   rw   r   r)   r)   rV   r*   r      s@    	
$6r   c                       s   e Zd Zddejejdddfdedededed	ed
edededede	de
dB def fddZdejdeeef fddZ  ZS )"PerceptionEncoderVisionTransformerr   NFr   widthlayersheadsrG   rH   r   r   r   r   rI   r   r   c                    sL   t    
| _|| _t 	
fddt|D | _d S )Nc                    s4   g | ]}t 
 	 d | dqS )z.resblocks.)r   r   rG   rH   r   r   r   r   rI   r   r   )r   ).0ir   r   r   rG   rH   r   r   r   r   rI   r   r)   r*   
<listcomp>  s     z?PerceptionEncoderVisionTransformer.__init__.<locals>.<listcomp>)rL   rM   r   r   r
   
ModuleListrange	resblocks)rS   r   r   r   rG   rH   r   r   r   r   rI   r   r   rV   r   r*   rM      s   

z+PerceptionEncoderVisionTransformer.__init__r&   ri   c                 C   s   | j D ]}|||d}q|S )Nri   )r   )rS   r&   ri   blockr)   r)   r*   rw   !  s   
z*PerceptionEncoderVisionTransformer.forwardr   r)   r)   rV   r*   r      sD    	
$%r   c                	       s   e Zd Zeddfdededee def fddZe	d	e
jfd
dZdedefddZde
jfddZde
jfddZ  ZS )PerceptionEncoderNr   r   r   r   r   c                    s  t    |j| _|jp|j| _|j| _|j| _|j| _|j| _|j| _|j	| _	| j	s/t
d|j| _tjd|j|j|jdd| _|jrI||jnt | _|jrV|| jnt | _t|j|j|j| j| j | j| j |j|j||| j|| dd| _tj|j|jd dddd	| _tj|jd |jd
 dddd	| _| jrt| jd t| j | _| jr| j| j | _t| jd tt| j| jd  | j | _ d S d S )Nzuse_rope2d must be Truer.   F)in_channelsout_channelskernel_sizestrider   z.transformer)	rG   rH   r   r   r   r   rI   r   r   r   rF   )r   r   padding   g      )!rL   rM   
patch_size
output_dimr   r   r   use_abs_posembrI   
use_rope2d
ValueError
image_sizer
   Conv2dconv1
use_ln_prer   ln_preuse_ln_postln_postr   r   r   transformervit_downsampler1vit_downsampler2r   r$   randnclass_embeddingposemb_grid_sizer{   positional_embedding)rS   configr   r   r   r   rV   r)   r*   rM   (  sp   



zPerceptionEncoder.__init__rY   c                 C   
   | j jjS r   )r   weightr/   rS   r)   r)   r*   r/   o     
zPerceptionEncoder.dtypegrid_hgrid_wc                 C   s   | j |kr| j |kr| jd S | j}| jr"|d d |dd  }}|d| j | j ddddd }tj|||fddd	}|ddddd| j}| jrXt	j
||gdd
}|d S )N)N.rF   r    r   r.   r   bilinearF)sizemodealign_cornersr!   )r   r   rI   ra   ro   
contiguousFinterpolater   r$   r5   )rS   r   r   	pos_embedcls_token_embedr)   r)   r*   sample_abs_posembs  s    
z#PerceptionEncoder.sample_abs_posembr&   c                 C   s   |j \}}}}|| j || j }}| |}|dddd|d| j}| jr<tj| j	
ddd|dd|gdd}| jrG|| || }| |}| j|||fd}| |}| jrj|d d dd d d f }|S )Nr   r   r.   rF   r    r!   r   )r1   r   r   ro   ra   r   rI   r$   r5   r   rk   r`   r   r   r   r   r   )rS   r&   batchrv   hwr   r   r)   r)   r*   forward_features  s    


z"PerceptionEncoder.forward_featuresc                 C   s~   |  |}|j\}}}t|d }|dd }|||||}| |}| |}|j\}}}}||d|| ddS )Ng      ?r   rF   r    )r   r1   r{   	transposer   rk   r   r   )rS   r&   BPCTr)   r)   r*   rw     s   


zPerceptionEncoder.forward)rx   ry   rz   _DEFAULT_NORM_LAYERr   r   r   r   rM   propertyr$   r/   r{   r   r}   r   rw   r   r)   r)   rV   r*   r   '  s$    Gr   c                	       s   e Zd Z		d#dedee def fddZdej	d	ej	fd
dZ
ed	ejfddZd	ej	fddZdej	d	ej	fddZdee d	ej	fddZdee defddZ	d$dej	dej	dedefddZd eeeej	f  fd!d"Z  ZS )%StepVLForConditionalGenerationNr   r   r   r   c              	      sx   t    || _t|jt|jj |t|dd| _t	|jj
d |jj|jd|t|dd| _t|j|t|dd| _d S )	Nvision_modelr   r   Tvit_large_projector)r   gather_outputr   r   language_model)r   r   r   )rL   rM   r   r   vision_configr   
hidden_actr   r   r   r   text_confighidden_sizeprojector_biasr   r   r   )rS   r   r   r   rV   r)   r*   rM     s*   


	z'StepVLForConditionalGeneration.__init__input_tensorrY   c                 C   s
   |  |S r   )r   )rS   r   r)   r)   r*   _get_vision_model_output  s   
z7StepVLForConditionalGeneration._get_vision_model_outputc                 C   r   r   )r   r   rj   r   r)   r)   r*   rj     r   z%StepVLForConditionalGeneration.devicec                    s4   t |tjr|ddS tt fdd|D S )Nr   r-   c                 3   s    | ]}  |V  qd S r   )_flatten_embeddings)r   r8   r   r)   r*   	<genexpr>  s    zEStepVLForConditionalGeneration._flatten_embeddings.<locals>.<genexpr>)
isinstancer$   r}   flattenr5   r~   )rS   
embeddingsr)   r   r*   r     s   z2StepVLForConditionalGeneration._flatten_embeddingsimage_featuresc                 C   s   |  |\}}|S r   )r   )rS   r  rv   r)   r)   r*   _process_image_features  s   z6StepVLForConditionalGeneration._process_image_featuresitemsc                 C   sH  t |dksJ |d }|j| jj| j}|jd}|jdd }|d ur4|| jj| j}| 	|}|d urB| 	|nd }| 
|}|d urR| 
|nd }g }d}	t|D ]B\}
}g }|dkrz||	|	|  }||d|jd  |||
 d|jd  |	|7 }	|t |dkrt|n|d  q\| |S )NrF   r   num_patchespatch_pixel_valuesr    )lenfeaturer6   r   r/   rl   rj   model_specific_datagetr   r  	enumerateappendrk   r1   r$   r5   r   )rS   r  itempixel_valuesr  r	  r  patch_image_featuresmerged_image_featurescur_patch_idxr   	num_patchcur_featurepatch_slicer)   r)   r*   get_image_feature  sD   





z0StepVLForConditionalGeneration.get_image_feature	input_ids	mm_inputsc                 C   s   t  }|||S r   )r   pad_input_tokens)rS   r  r  patternr)   r)   r*   pad_input_ids
  s   z,StepVLForConditionalGeneration.pad_input_idsFrs   forward_batchget_embeddingc                 C   s    t ||| jtj| ji|d}|S )N)r  r  r   data_embedding_funcsrs   )r   r   r   IMAGEr  )rS   r  rs   r  r  hidden_statesr)   r)   r*   rw     s   
z&StepVLForConditionalGeneration.forwardweightsc           
      C   s  t |}g }g }|D ]?\}}d|v sd|v rB|dd}|dd}|dd}|d	d
}|dd}|dd}|||f q
|||f q
t|}t| jdd}| D ]\}}||vrjtd| d|| }t|dt}	|	|| qZ|r| j	
| dS dS )zBLoad weights for the model, separating vision and language weightsr   r   z.attn.in_proj_weightz.attn.qkv_proj.weightz.attn.in_proj_biasz.attn.qkv_proj.biasz.attn.out_proj.biasz.attn.proj.biasz.attn.out_proj.weightz.attn.proj.weightz	.mlp.c_fcz.mlp.fc1z.mlp.c_projz.mlp.fc2F)remove_duplicatezWeight z not found in params_dictweight_loaderN)listreplacer  dictnamed_parametersr  r   getattrr   r   load_weights)
rS   r#  vision_weightslanguage_weightsnameloaded_weightvision_state_dictparams_dictparamr%  r)   r)   r*   r+  !  s0   z+StepVLForConditionalGeneration.load_weightsr   )F)rx   ry   rz   r   r   r   r   rM   r$   r}   r   r   rj   r   r  r   r   r  r{   r   r  r   r|   rw   r   r   r+  r   r)   r)   rV   r*   r     s8    (	
$r   )r   r,   r-   )8__doc__	functoolsr   typingr   r   r   r   r   r$   einopsr   r	   r
   torch.nnr   r   transformers.activationsr   sglang.srt.configs.step3_vlr   "sglang.srt.layers.attention.visionr   sglang.srt.layers.linearr   r   *sglang.srt.layers.quantization.base_configr   sglang.srt.managers.mm_utilsr   r   "sglang.srt.managers.schedule_batchr   r   r   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr   sglang.srt.models.qwen3r   sglang.srt.utilsr   r   r   r+   rB   r   rC   r   r   r   r   r   r   
EntryClassr)   r)   r)   r*   <module>   s>   
O
!=, 
 