o
    
۾iI                     @   s  d Z ddlmZ ddlmZ ddlZddlmZmZ ddlm	Z	 ddl
mZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z" ddl#m$Z$m%Z% ee	j&ddZ'dd Z(d+ddZ)G dd de	j*Z+G dd de	j*Z,G dd  d e	j*Z-G d!d" d"e	j*Z.G d#d$ d$e	j*Z/G d%d& d&e	j*Z0G d'd( d(e	j*Z1G d)d* d*eZ2dS ),zIThis is basically a copy from perception_models/core/vision_encoder/pe.py    )Callable)partialN)	rearrangerepeat)nn)
functional)
VllmConfig)$get_tensor_model_parallel_world_size)
get_act_fn)MMEncoderAttention)Conv2dLayer)ColumnParallelLinearQKVParallelLinearRowParallelLinear)QuantizationConfig   )Step3VLForConditionalGeneration)WeightsMapperinit_vllm_registered_modelmaybe_prefix)is_vit_use_data_parallelrun_dp_sharded_vision_modelh㈵>)epsc                 C   s<   t | ddd} | jdd\}}tj| |fdd} t | dS )Nz... (d r) -> ... d r   rdimz... d r -> ... (d r))r   unbindtorchstack)xx1x2 r&   V/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/step_vl.pyrotate_half    s   
r(         ?c                 C   s   |j }|jdkr|j| }| | d  } | jd }|| }||jd ks/J d|jd ||dd |f |d||f |d|d f }	}}
||   | t||   |  }tj|	||
fdd}|	|S )N   r   zPfeature dimension {} is not of sufficient size to rotate in all the positions {}.r   )
dtypendimshapeformatcosr(   sinr!   cattype)freqststart_indexscaleseq_dimr,   seq_lenrot_dim	end_indext_leftt_rightoutr&   r&   r'   apply_rotary_emb'   s"   



$
r?   c                	       s   e Zd Z					ddededed	ef fd
dZdeeB dedejfddZ	dejdejfddZ
dejfddZdejdejdeeef fddZ  ZS )PerceptionEncoderRope2DF'  
   r   r)   r   max_grid_heightmax_grid_widthuse_cls_tokenc	           
         s`   t    || _|| _|| _|| _||||d    | _|| _|| _| 	 }	| j
d|	dd d S )Nr   freqs_cacheF)
persistent)super__init__r   rC   rD   rE   thetamax_freq	num_freqs_compute_2d_freqsregister_buffer)
selfr   rC   rD   rE   rJ   rK   rL   theta_rescale_factorcache	__class__r&   r'   rI   B   s   
z PerceptionEncoderRope2D.__init__basereturnc                 C   s.   d|t d|dd |d   |   }|S )Nr)   r   r   )r!   arangefloat)rO   rT   r   r4   r&   r&   r'   _compute_inv_freqX   s   *z)PerceptionEncoderRope2D._compute_inv_freqr5   inv_freqc                 C   s(   t d||j|}t|ddd}|S )Nz..., f -> ... fz... n -> ... (n r)r   r   )r!   einsumr3   r,   r   )rO   r5   rY   r4   r&   r&   r'   _compute_freqs\   s   z&PerceptionEncoderRope2D._compute_freqsc                 C   s   t j| jt jd}t j| jt jd}| jr|d7 }|d7 }| | j| jd }| 	||d d d f 
| j| jd}| 	||d d d f 
| j| jd}t j||gdd| j| j d}| jrqt jt d|jd |gdd}|d }|S )N)r,   r   r   r   r   r   )NN.)r!   rV   rC   rW   rD   rE   rX   rJ   r   r[   expandr2   reshapezerosr.   )rO   grid_h_rangegrid_w_rangerY   freqs_hfreqs_wr4   r&   r&   r'   rM   a   s&   

 z)PerceptionEncoderRope2D._compute_2d_freqsqkgrid_hwc                 C   s   |d | j ks|d | jkrZtj|d |jddd}tj|d |jddd}|| j | dtj}| j	rRtj
tjd|jd|d gdd}|tj}| jd|}n| j}t||}t||}||fS )Nr   r   )devicer   r   r   )rC   rD   r!   rV   rf   viewr]   tolongrE   r2   r^   rF   index_selectr?   )rO   rc   rd   re   rowscols	positionsr4   r&   r&   r'   forwardv   s   

zPerceptionEncoderRope2D.forward)FrA   rB   r   r)   )__name__
__module____qualname__intboolrI   rW   r!   TensorrX   r[   rM   tuplern   __classcell__r&   r&   rR   r'   r@   A   s&    *r@   c                       s&   e Zd Zd fdd	Zdd Z  ZS )PerceptionEncoderLayerScaler   Fc                    s*   t    || _t|t| | _d S N)rH   rI   inplacer   	Parameterr!   onesgamma)rO   r   init_valuesry   rR   r&   r'   rI      s   
z$PerceptionEncoderLayerScale.__init__c                 C   s   | j r	|| jS || j S rx   )ry   mul_r|   )rO   r#   r&   r&   r'   rn      s   z#PerceptionEncoderLayerScale.forward)r   F)ro   rp   rq   rI   rn   rv   r&   r&   rR   r'   rw      s    rw   c                       s\   e Zd Z		ddededeg ejf dedB def
 fdd	Z	d
e
jde
jfddZ  ZS )PerceptionEncoderMLPN 	input_dim
hidden_dim	act_layerquant_configprefixc                    sR   t    t }t||d|| d|d| _|| _t||d|| d|d| _d S )NTz.fc1biasr   r   
disable_tpz.fc2)rH   rI   r   r   fc1
activationr   fc2)rO   r   r   r   r   r   use_data_parallelrR   r&   r'   rI      s&   
zPerceptionEncoderMLP.__init__r#   rU   c                 C   s*   |  |\}}| |}| |\}}|S rx   )r   r   r   )rO   r#   _r&   r&   r'   rn      s   
zPerceptionEncoderMLP.forward)Nr   )ro   rp   rq   rr   r   r   Moduler   strrI   r!   rt   rn   rv   r&   r&   rR   r'   r      s    r   c                       sh   e Zd Z			ddededededed	edB d
ef fddZdej	de
eef dej	fddZ  ZS ) PerceptionEncoderVisionAttentionFNr   	embed_dim	num_headsrC   rD   rE   r   r   c           
   	      s   t    || _|| _|| | _| jd | _t }|rdnt }	| j|	 dks+J d| j|	 | _t	|| j| jd|| d|d| _
t||d|| d|d| _t| j| j| j| d	d
| _t| j|||d| _d S )N      r   r   z(embed_dim must be divisible by num_headsTz	.qkv_projr   z	.out_proj.attnr   )r   rC   rD   rE   )rH   rI   r   total_num_headshead_dimr7   r   r	   r   r   qkv_projr   out_projr   attnr@   rope)
rO   r   r   rC   rD   rE   r   r   r   tp_sizerR   r&   r'   rI      sP   


	z)PerceptionEncoderVisionAttention.__init__r#   re   rU   c                 C   s   |j \}}}| |\}}|jddd\}}}	|||| j| jdddd}|||| j| jdddd}| j|||d\}}|dddd||| j| j }|dddd||| j| j }| 	|||	}
| 
|
\}
}|
S )Nr+   r   )chunksr   r   r   r   re   )r.   r   chunkrg   r   r   permuter   r]   r   r   )rO   r#   re   bszr9   r   qkvrc   rd   vattn_outputr&   r&   r'   rn      s     ""z(PerceptionEncoderVisionAttention.forward)FNr   )ro   rp   rq   rr   rs   r   r   rI   r!   rt   ru   rn   rv   r&   r&   rR   r'   r      s(    *5r   c                       s~   e Zd Zddejejdddfdedededed	ed
ededede	de
dB def fddZdejdeeef fddZ  ZS )PerceptionEncoderVisionBlock      @NFr   d_modeln_headrC   rD   	mlp_ratiols_init_valuer   
norm_layerrE   r   r   c              	      s   t    t|||||	|
| dd| _|d urt||nt | _|d ur+t||nt | _||| _	||| _
t|| }t||||
| dd| _d S )Nr   )rC   rD   rE   r   r   z.mlpr   r   )rH   rI   r   r   rw   r   Identityls_1ls_2ln_1ln_2rr   r   mlp)rO   r   r   rC   rD   r   r   r   r   rE   r   r   r   rR   r&   r'   rI      s8   




z%PerceptionEncoderVisionBlock.__init__r#   re   c                 C   s<   ||  | j| ||d }|| | | | }|S Nr   )r   r   r   r   r   r   )rO   r#   re   r&   r&   r'   rn   *  s   z$PerceptionEncoderVisionBlock.forwardro   rp   rq   r   GELU	LayerNormrr   rW   r   rs   r   r   rI   r!   rt   ru   rn   rv   r&   r&   rR   r'   r      s@    	
$-r   c                       s   e Zd Zddejejdddfdedededed	ed
edededede	de
dB def fddZdejdeeef fddZ  ZS )"PerceptionEncoderVisionTransformerr   NFr   widthlayersheadsrC   rD   r   r   r   r   rE   r   r   c                    sL   t    
| _|| _t 	
fddt|D | _d S )Nc                    s4   g | ]}t 
 	 d | dqS )z.resblocks.)r   r   rC   rD   r   r   r   r   rE   r   r   )r   ).0ir   r   r   rC   rD   r   r   r   r   rE   r   r&   r'   
<listcomp>D  s     z?PerceptionEncoderVisionTransformer.__init__.<locals>.<listcomp>)rH   rI   r   r   r   
ModuleListrange	resblocks)rO   r   r   r   rC   rD   r   r   r   r   rE   r   r   rR   r   r'   rI   1  s   

z+PerceptionEncoderVisionTransformer.__init__r#   re   c                 C   s   | j D ]}|||d}q|S r   )r   )rO   r#   re   blockr&   r&   r'   rn   V  s   
z*PerceptionEncoderVisionTransformer.forwardr   r&   r&   rR   r'   r   0  sD    	
$%r   c                	       sl   e Zd ZeddfdedededB def fddZd	ed
efddZ	de
jfddZde
jfddZ  ZS )PerceptionEncoderNr   r   r   r   r   c                    s  t    |j| _|jp|j| _|j| _|j| _|j| _|j| _|j| _|j	| _	| j	s/t
d|j| _td|j|j|jdd| _|jrH||jnt | _|jrU|| jnt | _t|j|j|j| j| j | j| j |j|j||| j|| dd| _t|j|jd dddd	| _t|jd |jd
 dddd	| _| jrt| jd t| j | _| jr| j| j | _t| jd tt| j| jd  | j | _ d S d S )Nzuse_rope2d must be Truer+   F)in_channelsout_channelskernel_sizestrider   z.transformer)	rC   rD   r   r   r   r   rE   r   r   r   r   )r   r   padding   r   )!rH   rI   
patch_size
output_dimr   r   r   use_abs_posembrE   
use_rope2d
ValueError
image_sizer   conv1
use_ln_prer   r   ln_preuse_ln_postln_postr   r   r   transformervit_downsampler1vit_downsampler2rz   r!   randnclass_embeddingposemb_grid_sizerr   positional_embedding)rO   configr   r   r   r   rR   r&   r'   rI   ]  sp   



zPerceptionEncoder.__init__grid_hgrid_wc                 C   s   | j |kr| j |kr| jd S | j}| jr"|d d |dd  }}|d| j | j ddddd }tj|||fddd	}|ddddd| j}| jrXt	j
||gdd
}|d S )N)N.r   r   r   r+   r   bilinearF)sizemodealign_cornersr   )r   r   rE   r]   r   
contiguousFinterpolater   r!   r2   )rO   r   r   	pos_embedcls_token_embedr&   r&   r'   sample_abs_posemb  s    
z#PerceptionEncoder.sample_abs_posembr#   c                 C   s   |j \}}}}|| j || j }}| |}|dddd|d| j}| jr<tj| j	
ddd|dd|gdd}| jrG|| || }| |}| j|||fd}| |}| jrj|d d dd d d f }|S )Nr   r   r+   r   r   r   r   )r.   r   r   r   r]   r   rE   r!   r2   r   rg   r\   r   r   r   r   r   )rO   r#   batchr   hwr   r   r&   r&   r'   forward_features  s    


z"PerceptionEncoder.forward_featuresc                 C   s~   |  |}|j\}}}t|d }|dd }|||||}| |}| |}|j\}}}}||d|| ddS )Ng      ?r   r   r   )r   r.   rr   	transposer   rg   r   r   )rO   r#   BPCTr&   r&   r'   rn     s   


zPerceptionEncoder.forward)ro   rp   rq   _DEFAULT_NORM_LAYERr   r   r   rI   rr   r   r!   rt   r   rn   rv   r&   r&   rR   r'   r   \  s     Gr   c                       s   e Zd Zeddddddddd	Zd
ddededdf fddZdej	dB dej	dB fddZ
dej	dej	fddZ  ZS )StepVLForConditionalGenerationzlanguage_model.model.zlanguage_model.lm_head.)zmodel.zlm_head.z.attn.qkv_proj.weightz.attn.qkv_proj.biasz.mlp.fc1z.mlp.fc2)z.attn.in_proj_weightz.attn.in_proj_biasz	.mlp.c_fcz.mlp.c_proj)orig_to_new_prefixorig_to_new_substrr   r   vllm_configr   rU   Nc             
      s  t t|   |jj}|jj}|j}|| _|| _|jdk| _	| 
|d0 t|jt|jj|t|dd| _t|jjd |jj|jd|t|d| j	d| _W d    n1 sWw   Y  | | t||jt|d	d
| _W d    n1 sxw   Y  | jj| _d S )Ndataimagevision_modelr   r   Tvit_large_projector)r   gather_outputr   r   r   language_model)r   	hf_configr   )rH   r   rI   model_configr   multimodal_configr   r   mm_encoder_tp_moder   _mark_tower_modelr   vision_configr
   
hidden_actr   r   r   r   text_confighidden_sizeprojector_biasr   _mark_language_modelr   r   make_empty_intermediate_tensors)rO   r   r   r   r   r   rR   r&   r'   rI     sB   



z'StepVLForConditionalGeneration.__init__input_tensorc                 C   s(   |d u rd S | j rt|| jS | |S rx   )r   r   r   )rO   r	  r&   r&   r'   _get_vision_model_output  s
   
z7StepVLForConditionalGeneration._get_vision_model_outputimage_featuresc                 C   s   |  |\}}|S rx   )r   )rO   r  r   r&   r&   r'   _process_image_features  s   z6StepVLForConditionalGeneration._process_image_features)ro   rp   rq   r   hf_to_vllm_mapperr   r   rI   r!   rt   r
  r  rv   r&   r&   rR   r'   r     s$     '
	r   )r   r)   r*   )3__doc__collections.abcr   	functoolsr   r!   einopsr   r   r   torch.nnr   r   vllm.configr   vllm.distributedr	   %vllm.model_executor.layers.activationr
   9vllm.model_executor.layers.attention.mm_encoder_attentionr   vllm.model_executor.layers.convr   !vllm.model_executor.layers.linearr   r   r   'vllm.model_executor.layers.quantizationr   step3_vlr   utilsr   r   r   visionr   r   r   r   r(   r?   r   r@   rw   r   r   r   r   r   r   r&   r&   r&   r'   <module>   s:   
G
$F4, 