o
    
۾i:                     @   s`  d dl mZ d dlmZ d dlZd dlmZ d dlm  mZ	 d dl
mZ d dlmZmZmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZ d dlmZ d dl m!Z! ddl"m#Z#m$Z$ eej%dZ&G dd dej'Z(G dd dej'Z)G dd dej'Z*G dd dej'Z+G dd dej'Z,G dd dej'Z-G dd dej'Z.dS )    )Iterable)partialN)PretrainedConfig)divideget_tensor_model_parallel_rank$get_tensor_model_parallel_world_sizesplit_tensor_along_last_dim tensor_model_parallel_all_gather)
get_act_fn)MMEncoderAttention)Conv2dLayer)RMSNorm)ColumnParallelLinearQKVParallelLinearRowParallelLinear)QuantizationConfig)default_weight_loader   )is_vit_use_data_parallelrun_dp_sharded_vision_model)rms_norm
layer_normc                       sh   e Zd Zdef fddZdejdedefddZdeded	ejfd
dZ	dej
d	ejfddZ  ZS )InternVisionEmbeddingsconfigc                    s   t    || _|j| _|j| _|j| _tt	
dd| j| _td| j| j| jd| _| j| j d | _| jd | _tt	
d| j| j| _d S )Nr      )in_channelsout_channelskernel_sizestride   )super__init__r   hidden_size	embed_dim
image_size
patch_sizenn	Parametertorchrandnclass_embeddingr   patch_embeddingnum_patchesnum_positionsposition_embeddingselfr   	__class__ Y/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/intern_vit.pyr!   .   s"   

zInternVisionEmbeddings.__init__	pos_embedHWc                 C   sn   |j }| d| j| j | j| j ddddd}tj|||fddd}|dd|| ddd|S )	Nr   r   r   r   bicubicF)sizemodealign_corners)	dtypefloatreshaper$   r%   permuteFinterpolateto)r0   r5   r6   r7   target_dtyper3   r3   r4   _get_pos_embedE   s   


"z%InternVisionEmbeddings._get_pos_embedreturnc              	   C   s`   | j }| j|| kr|S tj|d d d dd d f | |d d dd d d f ||gddS )Nr   dim)r.   r,   r(   catrE   )r0   r6   r7   r.   r3   r3   r4   _get_position_embeddingV   s   "z.InternVisionEmbeddings._get_position_embeddingpixel_valuesc                 C   s   | j jj}|  ||}|j\}}}}|ddd}| j|dd|}t	j
||gdd}	| ||}
|	|
| }	|	S )Nr   r   r8   rG   )r+   weightr=   rC   shapeflatten	transposer*   expandr(   rI   rJ   )r0   rK   rD   patch_embeds
batch_size_heightwidthclass_embeds
embeddingsr.   r3   r3   r4   forwardc   s   
zInternVisionEmbeddings.forward)__name__
__module____qualname__r   r!   r(   TensorintrE   rJ   FloatTensorrX   __classcell__r3   r3   r1   r4   r   -   s
    r   c                       sT   e Zd Zdef fddZdd Z		ddejdB dejdB d	ejfd
dZ	  Z
S )InternVisionPatchModelr   c                    s   t    || _t|| _d S N)r    r!   r   r   rW   r/   r1   r3   r4   r!   r   s   
zInternVisionPatchModel.__init__c                 C      | j S ra   rW   r0   r3   r3   r4   get_input_embeddingsw      z+InternVisionPatchModel.get_input_embeddingsNrK   pixel_embedsrF   c                 C   s\   |d u r|d u rt d|d ur|}|S |d ur,|jdkr$| |}|S t d|j |S )N0You have to specify pixel_values or pixel_embeds   wrong pixel_values size: )
ValueErrorndimrW   rM   )r0   rK   rg   hidden_statesr3   r3   r4   rX   z   s   

zInternVisionPatchModel.forwardNN)rY   rZ   r[   r   r!   re   r(   r\   r^   rX   r_   r3   r3   r1   r4   r`   q   s    r`   c                       sr   e Zd ZdZ	dddddededB ded	ed
df
 fddZde	j
de	j
fddZde	j
d
e	j
fddZ  ZS )InternParallelAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNr    num_dummy_headsprefixr   quant_configrr   rs   rF   c             	      sx  t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j dt	 }|r4dnt
 }|pA| j| | dk}|rFdn|| _|rMdnt | _|| j | j | _t|| j | j| _| jd | _t| j| j|| j |j|| d|d| _|j| _| jrt| j|j| jd	| _t| j|j| jd	| _t| j| j|| d
|d| _t| j| j| j| dd| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r   r   g      z.qkvbiasrt   rs   
disable_tp)epsvar_hidden_sizez.proj)rt   rs   rw   .attnrs   )r    r!   r   r"   r#   num_attention_heads	num_headshead_dimrk   r   r   tp_sizer   tp_rank	dummy_dimr   num_heads_per_partitionscaler   qkv_biasqkvqk_normalizationr   layer_norm_epsq_normk_normr   projr   attn)r0   r   rt   rr   rs   use_data_parallelr   r1   r3   r4   r!      st   

z InternParallelAttention.__init__qkc                 C   sr   | j dkrt| }t| }| |}| |}| j dkr5tt| j d}||| j }||| j }||fS )Nr   )num_partitions)r   r	   
contiguousr   r   r   r   r   )r0   r   r   splitterr3   r3   r4   _apply_qk_norm   s   



z&InternParallelAttention._apply_qk_normxc           
      C   sd   |j \}}}| |\}}|jddd\}}}| jr"| ||\}}| |||}	| |	\}	}|	S )Nr   r8   rG   )rM   r   chunkr   r   r   r   )
r0   r   BNrS   r   r   r   voutr3   r3   r4   rX      s   zInternParallelAttention.forwardra   )rY   rZ   r[   __doc__r   r   r]   strr!   r(   r\   r   rX   r_   r3   r3   r1   r4   ro      s&    Mro   c                	       sN   e Zd Z		ddededB deddf fddZd	ejdejfd
dZ	  Z
S )	InternMLPNrp   r   rt   rs   rF   c                    sf   t    || _t|j| _t }t|j|j	d|| d|d| _
t|j	|jd|| d|d| _d S )NTz.fc1ru   z.fc2)r    r!   r   r
   
hidden_actactivation_fnr   r   r"   intermediate_sizefc1r   fc2)r0   r   rt   rs   r   r1   r3   r4   r!      s(   
zInternMLP.__init__rm   c                 C   s*   |  |\}}| |}| |\}}|S ra   )r   r   r   )r0   rm   rS   r3   r3   r4   rX     s   
zInternMLP.forward)Nrp   )rY   rZ   r[   r   r   r   r!   r(   r\   rX   r_   r3   r3   r1   r4   r      s    r   c                       s   e Zd Z	dddeddededB deded	ee d
df fddZ	dddededB dedefddZ
dejfddZ  ZS )InternVisionEncoderLayerNr   rp   )rr   rs   attn_clsr   rt   rr   rs   r   rF   c                   s   t    |j| _|j| _|j| _|| _| j|||| dd| _t	||| dd| _
t| j | j|jd| _t| j | j|jd| _t|jt| j | _t|jt| j | _d S )Nrz   rq   z.mlp)rt   rs   )rx   )r    r!   r"   r#   r   	norm_typer   
_init_attnr   r   mlpNORM2FNr   norm1norm2r&   r'   initializer_factorr(   onesls1ls2)r0   r   rt   rr   rs   r   r1   r3   r4   r!     s(   
	z!InternVisionEncoderLayer.__init__r{   c                C   s   | j ||||dS )N)rt   rr   rs   )r   )r0   r   rt   rr   rs   r3   r3   r4   r   =  s   z#InternVisionEncoderLayer._init_attnrm   c                 C   s8   ||  | || j  }|| | || j  }|S ra   )r   r   r   r   r   r   )r0   rm   r3   r3   r4   rX   L  s   z InternVisionEncoderLayer.forwardra   )rY   rZ   r[   ro   r   r   r]   r   typer!   r   r(   r\   rX   r_   r3   r3   r1   r4   r     s@    '
r   c                       sb   e Zd Z	ddddeddededB dedB ded	ed
ee f fddZ	de
jfddZ  ZS )InternVisionEncoderNr   rp   )num_hidden_layers_overriderr   rs   	layer_clsr   rt   r   rr   rs   r   c                   sT   t     _|_|d u r j}n|}t fddt|D _d S )Nc              	      s(   g | ]}j   d | dqS )z.layers.rq   )r   ).0	layer_idxr   rr   rs   rt   r0   r3   r4   
<listcomp>m  s    z0InternVisionEncoder.__init__.<locals>.<listcomp>)	r    r!   r   r   num_hidden_layersr&   
ModuleListrangelayers)r0   r   rt   r   rr   rs   r   r   r1   r   r4   r!   X  s   


zInternVisionEncoder.__init__inputs_embedsc                 C   s   |}| j D ]}||}q|S ra   )r   )r0   r   rm   encoder_layerr3   r3   r4   rX   x  s   

zInternVisionEncoder.forwardra   )rY   rZ   r[   r   r   r   r]   r   r   r!   r(   r\   rX   r_   r3   r3   r1   r4   r   W  s*     r   c                       s   e Zd ZddgiZ	ddddddededB dedB d	ed
eddf fddZdd Z			dde
jdB de
jdB de
jfddZdeeee
jf  dee fddZ  ZS )InternVisionModelr   Nr   rp   )r   rr   rs   r   rt   r   rr   rs   rF   c                   s@   t    || _t | _t|| _t||||| dd| _d S )Nz.encoder)r   rt   r   rr   rs   )	r    r!   r   r   r   r   rW   r   encoder)r0   r   rt   r   rr   rs   r1   r3   r4   r!     s   
	
zInternVisionModel.__init__c                 C   rb   ra   rc   rd   r3   r3   r4   re     rf   z&InternVisionModel.get_input_embeddingsrK   rg   c                 C   sz   |d u r|d u rt d|d ur|}n|d ur*|jdkr"| |}nt d|j | jr5t|| j}|S | j|d}|S )Nrh   ri   rj   )r   )rk   rl   rW   rM   r   r   r   )r0   rK   rg   rm   encoder_outputsr3   r3   r4   rX     s   
zInternVisionModel.forwardweightsc                 C   sL   t |  }t }|D ]\}}|| }t|dt}||| || q|S )Nweight_loader)dictnamed_parameterssetgetattrr   add)r0   r   params_dictloaded_paramsnameloaded_weightparamr   r3   r3   r4   load_weights  s   
zInternVisionModel.load_weightsra   rn   )rY   rZ   r[   packed_modules_mappingr   r   r]   r   r!   re   r(   r\   r^   rX   r   tupler   r   r_   r3   r3   r1   r4   r     s@    
,r   )/collections.abcr   	functoolsr   r(   torch.nnr&   torch.nn.functional
functionalrA   transformersr   vllm.distributedr   r   r   r   r	   %vllm.model_executor.layers.activationr
   $vllm.model_executor.layers.attentionr   vllm.model_executor.layers.convr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   'vllm.model_executor.layers.quantizationr   -vllm.model_executor.model_loader.weight_utilsr   visionr   r   	LayerNormr   Moduler   r`   ro   r   r   r   r   r3   r3   r3   r4   <module>   s2   	Di%<)