o
    
۾i:                     @   s  d dl mZ d dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZ eejdZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZ G dd dejZ!dS )    )IterableN)PretrainedConfig)	torch_int)
get_act_fn)MMEncoderAttention)Conv2dLayer)RMSNorm)ColumnParallelLinearRowParallelLinear)QuantizationConfig)default_weight_loader)rms_norm
layer_normc                       s2   e Zd Z fddZdejdejfddZ  ZS )InternS1VisionPatchEmbeddingsc                    s   t    |j|j}}|j|j}}|d |d  |d |d   }|d |d  |d |d  f}|| _|| _|| _|| _|| _t||||d| _	d S )N   r   )kernel_sizestride)
super__init__
image_size
patch_sizenum_channelshidden_sizenum_patchespatch_shaper   
projection)selfconfigr   r   r   r   r   r   	__class__ [/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/interns1_vit.pyr       s   
 z&InternS1VisionPatchEmbeddings.__init__pixel_valuesreturnc           	      C   sj   |j \}}}}|| jkrtd| || jjj}|j d |j d }}|ddd}|||ffS )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.      r   )	shaper   
ValueErrorr   toweightdtypeflatten	transpose)	r   r"   
batch_sizer   heightwidth
embeddingspatch_heightpatch_widthr    r    r!   forward3   s   
z%InternS1VisionPatchEmbeddings.forward)__name__
__module____qualname__r   torchTensorr3   __classcell__r    r    r   r!   r      s    r   c                       sd   e Zd Zdef fddZdejdededejfdd	Z	
ddejdej	d
B dejfddZ
  ZS )InternS1VisionEmbeddingsr   c                    s   t    || _ttdd|j| _|j	r$ttdd|j| _
nd | _
t|| _|j| _t|jtr9|jn|j|jf| _| jj}|jrWttd|d |j| _d S d | _d S )Nr   )r   r   r   nn	Parameterr7   zerosr   	cls_tokenuse_mask_token
mask_tokenr   patch_embeddingsr   
isinstancer   r   r    use_absolute_position_embeddingsposition_embeddings)r   r   r   r   r    r!   r   C   s$   





z!InternS1VisionEmbeddings.__init__r0   r.   r/   r#   c                 C   s   |j d d }| jj d d }tj s||kr||kr| jS | jddddf }| jddddf }|j d }|| jd  }	|| jd  }
t|d }|d|||}|dddd}t	j
j||	|
fdd	d
}|dddddd|}tj||fddS )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Nr   g      ?r%   r$   bicubicF)sizemodealign_cornersdim)r&   rD   r7   jit
is_tracingr   r   reshapepermuter;   
functionalinterpolateviewcat)r   r0   r.   r/   r   num_positionsclass_pos_embedpatch_pos_embedrK   
new_height	new_widthsqrt_num_positionsr    r    r!   interpolate_pos_encodingZ   s2   
z1InternS1VisionEmbeddings.interpolate_pos_encodingNr"   bool_masked_posc                 C   s   |j \}}}}| |\}\}}| \}	}
}|d ur5| j|	|
d}|d|}|d|  ||  }| j|	dd}tj	||fdd}| j
d urT|| ||| }|||ffS )NrE   r   rJ   )r&   rA   rG   r@   expand	unsqueezetype_asr>   r7   rS   rD   rZ   )r   r"   r[   _r.   r/   r0   r1   r2   r-   seq_lenmask_tokensw
cls_tokensr    r    r!   r3      s   
z InternS1VisionEmbeddings.forwardN)r4   r5   r6   r   r   r7   r8   intrZ   
BoolTensorr3   r9   r    r    r   r!   r:   B   s&    
4r:   c                	       sP   e Zd ZdZddddedededd	f fd
dZdej	dej	fddZ
  ZS )InternSdpaAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr    num_dummy_headsprefixr   rj   rk   r#   Nc                   s@  t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d|| j | j | _	| jd | _
tj| j| j| j |jd| _tj| j| j| j |jd| _tj| j| j| j |jd| _|j| _| jrt| j	|j| jd| _t| j	|j| jd| _t| j	| j| _t| j| j| j
| dd| _d S )	Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      )bias)epsvar_hidden_sizez.attnrk   )r   r   r   r   	embed_dimnum_attention_heads	num_headshead_dimr'   	dummy_dimscaler;   Linearattention_biasq_projk_projv_projuse_qk_normqk_normalizationr   layer_norm_epsq_normk_normprojection_layerr   attn)r   r   rj   rk   r   r    r!   r      sV   
zInternSdpaAttention.__init__xc                 C   sT   |  |}| |}| |}| jr| |}| |}| |||}| |}|S )zx shape: (B, N, C))rx   ry   rz   r|   r~   r   r   r   )r   r   qkvr    r    r!   r3      s   





zInternSdpaAttention.forward)r4   r5   r6   __doc__r   re   strr   r7   r8   r3   r9   r    r    r   r!   rg      s    :rg   c                	       sN   e Zd Z		ddededB deddf fddZd	ejdejfd
dZ	  Z
S )InternS1VisionMLPNrh   r   quant_configrk   r#   c                    s\   t    || _t|j| _t|j|jd|| dd| _	t
|j|jd|| dd| _d S )NTz.fc1)rl   r   rk   z.fc2)r   r   r   r   
hidden_actactivation_fnr	   r   intermediate_sizefc1r
   fc2)r   r   r   rk   r   r    r!   r      s"   
zInternS1VisionMLP.__init__hidden_statesc                 C   s*   |  |\}}| |}| |\}}|S rd   )r   r   r   )r   r   r_   r    r    r!   r3     s   
zInternS1VisionMLP.forward)Nrh   )r4   r5   r6   r   r   r   r   r7   r8   r3   r9   r    r    r   r!   r      s    r   c                       sv   e Zd Z	dddddededB deded	df
 fd
dZdddededB dedefddZde	j
fddZ  ZS )InternS1VisionLayerNr   rh   ri   r   r   rj   rk   r#   c                   s   t    | j|||| dd| _t||| dd| _t|j |j|j	d| _
t|j |j|j	d| _|j}tj|t|j dd| _tj|t|j dd| _d S )Nz
.attentionri   z.mlp)r   rk   rm   T)requires_grad)r   r   
_init_attn	attentionr   mlpNORM2FN	norm_typer   r}   layernorm_beforelayernorm_afterlayer_scale_init_valuer;   r<   r7   oneslambda_1lambda_2)r   r   r   rj   rk   init_valuesr   r    r!   r     s.   
zInternS1VisionLayer.__init__ro   c                C   s   t |||dS )Nri   )rg   )r   r   r   rj   rk   r    r    r!   r   :  s
   zInternS1VisionLayer._init_attnr   c                 C   s8   ||  | || j  }|| | || j  }|S rd   )r   r   r   r   r   r   )r   r   r    r    r!   r3   H  s   zInternS1VisionLayer.forwardrd   )r4   r5   r6   r   r   re   r   r   r   r7   r8   r3   r9   r    r    r   r!   r     s:    )
r   c                       sX   e Zd Z	ddddddededB dedB ded	ef
 fd
dZdej	fddZ
  ZS )InternS1VisionEncoderNr   rh   num_hidden_layers_overriderj   rk   r   r   r   rj   rk   c                   sL   t     | _|d u r j}n|}t fddt|D | _d S )Nc              	      s&   g | ]}t   d | dqS )z.layer.ri   )r   ).0	layer_idxr   rj   rk   r   r    r!   
<listcomp>m  s    z2InternS1VisionEncoder.__init__.<locals>.<listcomp>)r   r   r   num_hidden_layersr;   
ModuleListrangelayer)r   r   r   r   rj   rk   r   r   r   r!   r   Z  s   
	
zInternS1VisionEncoder.__init__inputs_embedsc                 C   s   |}| j D ]}||}q|S rd   )r   )r   r   r   encoder_layerr    r    r!   r3   x  s   

zInternS1VisionEncoder.forwardrd   )r4   r5   r6   r   r   re   r   r   r7   r8   r3   r9   r    r    r   r!   r   Y  s$    r   c                       s   e Zd Z	ddddddededB dedB ded	ed
df fddZdd Z		dde	j
dB de	j
dB d
e	jfddZdeeee	j
f  d
ee fddZ  ZS )InternS1VisionModelNr   rh   r   r   r   r   rj   rk   r#   c                   s^   t    || _t|| _t|||| dd| _|jr#t	 | _d S tj
|j|jd| _d S )Nz.encoder)r   r   rj   rk   r   )r   r   r   r:   r0   r   encoderuse_mean_poolingr;   Identity	LayerNormr   r}   	layernorm)r   r   r   r   rj   rk   r   r    r!   r     s   
	
zInternS1VisionModel.__init__c                 C   s   | j jS rd   )r0   rA   )r   r    r    r!   get_input_embeddings  s   z(InternS1VisionModel.get_input_embeddingsr"   pixel_embedsc                 C   sr   |d u r|d u rt d|d ur|}n|d ur,|jdkr$| |\}}nt d|j | j|d}| |}|S )Nz0You have to specify pixel_values or pixel_embeds   zwrong pixel_values size: )r   )r'   ndimr0   r&   r   r   )r   r"   r   r   r_   encoder_outputsr    r    r!   r3     s   

zInternS1VisionModel.forwardweightsc                 C   sL   t |  }t }|D ]\}}|| }t|dt}||| || q|S )Nweight_loader)dictnamed_parameterssetgetattrr   add)r   r   params_dictloaded_paramsnameloaded_weightparamr   r    r    r!   load_weights  s   
z InternS1VisionModel.load_weightsrd   )NN)r4   r5   r6   r   r   re   r   r   r   r7   r8   FloatTensorr3   r   tupler   r   r9   r    r    r   r!   r     s<    
,r   )"collections.abcr   r7   torch.nnr;   transformersr   transformers.utilsr   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.attentionr   vllm.model_executor.layers.convr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr	   r
   'vllm.model_executor.layers.quantizationr   -vllm.model_executor.model_loader.weight_utilsr   r   r   Moduler   r:   rg   r   r   r   r   r    r    r    r!   <module>   s,   	#cO"C'