"""Minimal implementation of BlipVisionModel intended to be only used
within a vision language model."""

from collections.abc import Iterable

import torch
import torch.nn as nn
from transformers import Blip2VisionConfig, BlipVisionConfig

from vllm.distributed import divide, get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.attention import MMEncoderAttention
from vllm.model_executor.layers.conv import Conv2dLayer
from vllm.model_executor.layers.linear import (
    ColumnParallelLinear,
    QKVParallelLinear,
    RowParallelLinear,
)
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.model_loader.weight_utils import default_weight_loader

from .interfaces import SupportsQuant

def get_blip_patch_grid_length(*, image_size: int, patch_size: int) -> int:
    assert image_size % patch_size == 0
    return image_size // patch_size


def get_blip_num_patches(*, image_size: int, patch_size: int) -> int:
    grid_length = get_blip_patch_grid_length(image_size=image_size,
                                             patch_size=patch_size)
    return grid_length * grid_length

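# Worked example (illustrative comment, not in the original source): with the
# BlipVisionConfig defaults of image_size=384 and patch_size=16, the patch
# grid is 384 // 16 = 24 per side, so get_blip_num_patches(image_size=384,
# patch_size=16) == 576, and the embedding sequence produced below is
# 577 tokens long once the class token is prepended.
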
dd| j| _td| j| j| jd| _t| j| jd| _| jd | _tt	
d| j| j| _d S )Nr      )in_channelsout_channelskernel_sizestrider   )super__init__r   hidden_size	embed_dimr   r   nn	Parametertorchrandnclass_embeddingr	   patch_embeddingr   num_patchesnum_positionsposition_embedding)selfr   	__class__r   r   r"   )   s&   

zBlipVisionEmbeddings.__init__pixel_valuesr   c                 C   s   |j d }| jjj}| |j|d}|ddd}| j|dd}t	j
||gdd}| j|}||d d d |dd d f  }|S )Nr   )dtype   r   dim)shaper*   weightr2   toflatten	transposer)   expandr'   catr-   size)r.   r1   
batch_sizetarget_dtypepatch_embedsclass_embeds
embeddingsposition_embedsr   r   r   forwardC   s   


$zBlipVisionEmbeddings.forward)
class BlipAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        config: BlipVisionConfig | Blip2VisionConfig,
        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
    ) -> None:
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                "embed_dim must be divisible by num_heads "
                f"(got `embed_dim`: {self.embed_dim} and `num_heads`: "
                f"{self.num_heads}).")
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        self.qkv = QKVParallelLinear(
            self.embed_dim,
            self.head_dim,
            self.num_heads,
            bias=config.qkv_bias,
            quant_config=quant_config,
            prefix=f"{prefix}.qkv",
        )
        self.projection = RowParallelLinear(
            self.embed_dim,
            self.embed_dim,
            quant_config=quant_config,
            prefix=f"{prefix}.projection",
        )

        self.tp_size = get_tensor_model_parallel_world_size()
        self.num_heads_per_partition = divide(self.num_heads, self.tp_size)

        self.attn = MMEncoderAttention(
            self.num_heads_per_partition,
            self.head_dim,
            self.scale,
            prefix=f"{prefix}.attn",
        )

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return (tensor.view(bsz, seq_len, self.num_heads,
                            self.head_dim).transpose(1, 2).contiguous())

    def forward(self, hidden_states: torch.Tensor):
        """Input shape: Batch x Time x Channel"""
        qkv_states, _ = self.qkv(hidden_states)
        query_states, key_states, value_states = qkv_states.chunk(3, dim=-1)

        out = self.attn(query_states, key_states, value_states)
        attn_output, _ = self.projection(out)

        return attn_output, None

dZ	  Z
S )BlipMLPNrL   r   rM   rN   r   c                    s\   t    || _t|j| _t|j|jd|| dd| _	t
|j|jd|| dd| _d S )NTz.fc1rO   z.fc2)r!   r"   r   r   
hidden_actactivation_fnr
   r#   intermediate_sizefc1r   fc2r_   r/   r   r   r"      s"   
zBlipMLP.__init__rf   c                 C   s*   |  |\}}| |}| |\}}|S N)rx   rv   ry   )r.   rf   ri   r   r   r   rE      s   
zBlipMLP.forwardro   rF   rG   rH   r   r   rq   r"   r'   rI   rE   rJ   r   r   r/   r   rt      s    rt   c                	       rs   )BlipEncoderLayerNrL   r   rM   rN   r   c                    sb   t    t||| dd| _tj|j|jd| _t	||| dd| _
tj|j|jd| _d S )Nz
.self_attnrQ   epsz.mlp)r!   r"   rK   	self_attnr%   	LayerNormr#   layer_norm_epslayer_norm1rt   mlplayer_norm2r_   r/   r   r   r"      s   
zBlipEncoderLayer.__init__rf   c                 C   sJ   |}|  |}| j|d\}}|| }|}| |}| |}|| }|S )N)rf   )r   r   r   r   )r.   rf   residualri   r   r   r   rE      s   


zBlipEncoderLayer.forwardro   r{   r   r   r/   r   r|      s    r|   c                       sV   e Zd ZdZ			ddededB dedB deddf
 fd	d
Zde	j
fddZ  ZS )BlipEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self
    attention layers. Each layer is a [`BlipEncoderLayer`].

    Args:
        config: BlipConfig
    NrL   r   rM   num_hidden_layers_overriderN   r   c                    sJ   t     | _|d u r j}n|}t fddt|D | _d S )Nc                    s$   g | ]}t   d | dqS )z.layers.)r   rM   rN   )r|   ).0	layer_idxr   rN   rM   r   r   
<listcomp>   s    z(BlipEncoder.__init__.<locals>.<listcomp>)r!   r"   r   num_hidden_layersr%   
ModuleListrangelayers)r.   r   rM   r   rN   r   r/   r   r   r"      s   

zBlipEncoder.__init__inputs_embedsc                 C   s   |}| j D ]}||}q|S rz   )r   )r.   r   rf   encoder_layerr   r   r   rE     s   

zBlipEncoder.forward)NNrL   )rF   rG   rH   rp   r   r   rr   rq   r"   r'   rI   rE   rJ   r   r   r/   r   r      s"    r   c                       s   e Zd ZeZdZdg diZ	ddddddededB d	edB d
class BlipVisionModel(nn.Module, SupportsQuant):
    config_class = BlipVisionConfig
    main_input_name = "pixel_values"
    packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}

    def __init__(
        self,
        config: BlipVisionConfig,
        quant_config: QuantizationConfig | None = None,
        *,
        num_hidden_layers_override: int | None = None,
        require_post_norm: bool | None = None,
        prefix: str = "",
    ) -> None:
        super().__init__()

        self.config = config

        self.embeddings = BlipVisionEmbeddings(config)
        self.encoder = BlipEncoder(
            config=config,
            quant_config=quant_config,
            num_hidden_layers_override=num_hidden_layers_override,
            prefix=f"{prefix}.encoder",
        )

        num_hidden_layers = config.num_hidden_layers
        if len(self.encoder.layers) > num_hidden_layers:
            raise ValueError(
                f"The original encoder only has {num_hidden_layers} "
                f"layers, but you requested {len(self.encoder.layers)} "
                "layers.")

        if require_post_norm is None:
            require_post_norm = len(self.encoder.layers) == num_hidden_layers

        if require_post_norm:
            self.post_layernorm = nn.LayerNorm(config.hidden_size,
                                               eps=config.layer_norm_eps)
        else:
            self.post_layernorm = None

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        hidden_states = self.embeddings(pixel_values)
        hidden_states = self.encoder(inputs_embeds=hidden_states)

        if self.post_layernorm is None:
            return hidden_states

        return self.post_layernorm(hidden_states)

    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            ("qkv_proj", "q_proj", "q"),
            ("qkv_proj", "k_proj", "k"),
            ("qkv_proj", "v_proj", "v"),
        ]
        params_dict = dict(self.named_parameters())
        loaded_params: set[str] = set()
        layer_count = len(self.encoder.layers)

        for name, loaded_weight in weights:
            # post_layernorm is not needed when it was not built above
            if (name.startswith("post_layernorm")
                    and self.post_layernorm is None):
                continue

            # omit layers when num_hidden_layers_override is set
            if name.startswith("encoder.layers"):
                layer_idx = int(name.split(".")[2])
                if layer_idx >= layer_count:
                    continue

            for (param_name, weight_name, shard_id) in stacked_params_mapping:
                if weight_name not in name:
                    continue
                name = name.replace(weight_name, param_name)

                param = params_dict[name]
                weight_loader = param.weight_loader
                weight_loader(param, loaded_weight, shard_id)
                break
            else:
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader",
                                        default_weight_loader)
                weight_loader(param, loaded_weight)
            loaded_params.add(name)
        return loaded_params
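
# Usage sketch (hypothetical, for illustration only): this module is meant to
# be embedded inside a vLLM vision-language model, and constructing the
# parallel linear layers requires vLLM's distributed state to be initialized,
# so the snippet below is indicative rather than runnable standalone.
#
#     from transformers import BlipVisionConfig
#
#     config = BlipVisionConfig()  # 384x384 images, 16x16 patches by default
#     model = BlipVisionModel(config, prefix="vision_model")
#     pixel_values = torch.randn(2, 3, config.image_size, config.image_size)
#     features = model(pixel_values)
#     # features.shape == (2, 1 + (384 // 16) ** 2, config.hidden_size)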