o
    
۾i`                  	   @   sd  U d dl mZmZmZ d dlmZmZmZ d dlZd dl	m
Z
 d dlmZmZmZmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZ d dlm Z  d dl!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' d dl(m)Z) d dl*m+Z+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1m2Z2m3Z3m4Z4m5Z5 ddl6m7Z7 ddl8m9Z9m:Z:m;Z; G dd de+Z<G dd de+Z=e<e=B Z>ee?d< G dd de
j@ZAG dd de
j@ZBG dd de
j@ZCG dd  d e
j@ZDG d!d" d"e
j@ZEG d#d$ d$e
j@ZFG d%d& d&e
j@ZGG d'd( d(e
j@ZHG d)d* d*e$ZIG d+d, d,e"eI ZJG d-d. d.e#eI ZKejLeKeIeJd/G d0d1 d1e
j@e2e3e4e5ZMdS )2    )IterableMappingSequence)	AnnotatedLiteral	TypeAliasN)BatchFeatureBlip2ConfigBlip2QFormerConfigapply_chunking_to_forward)CacheConfig
VllmConfig)BaseDummyOptions)
get_act_fn)QuantizationConfig)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)MultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptIndexTargetsPromptInsertionPromptUpdate)IntermediateTensors)TensorSchemaTensorShape   )BlipVisionModelget_blip_num_patches)MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPPSupportsQuant)MultiModelKeys)AutoWeightsLoaderinit_vllm_registered_modelmaybe_prefixc                   @   s:   e Zd ZU dZed ed< eeje	ddddf ed< d	S )
Blip2ImagePixelInputsz
    Dimensions:
        - bn: Batch size * number of images
        - c: Number of channels (3)
        - h: Height of each image
        - w: Width of each image
    pixel_valuestypebn   hwdataN
__name__
__module____qualname____doc__r   __annotations__r   torchTensorr    r;   r;   T/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/blip2.pyr+   2   s   
  r+   c                   @   s8   e Zd ZU dZed ed< eeje	dddf ed< dS )	Blip2ImageEmbeddingInputsz
    Dimensions:
        - bn: Batch size * number of images
        - f: Image feature size
        - h: Hidden size (must match the hidden size of language model backbone)
    image_embedsr-   r.   fr0   r2   Nr3   r;   r;   r;   r<   r=   ?   s   
 r=   Blip2ImageInputsc                       sl   e Zd ZddddededB dedB ded	ed
df fddZdd Z		dde
jde
jdB fddZ  ZS )Blip2QFormerMultiHeadAttentionF is_cross_attentionprefixconfigquant_configNcache_configrD   rE   returnc                   s   t    || _|j|j dkrtd|j d|j d|j| _|j|j | _| j| j | _| jd | _t	
|j| j| _|rD|j}n|j}t	
|| j| _t	
|| j| _t|dd| _| jdkrktd| j t	|j| _d S )	Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()g      position_embedding_typeabsolutez%Unsupported position_embedding_type: )super__init__rF   hidden_sizenum_attention_heads
ValueErrorattention_head_sizeall_head_sizescalingnnLinearqueryencoder_hidden_sizekeyvaluegetattrrK   NotImplementedErrorDropoutattention_probs_dropout_probdropout)selfrF   rG   rH   rD   rE   kv_hidden_size	__class__r;   r<   rN   O   s4   
	


z'Blip2QFormerMultiHeadAttention.__init__c                 C   s8   |j g | d d | j| jR  }|ddddS )Nr      r   r/   )viewsizerP   rR   permute)r`   xr;   r;   r<   transpose_for_scoresy   s   (z3Blip2QFormerMultiHeadAttention.transpose_for_scoreshidden_statesencoder_hidden_statesc                 C   s   |d u}|r|  | |}|  | |}n|  | |}|  | |}| |}|  |}t||dd}tj|| j dd}	| 	|	}
t|
|}|
dddd }|jg | d d | jR  }|S )Nrd   dimr   re   r   r/   )rj   rY   rZ   rW   r9   matmul	transposesoftmaxrT   r_   rh   
contiguousrf   rg   rS   )r`   rk   rl   rD   	key_layervalue_layermixed_query_layerquery_layerattention_scoresattention_probsattention_probs_droppedcontext_layerr;   r;   r<   forward}   s&   


z&Blip2QFormerMultiHeadAttention.forwardN)r4   r5   r6   r
   r   r   boolstrrN   rj   r9   r:   FloatTensorr|   __classcell__r;   r;   rb   r<   rA   N   s.    *rA   c                       H   e Zd Zddededdf fddZdejd	ejdejfd
dZ  Z	S )Blip2QFormerSelfOutputrB   rF   rE   rI   Nc                    sB   t    t|j|j| _tj|j|jd| _t|j	| _
d S Neps)rM   rN   rU   rV   rO   dense	LayerNormlayer_norm_epsr]   hidden_dropout_probr_   r`   rF   rE   rb   r;   r<   rN         
zBlip2QFormerSelfOutput.__init__rk   input_tensorc                 C   &   |  |}| |}| || }|S r}   r   r_   r   r`   rk   r   r;   r;   r<   r|         

zBlip2QFormerSelfOutput.forwardrB   
r4   r5   r6   r
   r   rN   r9   r:   r|   r   r;   r;   rb   r<   r          r   c                       sn   e Zd ZddddededB dedB ded	ed
df fddZ	dde	j
de	jdB d
ee	j
 fddZ  ZS )Blip2QFormerAttentionFrB   rC   rF   rG   NrH   rD   rE   rI   c                   s<   t    t||||| dd| _t|| dd| _d S )N
.attentionrG   rH   rD   rE   z.outputrE   )rM   rN   rA   	attentionr   output)r`   rF   rG   rH   rD   rE   rb   r;   r<   rN      s   
	zBlip2QFormerAttention.__init__rk   rl   c                 C   s   | j ||d}| ||}|S )Nrl   )r   r   )r`   rk   rl   self_outputattention_outputr;   r;   r<   r|      s   zBlip2QFormerAttention.forwardr}   )r4   r5   r6   r
   r   r   r~   r   rN   r9   r:   r   tupler|   r   r;   r;   rb   r<   r      s0    r   c                       sB   e Zd Zddededdf fddZdejdejfd	d
Z  Z	S )Blip2QFormerIntermediaterB   rF   rE   rI   Nc                    s,   t    t|j|j| _t|j| _	d S r}   )
rM   rN   rU   rV   rO   intermediate_sizer   r   
hidden_actintermediate_act_fnr   rb   r;   r<   rN      s   
z!Blip2QFormerIntermediate.__init__rk   c                 C   s   |  |}| |}|S r}   )r   r   r`   rk   r;   r;   r<   r|      s   

z Blip2QFormerIntermediate.forwardr   r   r;   r;   rb   r<   r      s    r   c                       r   )Blip2QFormerOutputrB   rF   rE   rI   Nc                    sB   t    t|j|j| _tj|j|jd| _t	|j
| _d S r   )rM   rN   rU   rV   r   rO   r   r   r   r]   r   r_   r   rb   r;   r<   rN      r   zBlip2QFormerOutput.__init__rk   r   c                 C   r   r}   r   r   r;   r;   r<   r|      r   zBlip2QFormerOutput.forwardr   r   r;   r;   rb   r<   r      r   r   c                       s   e Zd ZdddededB dedB deded	df fd
dZde	j
de	j
defddZde	jd	e	jfddZde	jd	e	jfddZ  ZS )Blip2QFormerLayerrB   r   rF   rG   NrH   	layer_idxrE   rI   c                   s   t    |j| _d| _t|||| dd| _|| _||j dkr3t|||d| dd| _d| _	nd| _	t
|| d	d
| _t|| dd
| _d S )Nr   r   rG   rH   rE   r   Tz.crossattentionr   Fz.intermediate_queryr   z.output_query)rM   rN   chunk_size_feed_forwardseq_len_dimr   r   r   cross_attention_frequencycrossattentionhas_cross_attentionr   intermediate_queryr   output_query)r`   rF   rG   rH   r   rE   rb   r;   r<   rN      s0   
	
zBlip2QFormerLayer.__init__rk   rl   query_lengthc              	   C   s   |  |}|dkrQ|d d d |d d f }| jr | j||d}t| j| j| j|}|jd |krOt| j| j| j|d d |d d d f }t	j
||gdd}|S t| j| j| j|}|S )Nr   r   r   rn   )r   r   r   r   feed_forward_chunk_queryr   r   shapefeed_forward_chunkr9   cat)r`   rk   rl   r   r   query_attention_outputlayer_outputlayer_output_textr;   r;   r<   r|     s<   
	zBlip2QFormerLayer.forwardr   c                 C      |  |}| ||}|S r}   )intermediater   r`   r   intermediate_outputr   r;   r;   r<   r   I     
z$Blip2QFormerLayer.feed_forward_chunkc                 C   r   r}   )r   r   r   r;   r;   r<   r   N  r   z*Blip2QFormerLayer.feed_forward_chunk_query)r4   r5   r6   r
   r   r   intr   rN   r9   r   r|   r:   r   r   r   r;   r;   rb   r<   r      s0    '
*r   c                       s`   e Zd ZdddededB dedB deddf
 fd	d
Zdej	dej	de
dejfddZ  ZS )Blip2QFormerEncoderrB   r   rF   rG   NrH   rE   rI   c                   s:   t    | _t fddtjD | _d S )Nc              
      s(   g | ]}t  | d | dqS )z.layer.)rG   rH   r   rE   )r   ).0r   rH   rF   rE   rG   r;   r<   
<listcomp>b  s    z0Blip2QFormerEncoder.__init__.<locals>.<listcomp>)rM   rN   rF   rU   
ModuleListrangenum_hidden_layerslayerr`   rF   rG   rH   rE   rb   r   r<   rN   U  s   

zBlip2QFormerEncoder.__init__rk   rl   r   c                 C   s.   t | jjD ]}| j| }||||d}q|S )Nrl   r   )r   rF   r   r   )r`   rk   rl   r   ilayer_moduler;   r;   r<   r|   n  s   
zBlip2QFormerEncoder.forward)r4   r5   r6   r
   r   r   r   rN   r9   r   r   r:   r|   r   r;   r;   rb   r<   r   T  s,    r   c                       s\   e Zd ZdddededB dedB deddf
 fd	d
Zdej	dej	dej
fddZ  ZS )Blip2QFormerModelrB   r   rF   rG   NrH   rE   rI   c                   sN   t    || _tj|j|jd| _t|j	| _
t|||| dd| _d S )Nr   z.encoderr   )rM   rN   rF   rU   r   rO   r   	layernormr]   r   r_   r   encoderr   rb   r;   r<   rN     s   
zBlip2QFormerModel.__init__query_embedsrl   c                 C   s2   |j d }| |}| |}| j|||d}|S )Nr   r   )r   r   r_   r   )r`   r   rl   r   embedding_outputsequence_outputr;   r;   r<   r|     s   


zBlip2QFormerModel.forward)r4   r5   r6   r
   r   r   r   rN   r9   r   r:   r|   r   r;   r;   rb   r<   r     s(    r   c                   @   s<   e Zd Zdd ZdeeedB f fddZdefddZdS )	Blip2ProcessingInfoc                 C   s   | j tS r}   )ctxget_hf_configr	   r`   r;   r;   r<   r     s   z!Blip2ProcessingInfo.get_hf_configrI   Nc                 C   s   ddiS )Nimager   r;   r   r;   r;   r<   get_supported_mm_limits  s   z+Blip2ProcessingInfo.get_supported_mm_limitsc                 C   s   |   }|jS r}   )r   num_query_tokens)r`   	hf_configr;   r;   r<   get_num_image_tokens  s   z(Blip2ProcessingInfo.get_num_image_tokens)	r4   r5   r6   r   r   r   r   r   r   r;   r;   r;   r<   r     s    r   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )Blip2DummyInputsBuilder	mm_countsrI   c                 C   s   dS )NrB   r;   )r`   r   r;   r;   r<   get_dummy_text  s   z&Blip2DummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc           	      C   sJ   | j  }|j}|j}|dd}|r|dnd }d| j||||diS )Nr   r   )widthheight
num_images	overrides)infor   vision_config
image_sizeget_get_dummy_images)	r`   r   r   r   r   r   max_image_sizer   image_overridesr;   r;   r<   get_dummy_mm_data  s   
z)Blip2DummyInputsBuilder.get_dummy_mm_datar}   )
r4   r5   r6   r   r   r   r   r   r   r   r;   r;   r;   r<   r     s    
r   c                
       s   e Zd Zdedeeef deeef deeef def
 fddZded	eeef deeef fd
dZ	de
d	eeef dedee fddZ  ZS )Blip2MultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsrI   c                    s@   |s| j  }||}tt|gdddS t j||||dS )N)	input_idspt)tensor_type)r   r   r   r   )r   get_tokenizerencoder   dictrM   _call_hf_processor)r`   r   r   r   r   	tokenizer
prompt_idsrb   r;   r<   r     s   

z+Blip2MultiModalProcessor._call_hf_processor	hf_inputshf_processor_mm_kwargsc                 C   s   t tdtddS )Nr   )r,   r>   )r   r   batched)r`   r   r   r;   r;   r<   _get_mm_fields_config  s   z.Blip2MultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc           	      C   sB   | j  }| }|d }| j  }|g| }tdt |dgS )Nz<image>r   )modalitytarget	insertion)r   r   	get_vocabr   r   r   start)	r`   r   r   r   r   vocabimage_token_idnum_image_tokensimage_tokensr;   r;   r<   _get_prompt_updates  s   


z,Blip2MultiModalProcessor._get_prompt_updates)r4   r5   r6   r   r   objectr   r   r   r   r   r   r   r   r  r   r;   r;   rb   r<   r     s8    







r   )r   dummy_inputsc                       s^  e Zd ZededededB fddZddd	ed
ef fddZde	de
dB fddZdedejdejfddZdedejfddZde
dejfddZde	defddZ		d0dejdB dejdedB dejdB de	defd d!Zd"ejdejdB fd#d$Zd%eeeejf  dee fd&d'Zdefd(d)Zd*edefd+d,Zd-edefd.d/Z  Z S )1Blip2ForConditionalGenerationr   r   rI   Nc                 C   s   | drd S td)Nr   z Only image modality is supported)
startswithrQ   )clsr   r   r;   r;   r<   get_placeholder_str  s   
z1Blip2ForConditionalGeneration.get_placeholder_strrB   r   vllm_configrE   c                   s&  t    |jj}|j}|j}|jj}|| _|| _|j}t	|j
|jdd | _| |d6 t||| _ttd|j|jj| _t|j||| dd| _tj|jj|jjdd| _W d    n1 sfw   Y  | | t||jt|dd	| _ W d    n1 sw   Y  | j j!| _!d S )
N)r   
patch_sizer   r   z.qformer)rH   rG   rE   T)biaslanguage_model)r  r   rE   )"rM   rN   model_configr   rH   rG   multimodal_configrF   r   r!   r   r  _vision_tokens_per_image_mark_tower_modelr    vision_modelrU   	Parameterr9   zerosr   qformer_configrO   query_tokensr   qformerrV   text_configlanguage_projection_mark_language_modelr)   r*   r  make_empty_intermediate_tensors)r`   r  rE   rF   rH   rG   r  r   rb   r;   r<   rN     sV   


z&Blip2ForConditionalGeneration.__init__kwargsc                 K   sr   | dd }| dd }|d u r|d u rd S |d ur+| jjj }}td|||ddS |d ur5td|dS td)Nr,   r>   )r0   r1   )r-   r2   resolve_bindings)r-   r2   z This line should be unreachable.)poprF   r   r   r+   r=   AssertionError)r`   r  r,   r>   
expected_h
expected_wr;   r;   r<   _parse_and_validate_image_inputF  s"   z=Blip2ForConditionalGeneration._parse_and_validate_image_inputr  r,   c                 C   s   ||}|S r}   r;   )r`   r  r,   image_featuresr;   r;   r<   _image_pixels_to_features_  s   z7Blip2ForConditionalGeneration._image_pixels_to_featuresinputsc                 C   s   |d }|  | j|S )Nr2   )r'  r  )r`   r(  r,   r;   r;   r<   _process_image_pixelsh  s   z3Blip2ForConditionalGeneration._process_image_pixelsimage_inputc                 C   sL   |d dkr
|d S |  |}| j|jd dd}| j||d}| |S )Nr-   r>   r2   r   rd   )r   rl   )r)  r  expandr   r  r  )r`   r*  r&  r  query_outputr;   r;   r<   _process_image_inputm  s   

z2Blip2ForConditionalGeneration._process_image_inputc                 K   s*   | j di |}|d u rg S | |}|S )Nr;   )r%  r-  )r`   r  r*  vision_embeddingsr;   r;   r<   embed_multimodal{  s
   
z.Blip2ForConditionalGeneration.embed_multimodalr   	positionsintermediate_tensorsinputs_embedsc                 K   s$   |durd}| j j||||d}|S )af  Run forward pass for BLIP-2.

        One key thing to understand is the `input_ids` already accounts for the
        positions of the to-be-inserted image embeddings.

        Concretely, consider a text prompt:
        `"Question: What's the content of the image? Answer:"`.

        Tokenizer outputs:
        `[2, 45641, 35, 653, 18, 5, 1383, 9, 5, 2274, 116, 31652, 35]`.

        To reserve space in KV cache, we have to insert placeholder tokens
        before they are inputted to the model, so the input processor prepends
        dummy tokens (denoted as `50265`), resulting in:
        `[50265, ..., 50265, 2, 45641, 35, ..., 31652, 35]`.

        We insert 32 tokens since it corresponds to the number of query
        embeddings outputted by the Q-Former and inputted to the language model.

        This way, the `positions` and `attn_metadata` are consistent
        with the `input_ids`.

        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a
                batch.

        Info:
            [`Blip2ImageInputs`][vllm.model_executor.models.blip2.Blip2ImageInputs]
        N)r2  )r  model)r`   r   r0  r1  r2  r  rk   r;   r;   r<   r|     s   &z%Blip2ForConditionalGeneration.forwardrk   c                 C   s   | j |S r}   )r  compute_logitsr   r;   r;   r<   r4    s   z,Blip2ForConditionalGeneration.compute_logitsweightsc                 C   s   t | }||S r}   )r(   load_weights)r`   r5  loaderr;   r;   r<   r6    s   
z*Blip2ForConditionalGeneration.load_weightsc                 C   s   t jdddgddS )Nr  r  r  r  )r  	connectortower_model)r'   from_string_fieldr   r;   r;   r<   get_mm_mapping  s
   z,Blip2ForConditionalGeneration.get_mm_mappingr  c                 C   s:   |dkrdS || j j dksJ d|| j j }|| j S )Nr   zLThe number of image tokens must be a multiple of the number of query tokens.)rF   r   r  )r`   r  r   r;   r;   r<   get_num_mm_encoder_tokens  s   
z7Blip2ForConditionalGeneration.get_num_mm_encoder_tokensnum_vision_tokensc                 C   s8   |dkrdS || j  dksJ d|| j  }|| jj S )Nr   zQThe number of vision tokens must be a multiple of the number of tokens per image.)r  rF   r   )r`   r=  r   r;   r;   r<   get_num_mm_connector_tokens  s   
z9Blip2ForConditionalGeneration.get_num_mm_connector_tokens)NN)!r4   r5   r6   classmethodr   r   r  r   rN   r  r@   r%  r    r9   r:   r'  r+   r)  r-  r"   r/  r   r|   r4  r   r   setr6  r'   r;  r<  r>  r   r;   r;   rb   r<   r	    sf    /

	
/
$
r	  )Ncollections.abcr   r   r   typingr   r   r   r9   torch.nnrU   transformersr   r	   r
   r   vllm.configr   r   vllm.config.multimodalr   %vllm.model_executor.layers.activationr   'vllm.model_executor.layers.quantizationr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   vllm.multimodal.processingr   r   r   r   r   r   vllm.sequencer   vllm.utils.tensor_schemar   r   blipr    r!   
interfacesr"   r#   r$   r%   r&   module_mappingr'   utilsr(   r)   r*   r+   r=   r@   r8   ModulerA   r   r   r   r   r   r   r   r   r   r   register_processorr	  r;   r;   r;   r<   <module>   sR    R$]-*5

