o
    
۾ide                     @   s  U d Z ddlZddlZddlZddlmZmZmZmZm	Z	 ddl
mZmZ ddlmZmZmZ ddlZddlZddlmZ ddlmZ ddlmZ dd	lmZmZmZmZ dd
lmZ ddl m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2 ddl3m4Z4 ddl5m6Z6 ddl7m8Z8m9Z9m:Z: ddl;m<Z< ddl=m>Z>m?Z?m@Z@mAZAmBZBmCZC ddlDmEZE ddlFmGZGmHZH ddlImJZJmKZKmLZLmMZM ddlNmOZOmPZPmQZQ G dd deGZRG dd  d eGZSeReSB ZTeeUd!< G d"d# d#ejVZWG d$d% d%ejVZXG d&d' d'ejVZYG d(d) d)ejVZZG d*d+ d+ejVZ[G d,d- d-eQZ\edd.d/ed0efd1d2Z]G d3d4 d4Z^G d5d6 d6e@Z_G d7d8 d8e>e_ Z`G d9d: d:e?e_ Zae6jbeae_e`d;G d<d= d=eOeMeKeLZcdS )>zAInference-only Qwen-VL model compatible with HuggingFace weights.    N)Callable
CollectionMappingSequenceSet)	lru_cachepartial)	AnnotatedLiteral	TypeAlias)nn)
transforms)InterpolationMode)BatchFeaturePretrainedConfigPreTrainedTokenizer
TensorType)
ImageInput)	TextInput)
VllmConfig)BaseDummyOptions)
get_act_fn)Conv2dLayer)ColumnParallelLinearReplicatedLinearRowParallelLinear)QuantizationConfig)
Resampler2get_abs_pos)MultiModelKeys)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)MultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPP)QWenBaseModel	QWenBlock	QWenModelc                   @   s>   e Zd ZU dZdZed ed< eej	e
ddddf ed< d	S )
QwenImagePixelInputsaj  
    Dimensions:
        - bn: Batch size * number of images
        - c: Number of channels (3)
        - h: Height
        - w: Width

    Note that image_size is the value in the vision config to which we resize
    the image to in the normalization transform. Currently multi-image support
    can only be leveraged by passing image embeddings directly.
    pixel_valuestypebn   hwdataN__name__
__module____qualname____doc__r8   r
   __annotations__r	   torchTensorr-    rF   rF   V/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/qwen_vl.pyr6   @   s   
  r6   c                   @   s<   e Zd ZU dZdZed ed< eej	e
dddf ed< dS )	QwenImageEmbeddingInputsa  
    Dimensions:
        - bn: Batch size * number of images
        - ifs: Image feature size (256)
        - hs: Hidden size

    `hidden_size` must match the hidden size of the language model backbone
    and is stored in the visual config of the model if we have one.
    image_embedsr8   r9      hsr=   Nr>   rF   rF   rF   rG   rH   Q   s   
 
rH   QwenImageInputsc                       sp   e Zd ZdZ				ddededededB d	edB d
ef fddZ	ddej	dej	dB dej	fddZ
  ZS )VisualAttentionzself-attention layer class.
    Self-attention layer takes input with size [s, b, h]
    and returns output of the same size.
    TN 	embed_dim	num_headsbiaskdimvdimprefixc                    s   t    || _|d ur|n|| _|d ur|n|| _| j|ko#| j|k| _|| _|| dks0J || | _|| _|| _	| jsBJ dt
|d| | dd| _t
||| dd| _t| j| _d S )Nr   z<Visual Attention implementation only supports self-attentionr:   z.in_projrT   z	.out_proj)super__init__rO   rR   rS   _qkv_same_embed_dimrP   hidden_size_per_attention_head!num_attention_heads_per_partitionhidden_size_per_partitionr   in_projout_projmathsqrtnorm_factor)selfrO   rP   rQ   rR   rS   rT   	__class__rF   rG   rW   i   s(   
	
zVisualAttention.__init__x	attn_maskreturnc                 C   sh  |  \}}}| |\}}|  d d | jd| j f }|j| }|j| jdd\}}	}
|||| j | jdd}|	||| j | jdd}	|| j }|d urat	|||	dd}n
t
||	dd}|jdd}|
||| j | jdd}
t
||
}||| j|| j}|dddd }|  d d | jf }|j| }| |\}}|S )Nr:   )dimr   r.      )sizer\   rZ   rY   viewsplit	transposer`   rD   baddbmmbmmsoftmaxpermute
contiguousr[   r]   )ra   rd   re   sqb_mixed_x_layernew_tensor_shapequery_layer	key_layervalue_layerq_scaledattention_probscontext_layernew_context_layer_shapeoutputrF   rF   rG   forward   sh   


zVisualAttention.forward)TNNrN   N)r?   r@   rA   rB   intboolstrrW   rD   rE   r   __classcell__rF   rF   rb   rG   rM   c   s6    	&rM   c                	       sD   e Zd ZdZ		ddedededB def fdd	Zd
d Z  Z	S )	QwenVLMLPz/MLP for the visual component of the Qwen model.NrN   hidden_sizeintermediate_sizequant_configrT   c                    sL   t    t||d|| dd| _td| _t||d|| dd| _d S )NTz.c_fc)rQ   r   rT   geluz.c_proj)rV   rW   r   c_fcr   act_fnr   c_proj)ra   r   r   r   rT   rb   rF   rG   rW      s    

zQwenVLMLP.__init__c                 C   s*   |  |\}}| |}| |\}}|S r   )r   r   r   )ra   rd   rv   rF   rF   rG   r      s   
zQwenVLMLP.forward)NrN   )
r?   r@   rA   rB   r   r   r   rW   r   r   rF   rF   rb   rG   r      s    r   c                       s   e Zd Zdejddfdedededeegejf de	dB d	e
f fd
dZ	ddejdejdB dejfddZ	ddejdejdB dejfddZ  ZS )VisualAttentionBlock      @NrN   d_modeln_head	mlp_ratio
norm_layerr   rT   c                    s\   t    ||| _||| _t|| }t||| dd| _t|||| dd| _d S )Nz.attnrU   z.mlp)r   r   r   rT   )	rV   rW   ln_1ln_2r   rM   attnr   mlp)ra   r   r   r   r   r   rT   	mlp_widthrb   rF   rG   rW      s   
	

zVisualAttentionBlock.__init__rd   re   rf   c                 C   s&   |d ur
| |jnd }| j||dS N)re   )todtyper   ra   rd   re   rF   rF   rG   	attention  s   zVisualAttentionBlock.attentionc                 C   s0   || j | ||d }|| | | }|S r   )r   r   r   r   r   rF   rF   rG   r     s   zVisualAttentionBlock.forwardr   )r?   r@   rA   r   	LayerNormr   floatr   Moduler   r   rW   rD   rE   r   r   r   rF   rF   rb   rG   r      sD    
r   c                       s   e Zd Zdejddfdededededeegejf d	e	dB d
e
f fddZdejfddZdejfddZ	ddejdejdB dejfddZ  ZS )TransformerBlockr   NrN   widthlayersheadsr   r   r   rT   c                    sB   t    | _|| _t fddt|D | _d S )Nc                    s*   g | ]}t   d | dqS )z.resblocks.r   r   rT   )r   .0ir   r   r   rT   r   r   rF   rG   
<listcomp>1  s    	z-TransformerBlock.__init__.<locals>.<listcomp>)rV   rW   r   r   r   
ModuleListrange	resblocks)ra   r   r   r   r   r   r   rT   rb   r   rG   rW   "  s   

	
zTransformerBlock.__init__rf   c                 C      | j d jjjjS Nr   )r   r   r   weightr   ra   rF   rF   rG   get_cast_dtype>     zTransformerBlock.get_cast_dtypec                 C   r   r   )r   r   r   r   devicer   rF   rF   rG   get_cast_deviceA  r   z TransformerBlock.get_cast_devicerd   re   c                 C   s   | j D ]}|||d}q|S r   )r   )ra   rd   re   rrF   rF   rG   r   D  s   
zTransformerBlock.forwardr   )r?   r@   rA   r   r   r   r   r   r   r   r   rW   rD   r   r   r   r   rE   r   r   rF   rF   rb   rG   r   !  s<    r   c                       sp   e Zd Z					ddededed	ed
ededededededB def fddZdej	dej	fddZ
  ZS )VisionTransformerrJ      1Q NrN   
image_size
patch_sizer   r   r   r   	n_queries
output_dimimage_start_idr   rT   c              
      s4  t    ||f \}}| _||f \}}| _|| || f| _|| _td|||dd| _|d }t	|t
d| | _ttjdd}||| _t||||||
| dd	| _ttt|||d
 ||dd| ddj| jj| jjd| _||| _t	|d t
|| | _|	| _|	d | _|	d | _d S )Nr:   F)in_channelsout_channelskernel_sizestriderQ   g      rJ   gư>)epsz.transformerr      z
.attn_pool)	grid_sizerO   rP   kv_dimr   adaptivedo_post_projectionrT   )r   r   r.   rj   )rV   rW   r   r   r   r   r   conv1r   	ParameterrD   randnpositional_embeddingr   r   ln_prer   transformerr   r   r^   r_   r   r   r   	attn_poolln_postprojr   image_end_idimage_pad_id)ra   r   r   r   r   r   r   r   r   r   r   rT   kwargsimage_heightimage_widthpatch_heightpatch_widthscaler   rb   rF   rG   rW   M  s\   
	

	

zVisionTransformer.__init__rd   rf   c              	   C   s   |j | j | j d}| |}||jd |jd d}|ddd}|t| j	t
t|d }| |}|ddd}| |}|ddd}| |}| |}|| j }|S )N)r   r   r   r.   rg   rj   )r   r   r   r   r   reshapeshaperr   r   r   r   r^   r_   rk   r   r   r   r   )ra   rd   rF   rF   rG   r     s    
 




zVisionTransformer.forward)rJ   r   r   NrN   )r?   r@   rA   r   r   r   r   rW   rD   rE   r   r   rF   rF   rb   rG   r   L  s<    		
Dr   c                       s,   e Zd Zdddedef fddZ  ZS )QwenVLModelrN   rU   vllm_configrT   c                   sD   t  j||d |jj}|j}tdi |j|| dd| _d S )N)r   rT   z.visual)r   rT   rF   )rV   rW   model_config	hf_configr   r   visual)ra   r   rT   configr   rb   rF   rG   rW     s   
zQwenVLModel.__init__)r?   r@   rA   r   r   rW   r   rF   rF   rb   rG   r     s    $r   )maxsize	tokenizerrf   c                 C   s6   t | }G dd d| j}| jj d|_||_|S )a>  
    The logic of adding image pad tokens should only be applied in
    [`QwenVLProcessor`][vllm.model_executor.models.qwen_vl.QwenVLProcessor],
    so they are patched out here.

    The definition of the wrapped tokenizer can be found here:
    https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py
    c                   @   sp   e Zd Z		ddedee eB dee eB deeeB  fddZ			
dde	ee	 B de
ded
B defddZd
S )zB_get_tokenizer_without_image_pad.<locals>.TokenizerWithoutImagePadallrF   textallowed_specialdisallowed_specialrf   c                    s,   t d|} fdd jj|||dD S )NNFCc                    s   g | ]} j | qS rF   )decoder)r   tr   rF   rG   r     s    z__get_tokenizer_without_image_pad.<locals>.TokenizerWithoutImagePad.tokenize.<locals>.<listcomp>)r   r   )unicodedata	normalizer   encode)ra   r   r   r   r   rF   r   rG   tokenize  s   
zK_get_tokenizer_without_image_pad.<locals>.TokenizerWithoutImagePad.tokenizeFN	token_idsskip_special_tokenserrorsc                 [   s&   t |tr|g}| jj||p| jdS )N)r   )
isinstancer   r   decoder   )ra   r   r   r   r   rF   rF   rG   _decode  s   
zJ_get_tokenizer_without_image_pad.<locals>.TokenizerWithoutImagePad._decode)r   rF   )FN)r?   r@   rA   r   r   r   listbytesr   r   r   r   rF   rF   rF   rG   TokenizerWithoutImagePad  s.    




r   WithoutImagePad)copydeepcopyrc   r?   )r   new_tokenizerr   rF   rF   rG    _get_tokenizer_without_image_pad  s
   
"r   c                	       s   e Zd ZdZdededdf fddZedefdd	Z	edefd
dZ
edefddZ			ddeee B dB deee B dB deeB dB defddZ  ZS )QwenVLProcessorac  
    This model doesn't define its own HF processor,
    so we implement our own one here.

    We call the wrapped tokenizer to automatically insert image pad tokens:
    https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py#L245

    The image processor is defined here:
    https://huggingface.co/Qwen/Qwen-VL/blob/main/visual.py#L354
    r   r   rf   Nc                    sX   t    || _|| _|j}|d }ttj||ftj	dt
 tjdddg| _d S )Nr   )interpolation)g3<4'?gwgM?gy{ ?)gB91?gwt.?g	U?)meanstd)rV   rW   r   r   r   r   ComposeResizer   BICUBICToTensor	Normalizeimage_transform)ra   r   r   vision_configr   rb   rF   rG   rW     s"   

zQwenVLProcessor.__init__c                 C      | j jS r   )r   image_start_tagr   rF   rF   rG   r
       zQwenVLProcessor.image_start_tagc                 C   r	  r   )r   image_end_tagr   rF   rF   rG   r    r  zQwenVLProcessor.image_end_tagc                 C   r	  r   )r   image_pad_tagr   rF   rF   rG   r    r  zQwenVLProcessor.image_pad_tagr   imagesreturn_tensorsc                    s   |d u rg }t |ts|g}|d u rg }t |ts|g} |}t|dkr*i }n fdd|D }dt|i}ti |||dS )Nr   c                    s   g | ]}  |qS rF   )r  )r   imager   rF   rG   r   4  s    z,QwenVLProcessor.__call__.<locals>.<listcomp>r7   )tensor_type)r   r   r   lenrD   stackr   )ra   r   r  r  text_inputsimage_inputsr7   rF   r   rG   __call__   s*   


zQwenVLProcessor.__call__)NNN)r?   r@   rA   rB   r   r   rW   propertyr   r
  r  r  r   r   r   r   r   r  r   rF   rF   rb   rG   r     s6    
r   c                   @   sT   e Zd ZdefddZdedefddZdee	e
dB f fdd	Zde
fd
dZdS )QwenVLProcessingInforf   c                 C   s    | j  }t|tsJ t|S r   )ctxget_tokenizerr   r   r   )ra   r   rF   rF   rG   r  A  s   
z"QwenVLProcessingInfo.get_tokenizerr   c                 K   s"   | j jtf|  |  d|S )N)r   r   )r  init_processorr   get_hf_configr  )ra   r   rF   rF   rG   get_hf_processorG  s   z%QwenVLProcessingInfo.get_hf_processorNc                 C   s   dd iS )Nr  rF   r   rF   rF   rG   get_supported_mm_limitsO  s   z,QwenVLProcessingInfo.get_supported_mm_limitsc                 C   s2   |   }|j}|d }|d }|| d }|| S )Nr   r   rj   )r  r   )ra   r   r  r   r   grid_lengthrF   rF   rG   get_num_image_tokensR  s   z)QwenVLProcessingInfo.get_num_image_tokens)r?   r@   rA   r   r  objectr   r  r   r   r   r  r   rF   rF   rF   rG   r  @  s
    r  c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )QwenVLDummyInputsBuilder	mm_countsrf   c                    sF   | dd}| j }|j|j d fddtd|d D S )Nr  r   rN   c                 3   s&    | ]}d | d   dV  qdS )Picture z: 
NrF   r   img_end	img_startrF   rG   	<genexpr>d  s    
z:QwenVLDummyInputsBuilder.get_dummy_text.<locals>.<genexpr>r.   )getinfor  r
  r  joinr   )ra   r#  
num_imageshf_processorrF   r&  rG   get_dummy_text]  s   
z'QwenVLDummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc           
      C   sP   | j  }|j}|d  }}|dd}|r|dnd }	d| j||||	diS )Nr   r  r   )r   heightr-  	overrides)r+  r  r   r*  _get_dummy_images)
ra   r0  r#  r1  r   r  target_widthtarget_heightr-  image_overridesrF   rF   rG   get_dummy_mm_datah  s   
z*QwenVLDummyInputsBuilder.get_dummy_mm_datar   )
r?   r@   rA   r   r   r   r/  r   r!   r8  rF   rF   rF   rG   r"  \  s    
r"  c                
       s   e Zd Zdedeeef deeef deeef def
 fddZded	ed
eeef deeef de	f
ddZ
ded
eeef deeef fddZd	ed
eeef dedee fddZ  ZS )QwenVLMultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsrf   c                    sZ   t dd|\}}|d}|d ur#t|tsJ t|}||ks#J t j||||dS )Nz"(Picture \d*: <img>).*?(<\/img>\n)z\1\2r  )r:  r;  r<  r=  )resubnr*  r   r   r  rV   _call_hf_processor)ra   r:  r;  r<  r=  num_matched_images
image_datar-  rb   rF   rG   r@    s    	
z,QwenVLMultiModalProcessor._call_hf_processorprompt_textmm_itemshf_processor_mm_kwargstokenization_kwargsc                 C   s   dS )NFrF   )ra   rC  rD  rE  rF  rF   rF   rG   _hf_processor_applies_updates  s   z7QwenVLMultiModalProcessor._hf_processor_applies_updates	hf_inputsc                 C   s   t tdtddS )Nr  )r7   rI   )dictr"   batched)ra   rH  rE  rF   rF   rG   _get_mm_fields_config  s   z/QwenVLMultiModalProcessor._get_mm_fields_configout_mm_kwargsc                 C   sv   | j  }|j}| j  }||j }||j }||j }	| j  }
|	g|
 }td||gt	j
|g| |g |	ddgS )Nr  )embed_token_id)modalitytargetreplacement)r+  r  special_tokensr  r
  r  r  r   r(   r*   select_token_id)ra   rD  rE  rL  r   rQ  	processorimg_start_id
img_end_id
img_pad_idnum_image_tokensimage_tokensrF   rF   rG   _get_prompt_updates  s"   






z-QwenVLMultiModalProcessor._get_prompt_updates)r?   r@   rA   r   r   r!  r   r@  r$   r   rG  r"   rK  r#   r   r)   rY  r   rF   rF   rb   rG   r9    sN    





	




r9  )r+  dummy_inputsc                       s   e Zd ZdgddgdZejZdefddZede	d	e
de	d
B fddZdeddede	dee dd
f fddZdeded
B fddZdedejfddZdedefddZ	
	
d"dejd
B dejded
B dejd
B dedejeB fd d!Z  ZS )#QwenVLForConditionalGenerationc_attnw2w1)r\  gate_up_projrf   c                 C   s   t jddddS )z<
        Get the module prefix in multimodal models
        ztransformer.hztransformer.visual.attn_poolztransformer.visual.transformer)language_model	connectortower_model)r   from_string_fieldr   rF   rF   rG   get_mm_mapping  s
   z-QwenVLForConditionalGeneration.get_mm_mappingrN  r   Nc                 C   s   | drd| dS td)Nr  r$  z: <img></img>z Only image modality is supported)
startswith
ValueError)clsrN  r   rF   rF   rG   get_placeholder_str  s   
z2QwenVLForConditionalGeneration.get_placeholder_strrN   )rT   transformer_typer   rT   ri  c                   sT   | j |tdtid t j|||d W d    |  d S 1 s!w   Y  |  d S )Nr  )language_targetstower_targets)r   rT   ri  )_mark_composite_modelr4   r   rV   rW   )ra   r   rT   ri  rb   rF   rG   rW     s   
z'QwenVLForConditionalGeneration.__init__r   c                 K   s`   | dd }| dd }|d ur$| jjd  }}||d}td||dS |d ur.td|dS d S )Nr7   rI   r   )r;   r<   )r8   r=   resolve_bindings)r8   r=   )popr   r   r6   rH   )ra   r   r7   rI   
expected_h
expected_wrm  rF   rF   rG   _parse_and_validate_image_input  s    
z>QwenVLForConditionalGeneration._parse_and_validate_image_inputimage_inputc                 C   s$   |d dkr
|d S | j |d S )Nr8   rI   r=   )r   r   )ra   rr  rF   rF   rG   _process_image_input  s   z3QwenVLForConditionalGeneration._process_image_inputc                 K   s*   | j di |}|d u rg S | |}|S )NrF   )rq  rs  )ra   r   rr  vision_embeddingsrF   rF   rG   embed_multimodal#  s
   
z/QwenVLForConditionalGeneration.embed_multimodal	input_ids	positionsintermediate_tensorsinputs_embedsc                 K   s    |d urd }|  ||||}|S r   )r   )ra   rv  rw  rx  ry  r   hidden_statesrF   rF   rG   r   +  s   z&QwenVLForConditionalGeneration.forward)NN)r?   r@   rA   packed_modules_mappingr1   embed_input_idsr   rd  classmethodr   r   rh  r   r   r8   rW   r!  rL   rq  rD   rE   rs  r/   ru  r+   r   r   rF   rF   rb   rG   r[    sV    	


r[  )drB   r   r^   r   collections.abcr   r   r   r   r   	functoolsr   r   typingr	   r
   r   regexr>  rD   r   torchvisionr   torchvision.transformsr   transformersr   r   r   r   transformers.image_utilsr   $transformers.tokenization_utils_baser   vllm.configr   vllm.config.multimodalr   %vllm.model_executor.layers.activationr   vllm.model_executor.layers.convr   !vllm.model_executor.layers.linearr   r   r   'vllm.model_executor.layers.quantizationr   $vllm.model_executor.layers.resamplerr   r   )vllm.model_executor.models.module_mappingr   vllm.multimodalr    vllm.multimodal.inputsr!   r"   r#   vllm.multimodal.parser$   vllm.multimodal.processingr%   r&   r'   r(   r)   r*   vllm.sequencer+   vllm.utils.tensor_schemar,   r-   
interfacesr/   r0   r1   r2   qwenr3   r4   r5   r6   rH   rL   rC   r   rM   r   r   r   r   r   r   r   r  r"  r9  register_processorr[  rF   rF   rF   rG   <module>   st    s")+_5S$N

