o
    -i                  
   @   s  d dl Z d dlmZmZmZ d dlmZmZ d dlm	Z	 d dl
mZmZ d dlZd dlmZ d dlm  mZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZm Z m!Z! d dl"m#Z# d dl$m%Z& d dl'm(Z(m)Z)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2m3Z3 d dl4m5Z5 d dl6m7Z7 d dl8m9Z9 d dl:m;Z;m<Z<m=Z= d dl>m?Z? d dl@mAZA d dlBmCZCmDZD d dlEmFZFmGZGmHZHmIZI d dlJmKZKmLZLmMZM d dlNmOZOmPZP d dlQmRZRmSZSmTZTmUZUmVZVmWZW d dlXmYZY d dlZm[Z[ d d l\m]Z] d d!l^m_Z_ d d"l`maZambZb d#d$lcmdZdmeZemfZfmgZg d#d%lhmiZi d#d&ljmkZkmlZl d#d'lmmnZnmoZompZpmqZq zd d(lrmsZt eYu rJeYvd)rJd*Zwnd+ZwW n exyY   d*ZwY nw d,ZyG d-d. d.eaZzG d/d0 d0Z{G d1d2 d2eSZ|G d3d4 d4eOe| Z}G d5d6 d6eRe| Z~eCje~e|e}d7G d8d9 d9ejeeefegZeG d:d; d;Zd<ejd=ejd>ejfd?d@ZdAedBedCedDed>ejf
dEdFZdGejdHejd<ejd>eejejf fdIdJZG dKdL dLejZG dMdN dNejZG dOdP dPejZG dQdR dRejZdSeej d>ejfdTdUZG dVdW dWejZG dXdY dYejZG dZd[ d[ejZd=ejd\eeeef  d]ed>eej fd^d_ZG d`da daene  ZG dbdc dcejZG ddde deejZG dfdg dgejZG dhdi diejZG djdk dkejZdS )l    N)IterableMappingSequence)	dataclassfields)cached_property)	AnnotatedLiteral)
ImageChunk	TextChunk)UserMessage)ChatCompletionRequest)ImageEncoder)Image)BatchFeaturePixtralVisionConfig
TensorType)
ImageInput)_num_image_tokens)PixtralRotaryEmbeddingapply_rotary_pos_embposition_ids_in_meshgrid)	TextInput)
VllmConfig)BaseDummyOptions)divide$get_tensor_model_parallel_world_size)get_act_and_mul_fn)Conv2dLayer)RMSNorm)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)QuantizationConfig)default_weight_loader)MULTIMODAL_REGISTRYMultiModalKwargsItems)MultiModalDataDictMultiModalFieldConfigMultiModalUUIDDictNestedTensors)ImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderProcessorInputs)BaseMultiModalProcessorBaseProcessingInfoMultiModalProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)current_platform)IntermediateTensors)cached_tokenizer_from_config)MistralTokenizer)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPP)MultiModelKeys)init_vllm_registered_modelmaybe_prefix)VisionEncoderInfoVisionFeatureSelectStrategyis_vit_use_data_parallelresolve_visual_encoder_outputs)opsd   FTpatch_mergec                	   @   sP   e Zd ZU dZdZed ed< eej	e
ej	 B eddddddhdf ed	< d
S )PixtralImagePixelInputsz
    Dimensions:
        - bn: Batch size * number of images
        - c: Number of channels (3)
        - h: Height of each image
        - w: Width of each image

    The result of stacking `ImageEncoding.tokens` from each prompt.
    pixel_valuestypebn   hw)dynamic_dimsimagesN)__name__
__module____qualname____doc__rM   r	   __annotations__r   torchTensorlistr;    r\   r\   _/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/pixtral.pyrK   `   s   
 
rK   c                       s   e Zd ZdZdeddf fddZedefddZe	de
fd	d
Ze	de
fddZe	de
fddZe	de
fddZe	de
fddZ			ddeee B dB deee B dB deeB dB deeef fddZ  ZS )PixtralProcessorAdapterzo
    Provide a HF-compatible interface for
    `mistral_common.tokens.tokenizers.multimodal.ImageEncoder`.
    	tokenizerreturnNc                    s   t    || _d S N)super__init__r_   selfr_   	__class__r\   r]   rc   y   s   

z PixtralProcessorAdapter.__init__c                 C   s   | j jj}t|tsJ |S ra   )r_   instruct
mm_encoder
isinstancer   )re   image_encoderr\   r\   r]   image_processor~   s   
z'PixtralProcessorAdapter.image_processorc                 C   
   | j jjS ra   )rl   special_ids	img_breakre   r\   r\   r]   image_break_id      
z&PixtralProcessorAdapter.image_break_idc                 C   rm   ra   )rl   rn   imgrp   r\   r\   r]   image_token_id   rr   z&PixtralProcessorAdapter.image_token_idc                 C   rm   ra   )rl   rn   img_endrp   r\   r\   r]   image_end_id   rr   z$PixtralProcessorAdapter.image_end_idc                 C   rm   ra   )rl   	mm_configmax_image_sizerp   r\   r\   r]   
image_size   rr   z"PixtralProcessorAdapter.image_sizec                 C   rm   ra   )rl   rw   image_patch_sizerp   r\   r\   r]   
patch_size   rr   z"PixtralProcessorAdapter.patch_sizetextrS   return_tensorsc                 K   s   |d u rg }t |ts|g}|d u rg }t |ts|g}|s+| |j}dt|iS tdd |D r8tdttj  }ttj  }|D ] }| 	t
|d}	t|	j}
t|	j}||
 || qFtt|d  t|d|dS )N	input_idsc                 s   s    | ]	}t |d kV  qdS )r   N)len).0tr\   r\   r]   	<genexpr>   s    z3PixtralProcessorAdapter.__call__.<locals>.<genexpr>zYou've passed text inputs instead of token inputs. Make sure to process your input via `mistral_common`'s tokenizer or pass a chat completion request. For more info, see: https://github.com/vllm-project/vllm/issues/8411.image)r~   rS   )rj   r[   r_   r~   rY   tensorany
ValueErrorrZ   rl   r
   r   tokensappendr   catexpandr   )re   r|   rS   r}   kwargsr~   images_processedimages_tokensr   image_inputsimage_processedimage_tokensr\   r\   r]   __call__   s8   


z PixtralProcessorAdapter.__call__)NNN)rT   rU   rV   rW   r9   rc   propertyr   rl   r   intrq   rt   rv   ry   r{   r   r[   r   strr   r   r*   r   __classcell__r\   r\   rf   r]   r^   s   s6    

r^   c                	   @   s   e Zd ZdefddZdefddZdeee	dB f fddZ
	dd	edB fd
dZddde	de	d	edB de	fddZdefddZdS )PixtralProcessingInfor`   c                 C   s"   t | jj}t|tstd|S )Nz.This model requires `--tokenizer-mode mistral`)r8   ctxmodel_configrj   r9   r   rd   r\   r\   r]   get_tokenizer   s   
z#PixtralProcessingInfo.get_tokenizerc                 C   s   t |  S ra   )r^   r   rp   r\   r\   r]   get_hf_processor   s   z&PixtralProcessingInfo.get_hf_processorNc                 C   s   dd iS )Nr   r\   rp   r\   r\   r]   get_supported_mm_limits      z-PixtralProcessingInfo.get_supported_mm_limits	processorc                 C   s    |d u r|   }t|j|jdS )N)ry   r{   )r   r   ry   r{   )re   r   r\   r\   r]   get_vision_config   s   z'PixtralProcessingInfo.get_vision_config)r   image_widthimage_heightc                C   s4   |d u r|   }|jtd||f\}}|| S )NRGB)r   rl   _image_to_num_tokensr   new)re   r   r   r   ncolsnrowsr\   r\   r]   get_num_image_tokens   s   z*PixtralProcessingInfo.get_num_image_tokensc                 C   s   |   j}|jj}t||dS )N)widthheight)r   rl   rw   rx   r,   )re   rl   rx   r\   r\   r]   !get_image_size_with_most_features   s   
z7PixtralProcessingInfo.get_image_size_with_most_featuresra   )rT   rU   rV   r9   r   r^   r   r   r   r   r   r   r   r,   r   r\   r\   r\   r]   r      s&    

r   c                	   @   s   e Zd Zdeeef defddZ	ddedeeef deeef dB defdd	Z		ddedeeef deeef dB de
fd
dZdS )PixtralDummyInputsBuilder	mm_countsr`   c                 C   s   dS )N r\   )re   r   r\   r\   r]   get_dummy_text   s   z(PixtralDummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc                 C   sB   | dd}| j \}}|r| dnd }d| j||||diS )Nr   r   )r   r   
num_images	overrides)getinfor   _get_dummy_images)re   r   r   r   r   target_widthtarget_heightimage_overridesr\   r\   r]   get_dummy_mm_data   s   z+PixtralDummyInputsBuilder.get_dummy_mm_datac                 C   s~   | j  }| |}| |||}|dg }ddi}ttt|dgdd |D dgd}	|j	|	}
|
j
}t|||d	S )
Nr   
truncationF)r|   c                 s   s    | ]}t |d V  qdS )r   N)r
   )r   r   r\   r\   r]   r   &  s    zGPixtralDummyInputsBuilder.get_dummy_processor_inputs.<locals>.<genexpr>)content)messages)promptmm_datatokenization_kwargs)r   r   r   r   r   r   r   r   mistralencode_chat_completionr   r/   )re   r   r   r   r_   
dummy_textdummy_mm_datadummy_imagesr   requestresdummy_tokensr\   r\   r]   get_dummy_processor_inputs  s*   


z4PixtralDummyInputsBuilder.get_dummy_processor_inputsra   )rT   rU   rV   r   r   r   r   r   r'   r   r/   r   r\   r\   r\   r]   r      s,    


r   c                       s   e Zd Zdeeef deeef deeef fddZde	deeef de
dee fdd	Z	
ddeee B de	deeef deeef ded
B deee eef f fddZ  ZS )PixtralMultiModalProcessor	hf_inputshf_processor_mm_kwargsr`   c                 C   s   t tddS )Nr   )rS   )dictr(   batched)re   r   r   r\   r\   r]   _get_mm_fields_config6  s   z0PixtralMultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc                    sN   | j jdi |j jjdtf fdd}tdd|dgS )Nitem_idxc                    s^    dt}|| }jtd|j|jf\}}g|  g | }|d< t	
|S )Nr   r   r   )	get_itemsr+   get_image_sizerl   r   r   r   r   r   r5   select_token_id)r   rS   ry   r   r   r   rq   rv   rt   r   r   r\   r]   get_replacementI  s   
zGPixtralMultiModalProcessor._get_prompt_updates.<locals>.get_replacementr   r   )modalitytargetreplacementr\   )r   r   rq   rt   rv   r   r3   )re   r   r   r   r   r\   r   r]   _get_prompt_updates=  s   z.PixtralMultiModalProcessor._get_prompt_updatesNr   mm_data_itemsr   mm_uuidsc           	         s&   t  j|||||d\}}}||dfS )N)r   r   r   r   r   T)rb   _cached_apply_hf_processor)	re   r   r   r   r   r   
prompt_idsmm_info_rf   r\   r]   r   ^  s   
	z5PixtralMultiModalProcessor._cached_apply_hf_processorra   )rT   rU   rV   r   r   r*   objectr(   r   r-   r&   r   r4   r   r[   r   r)   tupler2   boolr   r   r\   r\   rf   r]   r   5  s>    





'


r   )r   dummy_inputsc                       s2  e Zd ZededededB fddZddd	ed
ef fddZde	de
dB fddZde
deejdf fddZde	defddZ		d*dejdejdedB dejdB de	dejeB fddZdejdejdB fddZdeeeejf  fd d!Zdefd"d#Zd$edefd%d&Zd'edefd(d)Z  ZS )+PixtralForConditionalGenerationr   ir`   Nc                 C   s   | drd S td)Nr   z Only image modality is supported)
startswithr   )clsr   r   r\   r\   r]   get_placeholder_strz  s   
z3PixtralForConditionalGeneration.get_placeholder_strr   prefixvllm_configr   c                   sH  t    |jj}|jj}|| _|| _dd ttD   fdd| jj	 
 D }tdi || _| | t||jt|dd| _W d    n1 sOw   Y  | |d; t| j| _| jjrmt| jjdd	nd | _| jjtkrt| jj| jjd
dnd | _t| j|jjd| _W d    n1 sw   Y  | jj| _d S )Nc                 S   s   h | ]}|j qS r\   )name)r   fieldr\   r\   r]   	<setcomp>  s    z;PixtralForConditionalGeneration.__init__.<locals>.<setcomp>c                    s   i | ]\}}| v r||qS r\   r\   )r   keyvaluedataclass_fieldsr\   r]   
<dictcomp>  s
    z<PixtralForConditionalGeneration.__init__.<locals>.<dictcomp>language_model)r   	hf_configr   r   h㈵>epsF)vision_encoder_dimspatial_merge_sizeuse_mlp_biasdimr\   ) rb   rc   r   r   multimodal_configconfigr   VisionEncoderArgsvision_configto_dictitemsvision_args_mark_language_modelrB   text_configrC   r   _mark_tower_modelVisionTransformervision_encoderadd_pre_mm_projector_layer_normr   hidden_sizepre_mm_projector_normmm_projector_idPATCH_MERGEPatchMergerr   patch_mergerVisionLanguageAdaptervision_language_adaptermake_empty_intermediate_tensors)re   r   r   r  r   r  rf   r   r]   rc     sJ   


	

z(PixtralForConditionalGeneration.__init__r   c                 K   s$   | dd }|d u rd S td|dS )NrS   rL   )rM   rS   )poprK   )re   r   rS   r\   r\   r]   _parse_and_validate_image_input  s   z?PixtralForConditionalGeneration._parse_and_validate_image_inputimage_input.c                    s   |d }|  |}dd |D }t|}| jd ur| |}| jd urG| jj | jjd  fdd|D }fdd|D }| j||d}| |}t	||}|S )NrS   c                 S      g | ]}|j d  qS r   shape)r   image_featurer\   r\   r]   
<listcomp>      zHPixtralForConditionalGeneration._process_image_input.<locals>.<listcomp>   c                    s(   g | ]}|j d    |j d   fqS )r<   r   r  r   rs   )r{   r\   r]   r    s    c                    s   g | ]}|  qS r\   r\   )r   feature_size)spatial_merge_size_squarer\   r]   r    s    )image_sizes)
r  rY   r   r  r  r  r{   r   r  split)re   r  rS   image_featuresfeature_sizesimg_patch_dimsimage_embedsr\   )r{   r#  r]   _process_image_input  s*   







z4PixtralForConditionalGeneration._process_image_inputc                 K   s&   | j di |}|d u rg S | |S )Nr\   )r  r*  )re   r   r  r\   r\   r]   embed_multimodal  s   
z0PixtralForConditionalGeneration.embed_multimodalr~   	positionsintermediate_tensorsinputs_embedsc                 K   s$   |durd}| j j||||d}|S )zRun forward pass for pixtral.N)r.  )r   model)re   r~   r,  r-  r.  r   hidden_statesr\   r\   r]   forward  s   	z'PixtralForConditionalGeneration.forwardr0  c                 C   s   | j |S ra   )r   compute_logits)re   r0  r\   r\   r]   r2    s   z.PixtralForConditionalGeneration.compute_logitsweightsc              
      s   dt ttjf fdddt ttjf fdddt ttjf fdd dt ttjf fdd	jd ur<tj ni jd urJtj ni jd urXtj ni j	d urftj	 ni  	f
d
d}j
|  d S )Nweightc                 S      | d  dS )Nr   )r  vision_towerr   r4  r\   r\   r]   is_vision_encoder_weights     zOPixtralForConditionalGeneration.load_weights.<locals>.is_vision_encoder_weightsc                 S   r5  )Nr   )r  multi_modal_projectorr7  r8  r\   r\   r]   is_vision_lang_adapter_weights  s   zTPixtralForConditionalGeneration.load_weights.<locals>.is_vision_lang_adapter_weightsc                 S   r5  )Nr   r  r7  r8  r\   r\   r]   is_patch_merger  r:  zEPixtralForConditionalGeneration.load_weights.<locals>.is_patch_mergerc                 S   r5  )Nr   r  r7  r8  r\   r\   r]   is_pre_mm_projector_norm  r:  zNPixtralForConditionalGeneration.load_weights.<locals>.is_pre_mm_projector_normc               	   3   s   	D ]\} }| |frBj d u rqd| ddd  }|}|d urAt  t|| W d    n1 s<w   Y  q | |frxjd u rNqd| ddd  }| }t  t|| W d    n1 srw   Y  q| |frjd u rqd| ddd  }| }t  t|| W d    n1 sw   Y  q| |frj	d u rqd| ddd  }|}|d urt  t|| W d    n1 sw   Y  q| 
d} | |fV  qd S )N.r<   zlanguage_model.)r  joinr%  r   rY   no_gradr$   r  r  r  removeprefix)r   rQ   trimmed_nameparam
r=  r>  r9  r<  patch_merger_dictpre_mm_projector_norm_dictre   vision_encoder_dictvision_lang_adapter_dictr3  r\   r]   llm_weights_generator  sV   










zKPixtralForConditionalGeneration.load_weights.<locals>.llm_weights_generator)r   r   rY   rZ   r  r   named_parametersr  r  r  r   load_weights)re   r3  rJ  r\   rE  r]   rL    s,   



-z,PixtralForConditionalGeneration.load_weightsc                 C   s   t jddddS )Nr   r  r  )r   	connectortower_model)rA   from_string_fieldrp   r\   r\   r]   get_mm_mappingL  s
   z.PixtralForConditionalGeneration.get_mm_mappingnum_image_tokensc                 C   s(   t | dd d u r
|S | jj}||d  S Nr  r   getattrr  r   )re   rQ  
merge_sizer\   r\   r]   get_num_mm_encoder_tokensS     z9PixtralForConditionalGeneration.get_num_mm_encoder_tokensnum_vision_tokensc                 C   s(   t | dd d u r
|S | jj}||d  S rR  rS  )re   rX  rU  r\   r\   r]   get_num_mm_connector_tokensY  rW  z;PixtralForConditionalGeneration.get_num_mm_connector_tokens)NN)rT   rU   rV   classmethodr   r   r   r   rc   r   rK   r  r   rY   rZ   r*  r=   r+  r7   r1  r2  r   rL  rA   rP  rV  rY  r   r\   r\   rf   r]   r   r  sN    0



Tr   c                   @   s   e Zd ZU eed< eed< eed< eed< eed< eed< eed< eed< eed	< d
Zeed< dZeed< dZ	eed< dZ
eed< dS )r  r  num_channelsry   r{   intermediate_sizenum_hidden_layersnum_attention_heads
rope_thetart   Tadapter_biasr<   r   Fr  r   r  N)rT   rU   rV   r   rX   floatr`  r   r   r  r  r   r\   r\   r\   r]   r  a  s   
 r  	freqs_cisxr`   c                    sl   |j   dks	J | j|jd |jd fks%J | j|jd |jd ff fddt|jD }| j| S )zd
    freqs_cis: complex - (seq_len, head_dim / 2)
    x: complex - (bsz, seq_len, head_dim / 2)
    r<   r   c                    s,   g | ]\}}|d ks| d  kr|nd qS r<   r\   )r   r   dndimr\   r]   r  }  s   , z*_reshape_for_broadcast.<locals>.<listcomp>)rg  r  	enumerateview)rb  rc  r  r\   rf  r]   _reshape_for_broadcastr  s   
rj  r   r   r   thetac           
      C   s   d|t d| d |    }t j||jd}t j||jd}t ||ddd  }t ||ddd  }t j|dddddf d|d|dddddf |ddgdd}	t t |	|	S )	z
    freqs_cis: 2D complex tensor of shape (height, width, dim // 2)
        to be indexed by (height, width) position tuples
    g      ?r   r   deviceNr<   r   r   )	rY   arangera  rm  outerr   repeatpolar	ones_like)
r   r   r   rk  freqsrP   rQ   freqs_hfreqs_wfreqs_2dr\   r\   r]   precompute_freqs_cis_2d  s   rw  xqxkc                 C   s   t |  jg | jd d ddR  }t | jg |jd d ddR  }|jt jks4J t||}t || 	d}t || 	d}|
| |
|fS )Nr   r   rO   )rY   view_as_complexra  reshaper  dtype	complex64rj  view_as_realflattentype_as)rx  ry  rb  xq_xk_xq_outxk_outr\   r\   r]   apply_rotary_emb_vit  s   ,,
r  c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )FeedForwardargsc                    s^   t    |jd usJ tj|j|jdd| _tj|j|jdd| _tj|j|jdd| _d S NFbias)	rb   rc   r\  nnLinearr  w1w2w3re   r  rf   r\   r]   rc     s
   
zFeedForward.__init__rc  r`   c                 C   s    |  t| || | S ra   )r  Fsilur  r  re   rc  r\   r\   r]   r1    s    zFeedForward.forward	rT   rU   rV   r  rc   rY   rZ   r1  r   r\   r\   rf   r]   r    s    r  c                       D   e Zd Zdef fddZdejdejdejdejfdd	Z  ZS )
	Attentionr  c                    s   t    || _|j|j rJ |j| _|j|j | _tj|j|jdd| _	tj|j|jdd| _
tj|j|jdd| _tj|j|jdd| _d S r  )rb   rc   r  r  r^  n_headshead_dimr  r  wqwkwvwor  rf   r\   r]   rc     s   
zAttention.__init__rc  maskrb  r`   c                 C   s   |j \}}}| || || |}}}	|||| j| j}|||| j| j}|	||| j| j}	t|||d\}}trJt	j
|||	|d}
n"|dd}|dd}|	dd}	tjj|||	|d}
|
dd}
|
||| j| j }
| |
S )N)rb  	attn_biasr<   r   	attn_mask)r  r  r  r  r{  r  r  r  USE_XFORMERS_OPSxopsmemory_efficient_attention	transposer  
functionalscaled_dot_product_attentionr  )re   rc  r  rb  batchpatchesr   qkvoutr\   r\   r]   r1    s   "
zAttention.forwardr  r\   r\   rf   r]   r    s    r  c                       r  )
TransformerBlockr  c                    sB   t    t|| _t|| _t|jdd| _t|jdd| _	d S )Nr   r   )
rb   rc   r  	attentionr  feed_forwardr   r  attention_normffn_normr  rf   r\   r]   rc     s
   


zTransformerBlock.__init__rc  r  rb  r`   c                 C   s>   | j j| |||d}|| }| j| |}|| }|S Nr  rb  r  r1  r  r  r  )re   rc  r  rb  rrP   r  r\   r\   r]   r1    s   zTransformerBlock.forwardr  r\   r\   rf   r]   r    s    r  c                       sH   e Zd Zdef fddZdejdejdejdB dejfd	d
Z  ZS )Transformerr  c                    s:   t    tj | _t|jD ]
}| jt	| qd S ra   )
rb   rc   rY   r  
ModuleListlayersranger]  r   r  )re   r  r   rf   r\   r]   rc     s
   
zTransformer.__init__rc  r  rb  Nr`   c                 C   s   | j D ]	}||||d}q|S r  )r  )re   rc  r  rb  layerr\   r\   r]   r1    s   
zTransformer.forwardr  r\   r\   rf   r]   r    s    r  patch_embeds_listc                 C   s   t dd | D }|S )Nc              	   S   sF   g | ]}t jt jt |jd  t |jd ddddddqS )r   ij)indexingr   r   )rY   stackmeshgridrn  r  r{  r   pr\   r\   r]   r    s    	z%position_meshgrid.<locals>.<listcomp>)rY   r   )r  r,  r\   r\   r]   position_meshgrid	  s   	r  c                       s   e Zd Zdef fddZedefddZedej	j
fddZedejfd	d
ZedejfddZdeej dejfddZ  ZS )r
  r  c                    st   t    || _t|j|j|j|jdd| _t|jdd| _	t
|| _| jj| jj }|d dks5J dd | _d S )NFin_channelsout_channelskernel_sizestrider  r   r   r   r   zROPE requires even head_dim)rb   rc   r  r   r[  r  r{   
patch_convr   ln_prer  transformerr^  
_freqs_cis)re   r  r  rf   r\   r]   rc     s   


zVisionTransformer.__init__r`   c                 C   s   | j j| j j S ra   )r  ry   r{   rp   r\   r\   r]   max_patches_per_side.  s   z&VisionTransformer.max_patches_per_sidec                 C      t |  jS ra   )next
parametersrm  rp   r\   r\   r]   rm  2     zVisionTransformer.devicec                 C   r  ra   )r  r  r|  rp   r\   r\   r]   r|  6  r  zVisionTransformer.dtypec                 C   sV   | j d u rt| jj| jj | j| j| jjd| _ | j j| jkr(| j j| jd| _ | j S )N)r   r   r   rk  rl  )	r  rw  r  r  r^  r  r_  rm  torp   r\   r\   r]   rb  :  s   
zVisionTransformer.freqs_cisrS   c           
         s    fdd|D }dd |D }dd |D }t j|dd} |}t| j} j|dddf |dddf f }trNtj	j
jd	d |D }ndd
lm} |dd |D |} j|||d}	t |	d|S )a  
        Args:
            images: list of N_img images of variable sizes,
                each of shape (C, H, W)
        Returns:
            image_features: tensor of token features for
                all tokens of all images of shape (N_toks, D)
        c                    $   g | ]}  |d  jqS r  r  	unsqueezer  r|  r!  rp   r\   r]   r  V      z-VisionTransformer.forward.<locals>.<listcomp>c                 S       g | ]}| d dd dqS r   r   r<   r  permuter  r\   r\   r]   r  Z       c                 S   r  rd  r  r  r\   r\   r]   r  [  r  r<   r   Nr   c                 S       g | ]}|j d  |j d  qS r  r   r  r  r\   r\   r]   r  h  r  generate_block_attention_maskc                 S   r  r  r  r  r\   r\   r]   r  p  r  r  )rY   r   r  r  r  rm  rb  r  r  fmhar  BlockDiagonalMaskfrom_seqlens,transformers.models.pixtral.modeling_pixtralr  r  r%  squeeze)
re   rS   r  patch_embedsembed_sizesr,  rb  r  r  r  r\   rp   r]   r1  I  s&   

&
zVisionTransformer.forward)rT   rU   rV   r  rc   r   r   r  rY   typesDevicerm  r|  rZ   rb  r[   r1  r   r\   r\   rf   r]   r
    s    r
  c                       s<   e Zd Zdedef fddZdejdejfddZ  Z	S )	r  r  r   c                    sP   t    t|tsJ tj|j||jd| _t	 | _
tj|||jd| _d S )Nr  )rb   rc   rj   r  r  r  r  r`  w_inGELUgeluw_out)re   r  r   rf   r\   r]   rc   y  s   

zVisionLanguageAdapter.__init__rc  r`   c                 C   s   |  | | |S ra   )r  r  r  r  r\   r\   r]   r1    s   zVisionLanguageAdapter.forward)
rT   rU   rV   r  r   rc   rY   rZ   r1  r   r\   r\   rf   r]   r  x  s    r  c                	       s   e Zd ZdZ	ddedededdf fdd	Zd
ejde	e
eef  dejfddZd
ejde	e
eef  dejfddZ  ZS )r  z<
    Learned merging of spatial_merge_size ** 2 patches
    Fr   r   r   r`   Nc                    s8   t    ||d  }|| _|| _tj|||d| _d S )Nr   r  )rb   rc   r   mlp_input_dimr  r  merging_layer)re   r   r   r   r  rf   r\   r]   rc     s   
zPatchMerger.__init__rc  r$  c                 C   s8   t dd |D t|ksJ | ||}| |}|S )Nc                 S      g | ]\}}|| qS r\   r\   r   rP   rQ   r\   r\   r]   r        z'PatchMerger.forward.<locals>.<listcomp>)sumr   r  r  )re   rc  r$  r\   r\   r]   r1    s   
zPatchMerger.forwardc                 C   sL   t ||| jd}g }|D ]}|jd }||d|  qtj|ddS )a  
        Args:
            x: (N, D) where N is flattened and concatenated patch tokens
                for all images
            image_sizes: list of tuple of (height, width) in tokens for
                each image
        Returns:
            image_features: reorders patch tokens so each grid of
                (spatial_merge_size, spatial_merge_size) is contiguous.
                now (N / spatial_merge_size ** 2, D * spatial_merge_size ** 2)
        )rc  r$  r   r   r   r   )get_sub_gridsr   r  r   ri  r   rY   r   )re   rc  r$  	sub_gridspermuted_tensorgrid	n_patchesr\   r\   r]   r    s   
zPatchMerger.permute)F)rT   rU   rV   rW   r   r   rc   rY   rZ   r[   r   r1  r  r   r\   r\   rf   r]   r    s4    
r  r$  r   c                 C   s   dd |D }| j d }g }|}t| |D ]<\}}|| \}	}
||	|
|dddd d d d d d d f }tjjj|||d}|d|||d}|	|d  q|S )Nc                 S   r  r\   r\   r  r\   r\   r]   r    r  z!get_sub_grids.<locals>.<listcomp>r   r   r   r<   )r  r  )
r  rh  r%  ri  r  rY   r  r  unfoldr   )rc  r$  r   tokens_per_imagere  all_img_sub_gridssub_grid_sizeimage_indexr   rP   rQ   
image_gridr  r\   r\   r]   r    s"   

r  c                   @   sj   e Zd ZdededefddZdefddZdefdd	Zdefd
dZdededeeef fddZ	dS )PixtralHFEncoderInfor   r   r`   c                C   s   | j ||d\}}|| S )N)r   r   )get_patch_grid_size)re   r   r   r   r   r\   r\   r]   r     s
   
z)PixtralHFEncoderInfo.get_num_image_tokensc                 C   s   | j jS ra   )r  ry   rp   r\   r\   r]   r     r   z#PixtralHFEncoderInfo.get_image_sizec                 C   s   t | jdd}| jj| S )Nr   r<   )rT  r   r  r{   )re   r   r\   r\   r]   get_patch_size  s   z#PixtralHFEncoderInfo.get_patch_sizec                 C   s   |   |  }}|| S ra   )r   r  )re   ry   r{   r\   r\   r]   get_patch_grid_length	  s   z*PixtralHFEncoderInfo.get_patch_grid_lengthc          
      C   st   |    }}|   }}t|| || }|dkr+tt|| }tt|| }t||f||f\}}	|	|fS )Nr<   )r   r  maxr   mathfloor _get_pixtral_hf_num_image_tokens)
re   r   r   	max_width
max_heightpatch_widthpatch_heightratior   r   r\   r\   r]   r     s   z(PixtralHFEncoderInfo.get_patch_grid_sizeN)
rT   rU   rV   r   r   r   r  r  r   r   r\   r\   r\   r]   r    s$    

r  c                
       sR   e Zd Z	ddddededB deddf fdd	Zd
ejdejfddZ	  Z
S )PixtralHFMLPNr   r   r  quant_configr   r`   c                   st   t    t }|jd usJ t|j|jgd d|| d|d| _t|j|jd|| d|d| _t	|j
| _d S )Nr   F.gate_up_proj)
input_sizeoutput_sizesr  r  r   
disable_tpz
.down_projr  output_sizer  r  r   r  )rb   rc   rF   r\  r    r  gate_up_projr"   	down_projr   
hidden_actact_and_mulre   r  r  r   use_data_parallelrf   r\   r]   rc   )  s(   

zPixtralHFMLP.__init__rc  c                 C   s*   |  |\}}| |}| |\}}|S ra   )r  r  r  )re   rc  gate_upr   r\   r\   r]   r1  G  s   
zPixtralHFMLP.forwardra   rT   rU   rV   r   r#   r   rc   rY   rZ   r1  r   r\   r\   rf   r]   r  (  s    r  c                       sl   e Zd Z	ddddededB deddf fdd	Zd
ejdejdejde	ejejdB f fddZ
  ZS )PixtralHFAttentionNr   r   r  r  r   r`   c             	      s   t    || _|j|j rJ |j| _|j|j | _| j| j |jks&J t }t|j| j| jd|| d|d| _	t
|j|jd|| d|d| _|rOdnt | _t|j| j| _d S )NF	.qkv_proj)r  	head_sizetotal_num_headsr  r  r   r  z.o_projr  r<   )rb   rc   r  r  r^  r  r  rF   r!   qkv_projr"   o_projr   tp_sizer   r  r  rf   r\   r]   rc   O  s6   
	
zPixtralHFAttention.__init__r0  attention_maskposition_embeddingsc                 C   s*  |  \}}}| |\}}|jddd\}}	}
|||| j| jdd}|	||| j| jdd}	|
||| j| j}
|\}}t||	||dd\}}	tri|dd	 }|	dd	 }	t
j||	|
|d}n|
dd}
tjj||	|
|d	}|dd}|||| j| j }| |\}}|d fS )
NrO   r   r   r<   r   r   )unsqueeze_dimr  r  )sizer   chunkri  r  r  r  r   r  
contiguousr  r  r  r  r  r{  r!  )re   r0  r#  r$  r  r  r   
qkv_statesr  r  r  cossinr  attn_outputr\   r\   r]   r1  v  s(   zPixtralHFAttention.forwardra   )rT   rU   rV   r   r#   r   rc   rY   rZ   r   r1  r   r\   r\   rf   r]   r  N  s,    'r  c                
       s^   e Zd Z	ddddededB deddf fdd	Zd
ejdejdejdejfddZ	  Z
S )PixtralHFTransformerBlockNr   r   r  r  r   r`   c                   sZ   t    t|jdd| _t||| dd| _t||| dd| _t|jdd| _	d S )Nr   r   z
.attention)r  r   z.feed_forward)
rb   rc   r   r  r  r  r  r  r  r  )re   r  r  r   rf   r\   r]   rc     s   
z"PixtralHFTransformerBlock.__init__r0  r#  r$  c                 C   sB   | j j| |||d\}}|| }| j| |}|| }|S )N)r#  r$  r  )re   r0  r#  r$  r  r   rP   r  r\   r\   r]   r1    s   
z!PixtralHFTransformerBlock.forwardra   r  r\   r\   rf   r]   r-    s,    r-  c                       sl   e Zd Z	dddddededB dedB deddf
 fd	d
Zdej	dej	dej	de
dej	f
ddZ  ZS )PixtralHFTransformerNr   )num_hidden_layers_overrider   r  r  r/  r   r`   c                   sD   t    |d u r j}n|}t fddt|D | _d S )Nc                    s$   g | ]}t   d | dqS )z.layers.)r  r  r   )r-  )r   	layer_idxr  r   r  r\   r]   r    s    z1PixtralHFTransformer.__init__.<locals>.<listcomp>)rb   rc   r]  r  r  r  r  )re   r  r  r/  r   r]  rf   r1  r]   rc     s   

zPixtralHFTransformer.__init__rc  r#  r$  return_all_hidden_statesc                 C   s8   |g}| j D ]}||||}|r|| q|r|S |S ra   )r  r   )re   rc  r#  r$  r2  hidden_states_poolr  r\   r\   r]   r1    s   

zPixtralHFTransformer.forwardra   )rT   rU   rV   r   r#   r   r   rc   rY   rZ   r   r1  r   r\   r\   rf   r]   r.    s6    r.  c                       s   e Zd Z	ddddddededB dedB dedB ded	df fd
dZdddde	e
j de	e dB dedB d	ee
jdf fddZdeeee
jf  d	ee fddZ  ZS )PixtralHFVisionModelNr   )r/  require_post_normr   r  r  r/  r5  r   r`   c                   s   t    || _t|j|j|j|jdd| _t|jdd| _	t
|||| dd| _|j}t| jj|jkrEtd| dt| jj d	|d
u rOd}t|t|  j| _t|  j| _t|| j| _d S )NFr  r   r   z.transformer)r  r/  r   zThe original encoder only has z layers, but you requested z layers.Tz1PixtralHFVisionModel does not have post-layernorm)rb   rc   r  r   r[  r  r{   r  r   r  r.  r  r]  r   r  r   r  r  r|  rm  r   patch_positional_embedding)re   r  r  r/  r5  r   r]  msgrf   r\   r]   rc     s:   
	
zPixtralHFVisionModel.__init__)select_layersfeature_select_strategyrL   r8  r9  .c                   s    fdd|D }dd |D }dd |D }t j|dd} |}t| jj jj d j} 	||}t
rItjjjdd |D }	nd	d
lm}
 |
dd |D |}	 j||	||dud}t|d| jj|d}t |d	|S )a~  
        Args:
            pixel_values: Each image to be processed will be a separate tensor
                in pixel_values. This means it will be a list of tensors
                because multiple requests batched can have multiple images,
                each with their own shape potentially
            select_layers: Layer indices whose features should be
                concatenated and used as the visual encoder output. If none
                are provided, the last layer is used.

        Returns:
            image_features: tensor of token features for
                all tokens of all images of shape (N_toks, D)
        c                    r  r  r  r!  rp   r\   r]   r  4  r  z0PixtralHFVisionModel.forward.<locals>.<listcomp>c                 S   r  r  r  r  r\   r\   r]   r  8  r  c                 S   r  rd  r  r  r\   r\   r]   r  9  r  r<   r   )r  c                 S   r  r  r  r  r\   r\   r]   r  H  r  r   r  c                 S   r  r  r  r  r\   r\   r]   r  P  r  N)r2  )r8  max_possible_layersr9  )rY   r   r  r   r  ry   r{   r  rm  r6  r  r  r  r  r  r  r  r  r  rG   r]  r%  r  )re   rL   r8  r9  r  r  r  position_idsposition_embeddingr#  r  r  r\   rp   r]   r1    sH   


	zPixtralHFVisionModel.forwardr3  c                 C   s   g d}t |  }t }t| jj}|D ]L\}}|dr,t|dd }||kr,q|D ]\}	}
}|
|vr8q.|	|
|	}|| }|j
}||||  n|| }t|dt}||| || q|S )N))r  z.q_projr  )r  z.k_projr  )r  z.v_projr  )r  z
.gate_projr   )r  z.up_projr<   ztransformer.layersr?  r   weight_loader)r   rK  setr   r  r  r   r   r%  replacer=  rT  r$   add)re   r3  stacked_params_mappingparams_dictloaded_paramslayer_countr   loaded_weightr0  
param_nameweight_nameshard_idrD  r=  r\   r\   r]   rL  g  s,   

z!PixtralHFVisionModel.load_weightsra   )rT   rU   rV   r   r#   r   r   r   rc   r[   rY   rZ   rE   r   r1  r   r>  rL  r   r\   r\   rf   r]   r4    s>    0

,Ir4  )r  collections.abcr   r   r   dataclassesr   r   	functoolsr   typingr   r	   rY   torch.nnr  torch.nn.functionalr  r  &mistral_common.protocol.instruct.chunkr
   r   )mistral_common.protocol.instruct.messagesr   (mistral_common.protocol.instruct.requestr   +mistral_common.tokens.tokenizers.multimodalr   PILr   transformersr   r   r   transformers.image_utilsr   4transformers.models.pixtral.image_processing_pixtralr   r  r  r   r   r   $transformers.tokenization_utils_baser   vllm.configr   vllm.config.multimodalr   vllm.distributedr   r   %vllm.model_executor.layers.activationr   vllm.model_executor.layers.convr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr    r!   r"   'vllm.model_executor.layers.quantizationr#   -vllm.model_executor.model_loader.weight_utilsr$   vllm.multimodalr%   r&   vllm.multimodal.inputsr'   r(   r)   r*   vllm.multimodal.parser+   r,   r-   vllm.multimodal.processingr.   r/   $vllm.multimodal.processing.processorr0   r1   r2   r3   r4   r5   vllm.platformsr6   vllm.sequencer7   vllm.tokenizersr8   vllm.tokenizers.mistralr9   vllm.utils.tensor_schemar:   r;   
interfacesr=   r>   r?   r@   module_mappingrA   utilsrB   rC   visionrD   rE   rF   rG   xformersrH   r  is_cudahas_device_capabilityr  ImportErrorr  rK   r^   r   r   r   register_processorModuler   r  rZ   rj  r   ra  rw  r   r  r  r  r  r  r[   r  r
  r  r  r  r  r  r  r-  r.  r4  r\   r\   r\   r]   <module>   s    W1:=

 k

)
\G
%4&L(/