o
    
۾i                  
   @   s   d dl Z d dlmZmZmZ d dlmZmZ d dlm	Z	 d dl
mZmZ d dlZd dlmZ d dlm  mZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZm Z m!Z! d dl"m#Z# d dl$m%Z& d dl'm(Z(m)Z)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2m3Z3 d dl4m5Z5 d dl6m7Z7 d dl8m9Z9 d dl:m;Z;m<Z<m=Z= d dl>m?Z? d dl@mAZA d dlBmCZCmDZD d dlEmFZFmGZGmHZHmIZI d dlJmKZKmLZLmMZM d dlNmOZOmPZP d dlQmRZRmSZSmTZTmUZUmVZVmWZW d dlXmYZY d dlZm[Z[ d d l\m]Z] d d!l^m_Z_ d d"l`maZambZb d#d$lcmdZdmeZemfZfmgZg d#d%lhmiZi d#d&ljmkZkmlZlmmZm d#d'lnmoZompZpmqZqmrZr zd d(lsmtZu eYv rLeYwd)rLd*Zxnd+ZxW n eyy[   d*ZxY nw d,Zzd-ej{d.e|fd/d0Z}G d1d2 d2eaZ~G d3d4 d4ZG d5d6 d6eSZG d7d8 d8eOe ZG d9d: d:eRe ZeCjeeed;G d<d= d=ej{eeefegZeG d>d? d?Zd@ejdAejd.ejfdBdCZdDedEedFedGed.ejf
dHdIZdJejdKejd@ejd.eejejf fdLdMZG dNdO dOej{ZG dPdQ dQej{ZG dRdS dSej{ZG dTdU dUej{ZdVeej d.ejfdWdXZG dYdZ dZej{ZG d[d\ d\ej{ZG d]d^ d^ej{ZdAejd_eeeef  d`ed.eej fdadbZG dcdd ddeoe  ZG dedf dfej{ZG dgdh dhej{ZG didj djej{ZG dkdl dlej{ZG dmdn dnej{ZdS )o    N)IterableMappingSequence)	dataclassfields)cached_property)	AnnotatedLiteral)
ImageChunk	TextChunk)UserMessage)ChatCompletionRequest)ImageEncoder)Image)BatchFeaturePixtralVisionConfig
TensorType)
ImageInput)_num_image_tokens)PixtralRotaryEmbeddingapply_rotary_pos_embposition_ids_in_meshgrid)	TextInput)
VllmConfig)BaseDummyOptions)divide$get_tensor_model_parallel_world_size)get_act_and_mul_fn)Conv2dLayer)RMSNorm)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)QuantizationConfig)default_weight_loader)MULTIMODAL_REGISTRYMultiModalKwargsItems)MultiModalDataDictMultiModalFieldConfigMultiModalUUIDDictNestedTensors)ImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderProcessorInputs)BaseMultiModalProcessorBaseProcessingInfoMultiModalProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)current_platform)IntermediateTensors)cached_tokenizer_from_config)MistralTokenizer)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPP)MultiModelKeys)StageMissingLayerinit_vllm_registered_modelmaybe_prefix)VisionEncoderInfoVisionFeatureSelectStrategyis_vit_use_data_parallelresolve_visual_encoder_outputs)opsd   FTpatch_mergelayerreturnc                 C   s   | d u pt | tS N)
isinstancerB   )rL    rP   V/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/pixtral.py_is_layer_none_or_staged`   s   rR   c                	   @   sP   e Zd ZU dZdZed ed< eej	e
ej	 B eddddddhdf ed	< d
S )PixtralImagePixelInputsz
    Dimensions:
        - bn: Batch size * number of images
        - c: Number of channels (3)
        - h: Height of each image
        - w: Width of each image

    The result of stacking `ImageEncoding.tokens` from each prompt.
    pixel_valuestypebn   hw)dynamic_dimsimagesN)__name__
__module____qualname____doc__rU   r	   __annotations__r   torchTensorlistr;   rP   rP   rP   rQ   rS   d   s   
 
rS   c                       s   e Zd ZdZdeddf fddZedefddZe	de
fd	d
Ze	de
fddZe	de
fddZe	de
fddZe	de
fddZ			ddeee B dB deee B dB deeB dB deeef fddZ  ZS )PixtralProcessorAdapterzo
    Provide a HF-compatible interface for
    `mistral_common.tokens.tokenizers.multimodal.ImageEncoder`.
    	tokenizerrM   Nc                    s   t    || _d S rN   )super__init__re   selfre   	__class__rP   rQ   rg   }   s   

z PixtralProcessorAdapter.__init__c                 C   s   | j jj}t|tsJ |S rN   )re   instruct
mm_encoderrO   r   )ri   image_encoderrP   rP   rQ   image_processor   s   
z'PixtralProcessorAdapter.image_processorc                 C   
   | j jjS rN   )ro   special_ids	img_breakri   rP   rP   rQ   image_break_id      
z&PixtralProcessorAdapter.image_break_idc                 C   rp   rN   )ro   rq   imgrs   rP   rP   rQ   image_token_id   ru   z&PixtralProcessorAdapter.image_token_idc                 C   rp   rN   )ro   rq   img_endrs   rP   rP   rQ   image_end_id   ru   z$PixtralProcessorAdapter.image_end_idc                 C   rp   rN   )ro   	mm_configmax_image_sizers   rP   rP   rQ   
image_size   ru   z"PixtralProcessorAdapter.image_sizec                 C   rp   rN   )ro   rz   image_patch_sizers   rP   rP   rQ   
patch_size   ru   z"PixtralProcessorAdapter.patch_sizetextr[   return_tensorsc                 K   s   |d u rg }t |ts|g}|d u rg }t |ts|g}|s+| |j}dt|iS tdd |D r8tdttj  }ttj  }|D ] }| 	t
|d}	t|	j}
t|	j}||
 || qFtt|d  t|d|dS )N	input_idsc                 s   s    | ]	}t |d kV  qdS )r   N)len).0trP   rP   rQ   	<genexpr>   s    z3PixtralProcessorAdapter.__call__.<locals>.<genexpr>zYou've passed text inputs instead of token inputs. Make sure to process your input via `mistral_common`'s tokenizer or pass a chat completion request. For more info, see: https://github.com/vllm-project/vllm/issues/8411.image)r   r[   )rO   rc   re   r   ra   tensorany
ValueErrorrb   ro   r
   r   tokensappendr   catexpandr   )ri   r   r[   r   kwargsr   images_processedimages_tokensr   image_inputsimage_processedimage_tokensrP   rP   rQ   __call__   s8   


z PixtralProcessorAdapter.__call__)NNN)r\   r]   r^   r_   r9   rg   propertyr   ro   r   intrt   rw   ry   r|   r~   r   rc   r   strr   r   r*   r   __classcell__rP   rP   rj   rQ   rd   w   s6    

rd   c                	   @   s   e Zd ZdefddZdefddZdeee	dB f fddZ
	dd	edB fd
dZddde	de	d	edB de	fddZdefddZdS )PixtralProcessingInforM   c                 C   s"   t | jj}t|tstd|S )Nz.This model requires `--tokenizer-mode mistral`)r8   ctxmodel_configrO   r9   r   rh   rP   rP   rQ   get_tokenizer   s   
z#PixtralProcessingInfo.get_tokenizerc                 C   s   t |  S rN   )rd   r   rs   rP   rP   rQ   get_hf_processor   s   z&PixtralProcessingInfo.get_hf_processorNc                 C   s   dd iS )Nr   rP   rs   rP   rP   rQ   get_supported_mm_limits      z-PixtralProcessingInfo.get_supported_mm_limits	processorc                 C   s    |d u r|   }t|j|jdS )N)r|   r~   )r   r   r|   r~   )ri   r   rP   rP   rQ   get_vision_config   s   z'PixtralProcessingInfo.get_vision_config)r   image_widthimage_heightc                C   s4   |d u r|   }|jtd||f\}}|| S )NRGB)r   ro   _image_to_num_tokensr   new)ri   r   r   r   ncolsnrowsrP   rP   rQ   get_num_image_tokens   s   z*PixtralProcessingInfo.get_num_image_tokensc                 C   s   |   j}|jj}t||dS )N)widthheight)r   ro   rz   r{   r,   )ri   ro   r{   rP   rP   rQ   !get_image_size_with_most_features   s   
z7PixtralProcessingInfo.get_image_size_with_most_featuresrN   )r\   r]   r^   r9   r   rd   r   r   r   r   r   r   r   r,   r   rP   rP   rP   rQ   r      s&    

r   c                	   @   s   e Zd Zdeeef defddZ	ddedeeef deeef dB defdd	Z		ddedeeef deeef dB de
fd
dZdS )PixtralDummyInputsBuilder	mm_countsrM   c                 C   s   dS )N rP   )ri   r   rP   rP   rQ   get_dummy_text   s   z(PixtralDummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc                 C   sB   | dd}| j \}}|r| dnd }d| j||||diS )Nr   r   )r   r   
num_images	overrides)getinfor   _get_dummy_images)ri   r   r   r   r   target_widthtarget_heightimage_overridesrP   rP   rQ   get_dummy_mm_data  s   z+PixtralDummyInputsBuilder.get_dummy_mm_datac                 C   s   | j  }| |}| |||}|dg }ddi}ttt|dgdd |D dgd}	|j	|	}
|
j
}| j |}t|||d	S )
Nr   
truncationF)r   c                 s   s    | ]}t |d V  qdS )r   N)r
   )r   r   rP   rP   rQ   r   *  s    zGPixtralDummyInputsBuilder.get_dummy_processor_inputs.<locals>.<genexpr>)content)messages)promptmm_itemstokenization_kwargs)r   r   r   r   r   r   r   r   mistralencode_chat_completionr   parse_mm_datar/   )ri   r   r   r   re   
dummy_textdummy_mm_datadummy_imagesr   requestresdummy_tokensdummy_mm_itemsrP   rP   rQ   get_dummy_processor_inputs  s,   


z4PixtralDummyInputsBuilder.get_dummy_processor_inputsrN   )r\   r]   r^   r   r   r   r   r   r'   r   r/   r   rP   rP   rP   rQ   r      s,    


r   c                       s   e Zd Zdeeef deeef deeef fddZde	deeef de
dee fdd	Z	
ddeee B de	deeef deeef ded
B deee eef f fddZ  ZS )PixtralMultiModalProcessor	hf_inputshf_processor_mm_kwargsrM   c                 C   s   t tddS )Nr   )r[   )dictr(   batched)ri   r   r   rP   rP   rQ   _get_mm_fields_config<  s   z0PixtralMultiModalProcessor._get_mm_fields_configr   out_mm_kwargsc                    sN   | j jdi |j jjdtf fdd}tdd|dgS )Nitem_idxc                    s^    dt}|| }jtd|j|jf\}}g|  g | }|d< t	
|S )Nr   r   r   )	get_itemsr+   get_image_sizero   r   r   r   r   r   r5   select_token_id)r   r[   r|   r   r   r   rt   ry   rw   r   r   rP   rQ   get_replacementO  s   
zGPixtralMultiModalProcessor._get_prompt_updates.<locals>.get_replacementr   r   )modalitytargetreplacementrP   )r   r   rt   rw   ry   r   r3   )ri   r   r   r   r   rP   r   rQ   _get_prompt_updatesC  s   z.PixtralMultiModalProcessor._get_prompt_updatesNr   mm_data_itemsr   mm_uuidsc           	         s&   t  j|||||d\}}}||dfS )N)r   r   r   r   r   T)rf   _cached_apply_hf_processor)	ri   r   r   r   r   r   
prompt_idsmm_info_rj   rP   rQ   r   d  s   
	z5PixtralMultiModalProcessor._cached_apply_hf_processorrN   )r\   r]   r^   r   r   r*   objectr(   r   r-   r&   r   r4   r   rc   r   r)   tupler2   boolr   r   rP   rP   rj   rQ   r   ;  s>    





'


r   )r   dummy_inputsc                       s6  e Zd ZededededB fddZddd	ed
ef fddZde	de
dB fddZde
deejdf fddZde	defddZ		d*dejdB dejdedB dejdB de	dejeB fddZdejdejdB fddZdeeeejf  fd d!Zdefd"d#Zd$edefd%d&Zd'edefd(d)Z  ZS )+PixtralForConditionalGenerationr   irM   Nc                 C   s   | drd S td)Nr   z Only image modality is supported)
startswithr   )clsr   r   rP   rP   rQ   get_placeholder_str  s   
z3PixtralForConditionalGeneration.get_placeholder_strr   prefixvllm_configr   c                   sH  t    |jj}|jj}|| _|| _dd ttD   fdd| jj	 
 D }tdi || _| | t||jt|dd| _W d    n1 sOw   Y  | |d; t| j| _| jjrmt| jjdd	nd | _| jjtkrt| jj| jjd
dnd | _t| j|jjd| _W d    n1 sw   Y  | jj| _d S )Nc                 S   s   h | ]}|j qS rP   )name)r   fieldrP   rP   rQ   	<setcomp>  s    z;PixtralForConditionalGeneration.__init__.<locals>.<setcomp>c                    s   i | ]\}}| v r||qS rP   rP   )r   keyvaluedataclass_fieldsrP   rQ   
<dictcomp>  s
    z<PixtralForConditionalGeneration.__init__.<locals>.<dictcomp>language_model)r   	hf_configr   r   h㈵>epsF)vision_encoder_dimspatial_merge_sizeuse_mlp_biasdimrP   ) rf   rg   r   r   multimodal_configconfigr   VisionEncoderArgsvision_configto_dictitemsvision_args_mark_language_modelrC   text_configrD   r   _mark_tower_modelVisionTransformervision_encoderadd_pre_mm_projector_layer_normr   hidden_sizepre_mm_projector_normmm_projector_idPATCH_MERGEPatchMergerr   patch_mergerVisionLanguageAdaptervision_language_adaptermake_empty_intermediate_tensors)ri   r   r   r  r  r
  rj   r   rQ   rg     sJ   


	

z(PixtralForConditionalGeneration.__init__r   c                 K   s$   | dd }|d u rd S td|dS )Nr[   rT   )rU   r[   )poprS   )ri   r   r[   rP   rP   rQ   _parse_and_validate_image_input  s   z?PixtralForConditionalGeneration._parse_and_validate_image_inputimage_input.c                    s   |d }|  |}dd |D }t|}| jd ur| |}| jd urG| jj | jjd  fdd|D }fdd|D }| j||d}| |}t	||}|S )Nr[   c                 S      g | ]}|j d  qS r   shape)r   image_featurerP   rP   rQ   
<listcomp>      zHPixtralForConditionalGeneration._process_image_input.<locals>.<listcomp>   c                    s(   g | ]}|j d    |j d   fqS )r<   r$  r  r   rv   )r~   rP   rQ   r"    s    c                    s   g | ]}|  qS rP   rP   )r   feature_size)spatial_merge_size_squarerP   rQ   r"    s    )image_sizes)
r  ra   r   r  r  r
  r~   r   r  split)ri   r  r[   image_featuresfeature_sizesimg_patch_dimsimage_embedsrP   )r~   r'  rQ   _process_image_input  s*   







z4PixtralForConditionalGeneration._process_image_inputc                 K   s&   | j di |}|d u rg S | |S )NrP   )r  r.  )ri   r   r  rP   rP   rQ   embed_multimodal  s   
z0PixtralForConditionalGeneration.embed_multimodalr   	positionsintermediate_tensorsinputs_embedsc                 K   s$   |durd}| j j||||d}|S )zRun forward pass for pixtral.N)r2  )r   model)ri   r   r0  r1  r2  r   hidden_statesrP   rP   rQ   forward  s   	z'PixtralForConditionalGeneration.forwardr4  c                 C   s   | j |S rN   )r   compute_logits)ri   r4  rP   rP   rQ   r6    s   z.PixtralForConditionalGeneration.compute_logitsweightsc              
      s   dt ttjf fdddt ttjf fdddt ttjf fdd dt ttjf fdd	jd ur<tj ni jd urJtj ni jd urXtj ni j	d urftj	 ni  	f
d
d}j
|  d S )Nweightc                 S      | d  dS )Nr   )r  vision_towerr   r8  rP   rP   rQ   is_vision_encoder_weights     zOPixtralForConditionalGeneration.load_weights.<locals>.is_vision_encoder_weightsc                 S   r9  )Nr   )r  multi_modal_projectorr;  r<  rP   rP   rQ   is_vision_lang_adapter_weights  s   zTPixtralForConditionalGeneration.load_weights.<locals>.is_vision_lang_adapter_weightsc                 S   r9  )Nr   r  r;  r<  rP   rP   rQ   is_patch_merger  r>  zEPixtralForConditionalGeneration.load_weights.<locals>.is_patch_mergerc                 S   r9  )Nr   r  r;  r<  rP   rP   rQ   is_pre_mm_projector_norm
  r>  zNPixtralForConditionalGeneration.load_weights.<locals>.is_pre_mm_projector_normc               	   3   s   	D ]\} }| |frBt jrqd| ddd  }|}|d urAt  t|| W d    n1 s<w   Y  q | |frxt jrNqd| ddd  }| }t  t|| W d    n1 srw   Y  q| |frt j	rqd| ddd  }| }t  t|| W d    n1 sw   Y  q| |frt j
rqd| ddd  }|}|d urt  t|| W d    n1 sw   Y  q| d} | |fV  qd S )N.r<   zlanguage_model.)rR   r  joinr)  r   ra   no_gradr$   r  r  r  removeprefix)r   rY   trimmed_nameparam
rA  rB  r=  r@  patch_merger_dictpre_mm_projector_norm_dictri   vision_encoder_dictvision_lang_adapter_dictr7  rP   rQ   llm_weights_generator#  sV   










zKPixtralForConditionalGeneration.load_weights.<locals>.llm_weights_generator)r   r   ra   rb   r  r   named_parametersr  r  r  r   load_weights)ri   r7  rN  rP   rI  rQ   rP    s,   



-z,PixtralForConditionalGeneration.load_weightsc                 C   s   t jddddS )Nr   r  r  )r   	connectortower_model)rA   from_string_fieldrs   rP   rP   rQ   get_mm_mappingR  s
   z.PixtralForConditionalGeneration.get_mm_mappingnum_image_tokensc                 C   s(   t | dd d u r
|S | jj}||d  S Nr  r$  getattrr
  r   )ri   rU  
merge_sizerP   rP   rQ   get_num_mm_encoder_tokensY     z9PixtralForConditionalGeneration.get_num_mm_encoder_tokensnum_vision_tokensc                 C   s(   t | dd d u r
|S | jj}||d  S rV  rW  )ri   r\  rY  rP   rP   rQ   get_num_mm_connector_tokens_  r[  z;PixtralForConditionalGeneration.get_num_mm_connector_tokens)NN)r\   r]   r^   classmethodr   r   r   r   rg   r   rS   r  r   ra   rb   r.  r=   r/  r7   r5  r6  r   rP  rA   rT  rZ  r]  r   rP   rP   rj   rQ   r   x  sN    0



Tr   c                   @   s   e Zd ZU eed< eed< eed< eed< eed< eed< eed< eed< eed	< d
Zeed< dZeed< dZ	eed< dZ
eed< dS )r  r  num_channelsr|   r~   intermediate_sizenum_hidden_layersnum_attention_heads
rope_thetarw   Tadapter_biasr<   r   Fr  r   r  N)r\   r]   r^   r   r`   floatrd  r   r   r  r  r   rP   rP   rP   rQ   r  g  s   
 r  	freqs_cisxc                    sl   |j   dks	J | j|jd |jd fks%J | j|jd |jd ff fddt|jD }| j| S )zd
    freqs_cis: complex - (seq_len, head_dim / 2)
    x: complex - (bsz, seq_len, head_dim / 2)
    r<   r   c                    s,   g | ]\}}|d ks| d  kr|nd qS r<   rP   )r   r   dndimrP   rQ   r"    s   , z*_reshape_for_broadcast.<locals>.<listcomp>)rk  r   	enumerateview)rf  rg  r   rP   rj  rQ   _reshape_for_broadcastx  s   
rn  r  r   r   thetac           
      C   s   d|t d| d |    }t j||jd}t j||jd}t ||ddd  }t ||ddd  }t j|dddddf d|d|dddddf |ddgdd}	t t |	|	S )	z
    freqs_cis: 2D complex tensor of shape (height, width, dim // 2)
        to be indexed by (height, width) position tuples
    g      ?r   r$  deviceNr<   r   r  )	ra   arangere  rq  outerr   repeatpolar	ones_like)
r  r   r   ro  freqsrX   rY   freqs_hfreqs_wfreqs_2drP   rP   rQ   precompute_freqs_cis_2d  s   r{  xqxkc                 C   s   t |  jg | jd d ddR  }t | jg |jd d ddR  }|jt jks4J t||}t || 	d}t || 	d}|
| |
|fS )Nr   r$  rW   )ra   view_as_complexre  reshaper   dtype	complex64rn  view_as_realflattentype_as)r|  r}  rf  xq_xk_xq_outxk_outrP   rP   rQ   apply_rotary_emb_vit  s   ,,
r  c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )FeedForwardargsc                    s^   t    |jd usJ tj|j|jdd| _tj|j|jdd| _tj|j|jdd| _d S NFbias)	rf   rg   r`  nnLinearr  w1w2w3ri   r  rj   rP   rQ   rg     s
   
zFeedForward.__init__rg  rM   c                 C   s    |  t| || | S rN   )r  Fsilur  r  ri   rg  rP   rP   rQ   r5    s    zFeedForward.forward	r\   r]   r^   r  rg   ra   rb   r5  r   rP   rP   rj   rQ   r    s    r  c                       D   e Zd Zdef fddZdejdejdejdejfdd	Z  ZS )
	Attentionr  c                    s   t    || _|j|j rJ |j| _|j|j | _tj|j|jdd| _	tj|j|jdd| _
tj|j|jdd| _tj|j|jdd| _d S r  )rf   rg   r  r  rb  n_headshead_dimr  r  wqwkwvwor  rj   rP   rQ   rg     s   
zAttention.__init__rg  maskrf  rM   c                 C   s   |j \}}}| || || |}}}	|||| j| j}|||| j| j}|	||| j| j}	t|||d\}}trJt	j
|||	|d}
n"|dd}|dd}|	dd}	tjj|||	|d}
|
dd}
|
||| j| j }
| |
S )N)rf  	attn_biasr<   r$  	attn_mask)r   r  r  r  r  r  r  r  USE_XFORMERS_OPSxopsmemory_efficient_attention	transposer  
functionalscaled_dot_product_attentionr  )ri   rg  r  rf  batchpatchesr   qkvoutrP   rP   rQ   r5    s   "
zAttention.forwardr  rP   rP   rj   rQ   r    s    r  c                       r  )
TransformerBlockr  c                    sB   t    t|| _t|| _t|jdd| _t|jdd| _	d S )Nr   r   )
rf   rg   r  	attentionr  feed_forwardr   r  attention_normffn_normr  rj   rP   rQ   rg     s
   


zTransformerBlock.__init__rg  r  rf  rM   c                 C   s>   | j j| |||d}|| }| j| |}|| }|S Nr  rf  r  r5  r  r  r  )ri   rg  r  rf  rrX   r  rP   rP   rQ   r5    s   zTransformerBlock.forwardr  rP   rP   rj   rQ   r    s    r  c                       sH   e Zd Zdef fddZdejdejdejdB dejfd	d
Z  ZS )Transformerr  c                    s:   t    tj | _t|jD ]
}| jt	| qd S rN   )
rf   rg   ra   r  
ModuleListlayersrangera  r   r  )ri   r  r   rj   rP   rQ   rg     s
   
zTransformer.__init__rg  r  rf  NrM   c                 C   s   | j D ]	}||||d}q|S r  )r  )ri   rg  r  rf  rL   rP   rP   rQ   r5    s   
zTransformer.forwardr  rP   rP   rj   rQ   r    s    r  patch_embeds_listc                 C   s   t dd | D }|S )Nc              	   S   sF   g | ]}t jt jt |jd  t |jd ddddddqS )r   ij)indexingr  r$  )ra   stackmeshgridrr  r   r  r   prP   rP   rQ   r"    s    	z%position_meshgrid.<locals>.<listcomp>)ra   r   )r  r0  rP   rP   rQ   position_meshgrid  s   	r  c                       s   e Zd Zdef fddZedefddZedej	j
fddZedejfd	d
ZedejfddZdeej dejfddZ  ZS )r  r  c                    st   t    || _t|j|j|j|jdd| _t|jdd| _	t
|| _| jj| jj }|d dks5J dd | _d S )NFin_channelsout_channelskernel_sizestrider  r   r   r$  r   zROPE requires even head_dim)rf   rg   r  r   r_  r  r~   
patch_convr   ln_prer  transformerrb  
_freqs_cis)ri   r  r  rj   rP   rQ   rg   #  s   


zVisionTransformer.__init__rM   c                 C   s   | j j| j j S rN   )r  r|   r~   rs   rP   rP   rQ   max_patches_per_side4  s   z&VisionTransformer.max_patches_per_sidec                 C      t |  jS rN   )next
parametersrq  rs   rP   rP   rQ   rq  8     zVisionTransformer.devicec                 C   r  rN   )r  r  r  rs   rP   rP   rQ   r  <  r  zVisionTransformer.dtypec                 C   sV   | j d u rt| jj| jj | j| j| jjd| _ | j j| jkr(| j j| jd| _ | j S )N)r  r   r   ro  rp  )	r  r{  r  r  rb  r  rc  rq  tors   rP   rP   rQ   rf  @  s   
zVisionTransformer.freqs_cisr[   c           
         s    fdd|D }dd |D }dd |D }t j|dd} |}t| j} j|dddf |dddf f }trNtj	j
jd	d |D }ndd
lm} |dd |D |} j|||d}	t |	d|S )a  
        Args:
            images: list of N_img images of variable sizes,
                each of shape (C, H, W)
        Returns:
            image_features: tensor of token features for
                all tokens of all images of shape (N_toks, D)
        c                    $   g | ]}  |d  jqS r  r  	unsqueezer  r  r%  rs   rP   rQ   r"  \      z-VisionTransformer.forward.<locals>.<listcomp>c                 S       g | ]}| d dd dqS r$  r   r<   r  permuter  rP   rP   rQ   r"  `       c                 S   r  rh  r  r  rP   rP   rQ   r"  a  r#  r<   r  Nr   c                 S       g | ]}|j d  |j d  qS r  r   r  r  rP   rP   rQ   r"  n  r  generate_block_attention_maskc                 S   r  r  r  r  rP   rP   rQ   r"  v  r  r  )ra   r   r  r  r  rq  rf  r  r  fmhar  BlockDiagonalMaskfrom_seqlens,transformers.models.pixtral.modeling_pixtralr  r  r)  squeeze)
ri   r[   r  patch_embedsembed_sizesr0  rf  r  r  r  rP   rs   rQ   r5  O  s&   

&
zVisionTransformer.forward)r\   r]   r^   r  rg   r   r   r  ra   typesDevicerq  r  rb   rf  rc   r5  r   rP   rP   rj   rQ   r  "  s    r  c                       s<   e Zd Zdedef fddZdejdejfddZ  Z	S )	r  r  r  c                    sP   t    t|tsJ tj|j||jd| _t	 | _
tj|||jd| _d S )Nr  )rf   rg   rO   r  r  r  r  rd  w_inGELUgeluw_out)ri   r  r  rj   rP   rQ   rg     s   

zVisionLanguageAdapter.__init__rg  rM   c                 C   s   |  | | |S rN   )r  r  r  r  rP   rP   rQ   r5    s   zVisionLanguageAdapter.forward)
r\   r]   r^   r  r   rg   ra   rb   r5  r   rP   rP   rj   rQ   r  ~  s    r  c                	       s   e Zd ZdZ	ddedededdf fdd	Zd
ejde	e
eef  dejfddZd
ejde	e
eef  dejfddZ  ZS )r  z<
    Learned merging of spatial_merge_size ** 2 patches
    Fr   r   r  rM   Nc                    s8   t    ||d  }|| _|| _tj|||d| _d S )Nr$  r  )rf   rg   r   mlp_input_dimr  r  merging_layer)ri   r   r   r  r  rj   rP   rQ   rg     s   
zPatchMerger.__init__rg  r(  c                 C   s8   t dd |D t|ksJ | ||}| |}|S )Nc                 S      g | ]\}}|| qS rP   rP   r   rX   rY   rP   rP   rQ   r"        z'PatchMerger.forward.<locals>.<listcomp>)sumr   r  r  )ri   rg  r(  rP   rP   rQ   r5    s   
zPatchMerger.forwardc                 C   sL   t ||| jd}g }|D ]}|jd }||d|  qtj|ddS )a  
        Args:
            x: (N, D) where N is flattened and concatenated patch tokens
                for all images
            image_sizes: list of tuple of (height, width) in tokens for
                each image
        Returns:
            image_features: reorders patch tokens so each grid of
                (spatial_merge_size, spatial_merge_size) is contiguous.
                now (N / spatial_merge_size ** 2, D * spatial_merge_size ** 2)
        )rg  r(  r   r   r   r  )get_sub_gridsr   r   r   rm  r   ra   r   )ri   rg  r(  	sub_gridspermuted_tensorgrid	n_patchesrP   rP   rQ   r    s   
zPatchMerger.permute)F)r\   r]   r^   r_   r   r   rg   ra   rb   rc   r   r5  r  r   rP   rP   rj   rQ   r    s4    
r  r(  r   c                 C   s   dd |D }| j d }g }|}t| |D ]<\}}|| \}	}
||	|
|dddd d d d d d d f }tjjj|||d}|d|||d}|	|d  q|S )Nc                 S   r  rP   rP   r  rP   rP   rQ   r"    r  z!get_sub_grids.<locals>.<listcomp>r   r$  r   r<   )r  r  )
r   rl  r)  rm  r  ra   r  r  unfoldr   )rg  r(  r   tokens_per_imageri  all_img_sub_gridssub_grid_sizeimage_indexr   rX   rY   
image_gridr  rP   rP   rQ   r    s"   

r  c                   @   sj   e Zd ZdededefddZdefddZdefdd	Zdefd
dZdededeeef fddZ	dS )PixtralHFEncoderInfor   r   rM   c                C   s   | j ||d\}}|| S )N)r   r   )get_patch_grid_size)ri   r   r   r   r   rP   rP   rQ   r     s
   
z)PixtralHFEncoderInfo.get_num_image_tokensc                 C   s   | j jS rN   )r  r|   rs   rP   rP   rQ   r     r   z#PixtralHFEncoderInfo.get_image_sizec                 C   s   t | jdd}| jj| S )Nr   r<   )rX  r   r  r~   )ri   r   rP   rP   rQ   get_patch_size
  s   z#PixtralHFEncoderInfo.get_patch_sizec                 C   s   |   |  }}|| S rN   )r   r  )ri   r|   r~   rP   rP   rQ   get_patch_grid_length  s   z*PixtralHFEncoderInfo.get_patch_grid_lengthc          
      C   st   |    }}|   }}t|| || }|dkr+tt|| }tt|| }t||f||f\}}	|	|fS )Nr<   )r   r  maxr   mathfloor _get_pixtral_hf_num_image_tokens)
ri   r   r   	max_width
max_heightpatch_widthpatch_heightratior   r   rP   rP   rQ   r    s   z(PixtralHFEncoderInfo.get_patch_grid_sizeN)
r\   r]   r^   r   r   r   r  r  r   r  rP   rP   rP   rQ   r    s$    

r  c                
       sR   e Zd Z	ddddededB deddf fdd	Zd
ejdejfddZ	  Z
S )PixtralHFMLPNr   r   r  quant_configr   rM   c                   st   t    t }|jd usJ t|j|jgd d|| d|d| _t|j|jd|| d|d| _t	|j
| _d S )Nr$  F.gate_up_proj)
input_sizeoutput_sizesr  r  r   
disable_tpz
.down_projr  output_sizer  r  r   r  )rf   rg   rG   r`  r    r  gate_up_projr"   	down_projr   
hidden_actact_and_mulri   r  r  r   use_data_parallelrj   rP   rQ   rg   /  s(   

zPixtralHFMLP.__init__rg  c                 C   s*   |  |\}}| |}| |\}}|S rN   )r  r  r  )ri   rg  gate_upr   rP   rP   rQ   r5  M  s   
zPixtralHFMLP.forwardrN   r\   r]   r^   r   r#   r   rg   ra   rb   r5  r   rP   rP   rj   rQ   r  .  s    r  c                       sl   e Zd Z	ddddededB deddf fdd	Zd
ejdejdejde	ejejdB f fddZ
  ZS )PixtralHFAttentionNr   r   r  r  r   rM   c             	      s   t    || _|j|j rJ |j| _|j|j | _| j| j |jks&J t }t|j| j| jd|| d|d| _	t
|j|jd|| d|d| _|rOdnt | _t|j| j| _d S )NF	.qkv_proj)r  	head_sizetotal_num_headsr  r  r   r  z.o_projr  r<   )rf   rg   r  r  rb  r"  r  rG   r!   qkv_projr"   o_projr   tp_sizer   r  r  rj   rP   rQ   rg   U  s6   
	
zPixtralHFAttention.__init__r4  attention_maskposition_embeddingsc                 C   s*  |  \}}}| |\}}|jddd\}}	}
|||| j| jdd}|	||| j| jdd}	|
||| j| j}
|\}}t||	||dd\}}	tri|dd	 }|	dd	 }	t
j||	|
|d}n|
dd}
tjj||	|
|d	}|dd}|||| j| j }| |\}}|d fS )
NrW   r   r  r<   r$  r   )unsqueeze_dimr  r  )sizer#  chunkrm  r  r  r  r   r  
contiguousr  r  r  r  r  r  r$  )ri   r4  r&  r'  r  r  r   
qkv_statesr  r  r  cossinr  attn_outputrP   rP   rQ   r5  |  s(   zPixtralHFAttention.forwardrN   )r\   r]   r^   r   r#   r   rg   ra   rb   r   r5  r   rP   rP   rj   rQ   r  T  s,    'r  c                
       s^   e Zd Z	ddddededB deddf fdd	Zd
ejdejdejdejfddZ	  Z
S )PixtralHFTransformerBlockNr   r   r  r  r   rM   c                   sZ   t    t|jdd| _t||| dd| _t||| dd| _t|jdd| _	d S )Nr   r   z
.attention)r  r   z.feed_forward)
rf   rg   r   r  r  r  r  r  r  r  )ri   r  r  r   rj   rP   rQ   rg     s   
z"PixtralHFTransformerBlock.__init__r4  r&  r'  c                 C   sB   | j j| |||d\}}|| }| j| |}|| }|S )N)r&  r'  r  )ri   r4  r&  r'  r  r   rX   r  rP   rP   rQ   r5    s   
z!PixtralHFTransformerBlock.forwardrN   r  rP   rP   rj   rQ   r0    s,    r0  c                       sl   e Zd Z	dddddededB dedB deddf
 fd	d
Zdej	dej	dej	de
dej	f
ddZ  ZS )PixtralHFTransformerNr   )num_hidden_layers_overrider   r  r  r2  r   rM   c                   sD   t    |d u r j}n|}t fddt|D | _d S )Nc                    s$   g | ]}t   d | dqS )z.layers.)r  r  r   )r0  )r   	layer_idxr  r   r  rP   rQ   r"    s    z1PixtralHFTransformer.__init__.<locals>.<listcomp>)rf   rg   ra  r  r  r  r  )ri   r  r  r2  r   ra  rj   r4  rQ   rg     s   

zPixtralHFTransformer.__init__rg  r&  r'  return_all_hidden_statesc                 C   s8   |g}| j D ]}||||}|r|| q|r|S |S rN   )r  r   )ri   rg  r&  r'  r5  hidden_states_poolrL   rP   rP   rQ   r5    s   

zPixtralHFTransformer.forwardrN   )r\   r]   r^   r   r#   r   r   rg   ra   rb   r   r5  r   rP   rP   rj   rQ   r1    s6    r1  c                       s   e Zd Z	ddddddededB dedB dedB ded	df fd
dZdddde	e
j de	e dB dedB d	ee
jdf fddZdeeee
jf  d	ee fddZ  ZS )PixtralHFVisionModelNr   )r2  require_post_normr   r  r  r2  r8  r   rM   c                   s   t    || _t|j|j|j|jdd| _t|jdd| _	t
|||| dd| _|j}t| jj|jkrEtd| dt| jj d	|d
u rOd}t|t|  j| _t|  j| _t|| j| _d S )NFr  r   r   z.transformer)r  r2  r   zThe original encoder only has z layers, but you requested z layers.Tz1PixtralHFVisionModel does not have post-layernorm)rf   rg   r  r   r_  r  r~   r  r   r  r1  r  ra  r   r  r   r  r  r  rq  r   patch_positional_embedding)ri   r  r  r2  r8  r   ra  msgrj   rP   rQ   rg     s:   
	
zPixtralHFVisionModel.__init__)select_layersfeature_select_strategyrT   r;  r<  .c                   s    fdd|D }dd |D }dd |D }t j|dd} |}t| jj jj d j} 	||}t
rItjjjdd |D }	nd	d
lm}
 |
dd |D |}	 j||	||dud}t|d| jj|d}t |d	|S )a~  
        Args:
            pixel_values: Each image to be processed will be a separate tensor
                in pixel_values. This means it will be a list of tensors
                because multiple requests batched can have multiple images,
                each with their own shape potentially
            select_layers: Layer indices whose features should be
                concatenated and used as the visual encoder output. If none
                are provided, the last layer is used.

        Returns:
            image_features: tensor of token features for
                all tokens of all images of shape (N_toks, D)
        c                    r  r  r  r%  rs   rP   rQ   r"  :  r  z0PixtralHFVisionModel.forward.<locals>.<listcomp>c                 S   r  r  r  r  rP   rP   rQ   r"  >  r  c                 S   r  rh  r  r  rP   rP   rQ   r"  ?  r#  r<   r  )r
  c                 S   r  r  r  r  rP   rP   rQ   r"  N  r  r   r  c                 S   r  r  r  r  rP   rP   rQ   r"  V  r  N)r5  )r;  max_possible_layersr<  )ra   r   r  r   r  r|   r~   r  rq  r9  r  r  r  r  r  r  r  r  r  rH   ra  r)  r  )ri   rT   r;  r<  r  r  r  position_idsposition_embeddingr&  r  r  rP   rs   rQ   r5  $  sH   


	zPixtralHFVisionModel.forwardr7  c                 C   s   g d}t |  }t }t| jj}|D ]L\}}|dr,t|dd }||kr,q|D ]\}	}
}|
|vr8q.|	|
|	}|| }|j
}||||  n|| }t|dt}||| || q|S )N))r   z.q_projr  )r   z.k_projr  )r   z.v_projr  )r  z
.gate_projr   )r  z.up_projr<   ztransformer.layersrC  r$  weight_loader)r   rO  setr   r  r  r   r   r)  replacer@  rX  r$   add)ri   r7  stacked_params_mappingparams_dictloaded_paramslayer_countr   loaded_weightr3  
param_nameweight_nameshard_idrH  r@  rP   rP   rQ   rP  m  s,   

z!PixtralHFVisionModel.load_weightsrN   )r\   r]   r^   r   r#   r   r   r   rg   rc   ra   rb   rF   r   r5  r   rA  rP  r   rP   rP   rj   rQ   r7    s>    0

,Ir7  )r  collections.abcr   r   r   dataclassesr   r   	functoolsr   typingr   r	   ra   torch.nnr  torch.nn.functionalr  r  &mistral_common.protocol.instruct.chunkr
   r   )mistral_common.protocol.instruct.messagesr   (mistral_common.protocol.instruct.requestr   +mistral_common.tokens.tokenizers.multimodalr   PILr   transformersr   r   r   transformers.image_utilsr   4transformers.models.pixtral.image_processing_pixtralr   r	  r  r   r   r   $transformers.tokenization_utils_baser   vllm.configr   vllm.config.multimodalr   vllm.distributedr   r   %vllm.model_executor.layers.activationr   vllm.model_executor.layers.convr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr    r!   r"   'vllm.model_executor.layers.quantizationr#   -vllm.model_executor.model_loader.weight_utilsr$   vllm.multimodalr%   r&   vllm.multimodal.inputsr'   r(   r)   r*   vllm.multimodal.parser+   r,   r-   vllm.multimodal.processingr.   r/   $vllm.multimodal.processing.processorr0   r1   r2   r3   r4   r5   vllm.platformsr6   vllm.sequencer7   vllm.tokenizersr8   vllm.tokenizers.mistralr9   vllm.utils.tensor_schemar:   r;   
interfacesr=   r>   r?   r@   module_mappingrA   utilsrB   rC   rD   visionrE   rF   rG   rH   xformersrI   r  is_cudahas_device_capabilityr  ImportErrorr  Moduler   rR   rS   rd   r   r   r   register_processorr   r  rb   rn  r   re  r{  r   r  r  r  r  r  rc   r  r  r  r  r  r  r  r  r0  r1  r7  rP   rP   rP   rQ   <module>   s    W1<=

 k

)
\G
%4&L(/