o
    i!T                     @   s  d dl mZ d dlmZmZmZ d dlmZmZm	Z	m
Z
mZ d dlZd dlmZ d dlmZmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z*m+Z+m,Z, d dl-m.Z.m/Z/m0Z0 d dl1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8 d dl9m:Z: d dl;m<Z<m=Z= ddl>m?Z?m@Z@mAZAmBZBmCZC ddlDmEZEmFZF ddlGmHZHmIZImJZJmKZKmLZL ddlMmNZN G dd de<ZOG dd dejPZQG dd dejPZRG d d! d!e
ZSG d"d# d#e
ZTG d$d% d%e4ZUed&eUd'ZVG d(d) d)e2eV ZWG d*d+ d+eUZXG d,d- d-e3eX ZYd.e5d/eUfd0d1ZZdd2d3eVd4e2eV d5e(dB d/e3fd6d7Z[d8eSd/e\fd9d:Z]dd;d<d8eSd=e"dB d>e^dB d?e_d/eFf
d@dAZ`e&jae[eZeWdBG dCdD dDejPeAeBeCe@ZbdS )E    )abstractmethod)IterableMappingSequence)	AnnotatedFinalLiteralProtocolTypeVarN)BatchFeatureMistral3ConfigPixtralVisionConfigPretrainedConfig)PixtralProcessor)
VllmConfig)BaseDummyOptions)
get_act_fn)RMSNorm)ColumnParallelLinearRowParallelLinear)QuantizationConfig)MultiModelKeys)MULTIMODAL_REGISTRY)BaseMultiModalProcessorCache)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)ImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoInputProcessingContextPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsEagle3SupportsLoRASupportsMultiModal
SupportsPP)PixtralHFEncoderInfoPixtralHFVisionModel)AutoWeightsLoaderWeightsMapperget_layer_indexinit_vllm_registered_modelmaybe_prefix)get_vision_encoder_infoc                	   @   sP   e Zd ZU dZdZed ed< eej	e
ej	 B eddddddhdf ed	< d
S )Mistral3ImagePixelInputsz
    Dimensions:
        - bn: Batch size * number of images
        - c: Number of channels (3)
        - h: Height of each image
        - w: Width of each image
    pixel_values_pixtraltypebn   hw)dynamic_dimspixel_valuesN)__name__
__module____qualname____doc__r:   r   __annotations__r   torchTensorlistr)    rI   rI   Y/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/model_executor/models/mistral3.pyr8   ?   s   
 r8   c                       sJ   e Zd ZdZdededef fddZdejdejd	ejfd
dZ  Z	S )Mistral3PatchMergerz<
    Learned merging of spatial_merge_size ** 2 patches
    vision_hidden_sizespatial_merge_size
patch_sizec                    s<   t    || _|| _|| _tj|| jd  |dd| _d S )N   F)bias)super__init__rL   rM   rN   nnLinearmerging_layer)selfrL   rM   rN   	__class__rI   rJ   rR   W   s   
zMistral3PatchMerger.__init__image_featuresimage_sizesreturnc                    s    fdd|D }dd |D }|j d }g }t||D ]7\}}|| \}}	|||	|dddd}
tjjj	|
 j
 j
d}|| j
d  d }|| qtj|dd	} |}|S )
Nc                    s(   g | ]}|d   j  |d  j  fqS )r   r*   )rN   ).0
image_sizerV   rI   rJ   
<listcomp>h   s    z/Mistral3PatchMerger.forward.<locals>.<listcomp>c                 S   s   g | ]\}}|| qS rI   rI   )r\   r=   r>   rI   rI   rJ   r_   m   s    rO   r   r*   )kernel_sizestride)dim)shape	enumeratesplitviewpermute	unsqueezerF   rS   
functionalunfoldrM   tappendcatrU   )rV   rY   rZ   tokens_per_imagedpermuted_tensorimage_indeximage_tokensr=   r>   
image_gridgridrI   r^   rJ   forwarde   s*   


zMistral3PatchMerger.forward)
rA   rB   rC   rD   intrR   rF   rG   rv   __classcell__rI   rI   rW   rJ   rK   R   s     rK   c                       sd   e Zd Z		ddedededededed	edB d
ef fddZdej	dej	dej	fddZ
  ZS )Mistral3MultiModalProjectorN rL   text_hidden_sizerM   rN   projector_hidden_actmultimodal_projector_biasquant_configprefixc	           	         sj   t    t|dd| _t|||d| _t||||| dd| _t|| _	t
||||| dd| _d S )Ngh㈵>)eps)rL   rM   rN   z	.linear_1)rP   r~   r   z	.linear_2)rQ   rR   r   normrK   patch_mergerr   linear_1r   actr   linear_2)	rV   rL   r{   rM   rN   r|   r}   r~   r   rW   rI   rJ   rR      s,   

z$Mistral3MultiModalProjector.__init__rY   rZ   r[   c                 C   s@   |  |}| ||}| |\}}| |}| |\}}|S N)r   r   r   r   r   )rV   rY   rZ   hidden_states_rI   rI   rJ   rv      s   

z#Mistral3MultiModalProjector.forward)Nrz   )rA   rB   rC   rw   strboolr   rR   rF   rG   rv   rx   rI   rI   rW   rJ   ry      s6    		$ry   c                   @   sF   e Zd ZU ee ed< ee ed< ee ed< eeee B  ed< dS )LlavaLikeConfigvision_configimage_token_indexvision_feature_select_strategyvision_feature_layerN)	rA   rB   rC   r   r   rE   rw   r   rH   rI   rI   rI   rJ   r      s
   
 r   c                   @   s   e Zd ZU ee ed< dS )LlavaLikeProcessorimage_tokenN)rA   rB   rC   r   r   rE   rI   rI   rI   rJ   r      s   
 r   c                   @   sv   e Zd ZdefddZdd ZededefddZ	de
eed	B f fd
dZdededefddZdefddZd	S )BaseLlavaProcessingInfor[   c                 C   s   | j tS r   )ctxget_hf_configr   r^   rI   rI   rJ   r         z%BaseLlavaProcessingInfo.get_hf_configc                 C   s   t |  S r   )r7   r   r^   rI   rI   rJ   r7      r   z/BaseLlavaProcessingInfo.get_vision_encoder_infokwargsc                 K   s   t r   )NotImplementedErrorrV   r   rI   rI   rJ   get_hf_processor   s   z(BaseLlavaProcessingInfo.get_hf_processorNc                 C   s   dd iS )NimagerI   r^   rI   rI   rJ   get_supported_mm_limits   s   z/BaseLlavaProcessingInfo.get_supported_mm_limitsimage_widthimage_heightc                C   s   |   }|j||dS )Nr   r   )r7   get_num_image_tokens)rV   r   r   vision_encoder_inforI   rI   rJ   r      s
   z,BaseLlavaProcessingInfo.get_num_image_tokensc                 C   s    |   }|  }}t||dS )N)widthheight)r7   get_image_sizer   )rV   r   r   r   rI   rI   rJ   !get_image_size_with_most_features   s   z9BaseLlavaProcessingInfo.get_image_size_with_most_features)rA   rB   rC   r   r   r7   r   objectr   r   r   r   rw   r   r   r   r   rI   rI   rI   rJ   r      s    
r   _I)boundc                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )Mistral3DummyInputsBuilder	mm_countsr[   c                 C   s$   | dd}| j }|j}|| S )Nr   r   )getinfor   r   )rV   r   
num_images	processorr   rI   rI   rJ   get_dummy_text   s   
z)Mistral3DummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc                 C   sB   | dd}| j \}}|r| dnd }d| j||||diS )Nr   r   )r   r   r   	overrides)r   r   r   _get_dummy_images)rV   r   r   r   r   target_widthtarget_heightimage_overridesrI   rI   rJ   get_dummy_mm_data   s   z,Mistral3DummyInputsBuilder.get_dummy_mm_datar   )
rA   rB   rC   r   r   rw   r   r   r   r   rI   rI   rI   rJ   r      s    
r   c                   @   s   e Zd ZdefddZdS )Mistral3ProcessingInfor   c                 K   s   | j jtfi |S r   )r   r   r   r   rI   rI   rJ   r     s   z'Mistral3ProcessingInfo.get_hf_processorN)rA   rB   rC   r   r   rI   rI   rI   rJ   r     s    r   c                
       s   e Zd Zdedeeef deeef deeef def
 fddZded	eeef deeef fd
dZ	de
d	eeef dedee fddZ  ZS )Mistral3MultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsr[   c                    s^   t  j||||d}|d}|d ur-|d }t|t|ks!J dd t||D |d< |S )N)r   r   r   r   r@   rZ   c                 S   s.   g | ]\}\}}|d d d |d |f qS r   rI   )r\   pr=   r>   rI   rI   rJ   r_     s    "zBMistral3MultiModalProcessor._call_hf_processor.<locals>.<listcomp>)rQ   _call_hf_processorr   lenzip)rV   r   r   r   r   processed_outputsr@   rZ   rW   rI   rJ   r     s   

z.Mistral3MultiModalProcessor._call_hf_processor	hf_inputshf_processor_mm_kwargsc                 C   s   t tdtddS )Nr   )r@   image_embeds)dictr   batched)rV   r   r   rI   rI   rJ   _get_mm_fields_config"  s   z1Mistral3MultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc           	         s   | j jdi |}| j  }| j  }| }||j |j||j t|j	t
s,J t| dtf fdd}tdg|dgS )Nitem_idxc                    sT    dt}|| } j|j|jd\}}g| g | }|d< t|S )Nr   r   r`   )	get_itemsr   r   get_patch_grid_sizer   r   r&   select_token_id)r   imagesr]   ncolsnrowstokensencoder_infoimage_break_idimage_end_idimage_token_idr   rI   rJ   get_replacement>  s   

zHMistral3MultiModalProcessor._get_prompt_updates.<locals>.get_replacementr   )modalitytargetreplacementrI   )r   r   r   get_tokenizer	get_vocabimage_break_tokenr   image_end_token
isinstancer   r   r0   rw   r$   )	rV   r   r   r   r   	hf_config	tokenizervocabr   rI   r   rJ   _get_prompt_updates,  s    



z/Mistral3MultiModalProcessor._get_prompt_updates)rA   rB   rC   r   r   r   r   r   r   r   r   r   r   r%   r   rx   rI   rI   rW   rJ   r     s8    







r   r   r[   c                 C   s"   |  t}t|jtsJ t| S r   )r   r   r   r   r   r   )r   r   rI   rI   rJ   _build_mistral3_infoU  s   
r   cacher   dummy_inputsr   c                C   s   t | tsJ t| ||dS )Nr   )r   r   r   )r   r   r   rI   rI   rJ   _build_mistral3_processor]  s   r   r   c                    sZ   | j }| jj t|trt| S t|ttfr#t fdd|D S t	dt
| d)zDetermine the number of hidden layers to initialize up to in the
    visual encoder.

    Args:
        hf_config: Model config with vision feature layer(s).
    c                 3   s    | ]}t | V  qd S r   )r4   )r\   idxnum_hidden_layersrI   rJ   	<genexpr>y  s    z)_get_num_hidden_layers.<locals>.<genexpr>zvision_layer_feature type: z is not supported)r   r   r   r   rw   r4   rH   tuplemax	TypeErrorr:   )r   feature_layersrI   r   rJ   _get_num_hidden_layersk  s   

r   rz   )require_post_normr   r~   r   r   c                C   s.   | j }t| }t|tsJ t|||||dS )N)r~   num_hidden_layers_overrider   r   )r   r   r   r   r1   )r   r~   r   r   r   r   rI   rI   rJ   init_vision_tower_for_llava  s   r   )r   r   c                       sz  e Zd Zg dddgdZeddddd	d
ZededededB fddZ	de
edf ddfddZde
edf fddZdddededdf fddZdededB fddZd edeje
ejdf B fd!d"Zdedefd#d$Z		d3d%ejdB d&ejd'edB d(ejdB dedejeB fd)d*Zd+ejdejdB fd,d-Zd.ee
eejf  dee fd/d0Zdefd1d2Z  ZS )4 Mistral3ForConditionalGeneration)q_projk_projv_proj	gate_projup_proj)qkv_projgate_up_projzlanguage_model.model.zvision_tower.zmulti_modal_projector.zlanguage_model.lm_head.)zmodel.language_model.zmodel.vision_tower.zmodel.multi_modal_projector.zlm_head.)orig_to_new_prefixr   ir[   Nc                 C   s   | drd S td)Nr   z Only image modality is supported)
startswith
ValueError)clsr   r   rI   rI   rJ   get_placeholder_str  s   
z4Mistral3ForConditionalGeneration.get_placeholder_strlayers.c                 C   s   ||   j_d S r   )get_language_modelmodelaux_hidden_state_layers)rV   r  rI   rI   rJ   set_aux_hidden_state_layers  s   z<Mistral3ForConditionalGeneration.set_aux_hidden_state_layersc                 C   s"   t |  jj}d|d |d fS )NrO   r<   )r   r  r  r  )rV   
num_layersrI   rI   rJ   "get_eagle3_aux_hidden_state_layers  s   zCMistral3ForConditionalGeneration.get_eagle3_aux_hidden_state_layersrz   )r   vllm_configr   c                   s0  t    |jj}|j}|jj}|| _|| _|jjd u r'|jj	dkr'dg|j_|j
d u r5|jjdkr5d|_
| |d- t||dt|dd| _t|jj|jj|j
|j|jj|j|t|dd	| _W d    n1 skw   Y  | | t||jt|d
d| _W d    n1 sw   Y  | jj| _d S )NmistralMistralForCausalLMgelur   Fvision_tower)r~   r   r   multi_modal_projector)rL   r{   r|   rM   rN   r}   r~   r   language_model)r
  r   r   )rQ   rR   model_configr   r~   multimodal_configconfigtext_configarchitectures
model_typer|   r   
hidden_act_mark_tower_modelr   r6   r  ry   hidden_sizerM   rN   r}   r  _mark_language_modelr5   r  make_empty_intermediate_tensors)rV   r
  r   r  r~   r  rW   rI   rJ   rR     sN   




z)Mistral3ForConditionalGeneration.__init__r   c                 K   s8   | dd }| dd }|d u r|d u rd S td|dS )Nr@   r   r9   )r:   r@   )popr8   )rV   r   r@   r   rI   rI   rJ   _parse_and_validate_image_input  s   z@Mistral3ForConditionalGeneration._parse_and_validate_image_inputimage_inputc                    s   |d dkr
|d S dd |d D }  |d }t|tjr& ||S  fdd|D } t||}t|dkrFt||}|S |f}|S )	Nr:   r   datac                 S   s    g | ]}|j d  |j d fqS )r`   )rd   )r\   imgrI   rI   rJ   r_     s    zIMistral3ForConditionalGeneration._process_image_input.<locals>.<listcomp>r@   c                    s"   g | ]}|j d   jjd  qS )r   rO   )rd   r  rM   )r\   image_featurer^   rI   rJ   r_     s    r*   )r  r   rF   rG   r  rn   r   rf   )rV   r  rZ   rY   feature_sizesr   rI   r^   rJ   _process_image_input  s&   

z5Mistral3ForConditionalGeneration._process_image_inputc                 K   s*   | j di |}|d u rg S | |}|S )NrI   )r  r$  )rV   r   r  vision_embeddingsrI   rI   rJ   embed_multimodal  s
   
z1Mistral3ForConditionalGeneration.embed_multimodal	input_ids	positionsintermediate_tensorsinputs_embedsc                 K   s$   |durd}| j j||||d}|S )a  Run forward pass for Mistral3.

        One key thing to understand is the `input_ids` already accounts for the
        positions of the to-be-inserted image embeddings.

        Concretely, consider a text prompt:
        `"USER: <image>\nWhat's the content of the image?\nASSISTANT:"`.

        Tokenizer outputs:
        `[1, 3148, 1001, 29901, 29871, 32000, 29871, 13, 5618, 29915, 29879,
        278, 2793, 310, 278, 1967, 29973, 13, 22933, 9047, 13566, 29901]`.

        To reserve space in KV cache, we have to insert placeholder tokens
        before they are inputted to the model, so the input processor prepends
        additional image tokens (denoted as `32000`), resulting in:
        `[1, 3148, 1001, 29901, 29871, 32000, ..., 32000, 29871, 13, 5618,
        29915, 29879, 278, 2793, 310, 278, 1967, 29973, 13, 22933, 9047, 13566,
        29901]`.

        We insert 575 tokens so that including the original image token in the
        input, there are a total of 576 (24 * 24) image tokens, which
        corresponds to the number of image tokens inputted to the language
        model, i.e. the number of image tokens outputted by the visual encoder.

        This way, the `positions` and `attn_metadata` are consistent
        with the `input_ids`.

        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a
                batch.
            positions: Position indices for the input tokens.
            intermediate_tensors: Intermediate tensors from prior forward pass.
            inputs_embeds: Optional tensor of input embeddings.

        Info:
            [`Mistral3ImagePixelInputs`][vllm.model_executor.models.mistral3.Mistral3ImagePixelInputs]
        N)r*  )r  r  )rV   r'  r(  r)  r*  r   r   rI   rI   rJ   rv   $  s   -z(Mistral3ForConditionalGeneration.forwardr   c                 C   s   | j |S r   )r  compute_logits)rV   r   rI   rI   rJ   r+  Z  s   z/Mistral3ForConditionalGeneration.compute_logitsweightsc                 C   s   t | }|j|| jdS )N)mapper)r2   load_weightshf_to_vllm_mapper)rV   r,  loaderrI   rI   rJ   r.  `  s   z-Mistral3ForConditionalGeneration.load_weightsc                 C   s   t jddddS )z<
        Get the module prefix in multimodal models
        r  r  r  )r  	connectortower_model)r   from_string_fieldr^   rI   rI   rJ   get_mm_mappingd  s
   z/Mistral3ForConditionalGeneration.get_mm_mapping)NN) rA   rB   rC   packed_modules_mappingr3   r/  classmethodr   rw   r  r   r  r	  r   rR   r   r8   r  rF   rG   r$  r+   r&  r'   rv   r+  r   setr.  r   r4  rx   rI   rI   rW   rJ   r     sb    	
 4


6
$r   )cabcr   collections.abcr   r   r   typingr   r   r   r	   r
   rF   torch.nnrS   transformersr   r   r   r   transformers.models.pixtralr   vllm.configr   vllm.config.multimodalr   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   'vllm.model_executor.layers.quantizationr   )vllm.model_executor.models.module_mappingr   vllm.multimodalr   vllm.multimodal.cacher   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   r   vllm.multimodal.processingr    r!   r"   r#   r$   r%   r&   vllm.sequencer'   vllm.utils.tensor_schemar(   r)   
interfacesr+   r,   r-   r.   r/   pixtralr0   r1   utilsr2   r3   r4   r5   r6   visionr7   r8   ModulerK   ry   r   r   r   r   r   r   r   r   r   rw   r   r   r   r   register_processorr   rI   rI   rI   rJ   <module>   s   $	20 O




