o
    -iR                     @   s  d dl mZ d dlmZmZmZ d dlmZmZm	Z	m
Z
mZ d dlZd dlmZ d dlmZmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z*m+Z+m,Z, d dl-m.Z.m/Z/m0Z0 d dl1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8 d dl9m:Z: d dl;m<Z<m=Z= ddl>m?Z?m@Z@mAZAmBZB ddlCmDZDmEZE ddlFmGZGmHZHmIZImJZJmKZK ddlLmMZM G dd de<ZNG dd dejOZPG dd dejOZQG d d! d!e
ZRG d"d# d#e
ZSG d$d% d%e4ZTed&eTd'ZUG d(d) d)e2eU ZVG d*d+ d+eTZWG d,d- d-e3eW ZXd.e5d/eTfd0d1ZYdd2d3eUd4e2eU d5e(dB d/e3fd6d7ZZd8eRd/e[fd9d:Z\dd;d<d8eRd=e"dB d>e]dB d?e^d/eEf
d@dAZ_e&j`eZeYeVdBG dCdD dDejOe@eAeBZadS )E    )abstractmethod)IterableMappingSequence)	AnnotatedFinalLiteralProtocolTypeVarN)BatchFeatureMistral3ConfigPixtralVisionConfigPretrainedConfig)PixtralProcessor)
VllmConfig)BaseDummyOptions)
get_act_fn)RMSNorm)ColumnParallelLinearRowParallelLinear)QuantizationConfig)MultiModelKeys)MULTIMODAL_REGISTRY)BaseMultiModalProcessorCache)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)ImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoInputProcessingContextPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPP)PixtralHFEncoderInfoPixtralHFVisionModel)AutoWeightsLoaderWeightsMapperget_layer_indexinit_vllm_registered_modelmaybe_prefix)get_vision_encoder_infoc                	   @   sP   e Zd ZU dZdZed ed< eej	e
ej	 B eddddddhdf ed	< d
S )Mistral3ImagePixelInputsz
    Dimensions:
        - bn: Batch size * number of images
        - c: Number of channels (3)
        - h: Height of each image
        - w: Width of each image
    pixel_values_pixtraltypebn   hw)dynamic_dimspixel_valuesN)__name__
__module____qualname____doc__r9   r   __annotations__r   torchTensorlistr)    rH   rH   `/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/mistral3.pyr7   >   s   
 r7   c                       sJ   e Zd ZdZdededef fddZdejdejd	ejfd
dZ  Z	S )Mistral3PatchMergerz<
    Learned merging of spatial_merge_size ** 2 patches
    vision_hidden_sizespatial_merge_size
patch_sizec                    s<   t    || _|| _|| _tj|| jd  |dd| _d S )N   F)bias)super__init__rK   rL   rM   nnLinearmerging_layer)selfrK   rL   rM   	__class__rH   rI   rQ   V   s   
zMistral3PatchMerger.__init__image_featuresimage_sizesreturnc                    s    fdd|D }dd |D }|j d }g }t||D ]7\}}|| \}}	|||	|dddd}
tjjj	|
 j
 j
d}|| j
d  d }|| qtj|dd	} |}|S )
Nc                    s(   g | ]}|d   j  |d  j  fqS )r   r*   )rM   ).0
image_sizerU   rH   rI   
<listcomp>g   s    z/Mistral3PatchMerger.forward.<locals>.<listcomp>c                 S   s   g | ]\}}|| qS rH   rH   )r[   r<   r=   rH   rH   rI   r^   l   s    rN   r   r*   )kernel_sizestride)dim)shape	enumeratesplitviewpermute	unsqueezerE   rR   
functionalunfoldrL   tappendcatrT   )rU   rX   rY   tokens_per_imagedpermuted_tensorimage_indeximage_tokensr<   r=   
image_gridgridrH   r]   rI   forwardd   s*   


zMistral3PatchMerger.forward)
r@   rA   rB   rC   intrQ   rE   rF   ru   __classcell__rH   rH   rV   rI   rJ   Q   s     rJ   c                       sd   e Zd Z		ddedededededed	edB d
ef fddZdej	dej	dej	fddZ
  ZS )Mistral3MultiModalProjectorN rK   text_hidden_sizerL   rM   projector_hidden_actmultimodal_projector_biasquant_configprefixc	           	         sj   t    t|dd| _t|||d| _t||||| dd| _t|| _	t
||||| dd| _d S )Ngh㈵>)eps)rK   rL   rM   z	.linear_1)rO   r}   r~   z	.linear_2)rP   rQ   r   normrJ   patch_mergerr   linear_1r   actr   linear_2)	rU   rK   rz   rL   rM   r{   r|   r}   r~   rV   rH   rI   rQ      s,   

z$Mistral3MultiModalProjector.__init__rX   rY   rZ   c                 C   s@   |  |}| ||}| |\}}| |}| |\}}|S N)r   r   r   r   r   )rU   rX   rY   hidden_states_rH   rH   rI   ru      s   

z#Mistral3MultiModalProjector.forward)Nry   )r@   rA   rB   rv   strboolr   rQ   rE   rF   ru   rw   rH   rH   rV   rI   rx      s6    		$rx   c                   @   sF   e Zd ZU ee ed< ee ed< ee ed< eeee B  ed< dS )LlavaLikeConfigvision_configimage_token_indexvision_feature_select_strategyvision_feature_layerN)	r@   rA   rB   r   r   rD   rv   r   rG   rH   rH   rH   rI   r      s
   
 r   c                   @   s   e Zd ZU ee ed< dS )LlavaLikeProcessorimage_tokenN)r@   rA   rB   r   r   rD   rH   rH   rH   rI   r      s   
 r   c                   @   sv   e Zd ZdefddZdd ZededefddZ	de
eed	B f fd
dZdededefddZdefddZd	S )BaseLlavaProcessingInforZ   c                 C   s   | j tS r   )ctxget_hf_configr   r]   rH   rH   rI   r         z%BaseLlavaProcessingInfo.get_hf_configc                 C   s   t |  S r   )r6   r   r]   rH   rH   rI   r6      r   z/BaseLlavaProcessingInfo.get_vision_encoder_infokwargsc                 K   s   t r   )NotImplementedErrorrU   r   rH   rH   rI   get_hf_processor   s   z(BaseLlavaProcessingInfo.get_hf_processorNc                 C   s   dd iS )NimagerH   r]   rH   rH   rI   get_supported_mm_limits   s   z/BaseLlavaProcessingInfo.get_supported_mm_limitsimage_widthimage_heightc                C   s   |   }|j||dS )Nr   r   )r6   get_num_image_tokens)rU   r   r   vision_encoder_inforH   rH   rI   r      s
   z,BaseLlavaProcessingInfo.get_num_image_tokensc                 C   s    |   }|  }}t||dS )N)widthheight)r6   get_image_sizer   )rU   r   r   r   rH   rH   rI   !get_image_size_with_most_features   s   z9BaseLlavaProcessingInfo.get_image_size_with_most_features)r@   rA   rB   r   r   r6   r   objectr   r   r   r   rv   r   r   r   r   rH   rH   rH   rI   r      s    
r   _I)boundc                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )Mistral3DummyInputsBuilder	mm_countsrZ   c                 C   s$   | dd}| j }|j}|| S )Nr   r   )getinfor   r   )rU   r   
num_images	processorr   rH   rH   rI   get_dummy_text   s   
z)Mistral3DummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc                 C   sB   | dd}| j \}}|r| dnd }d| j||||diS )Nr   r   )r   r   r   	overrides)r   r   r   _get_dummy_images)rU   r   r   r   r   target_widthtarget_heightimage_overridesrH   rH   rI   get_dummy_mm_data   s   z,Mistral3DummyInputsBuilder.get_dummy_mm_datar   )
r@   rA   rB   r   r   rv   r   r   r   r   rH   rH   rH   rI   r      s    
r   c                   @   s   e Zd ZdefddZdS )Mistral3ProcessingInfor   c                 K   s   | j jtfi |S r   )r   r   r   r   rH   rH   rI   r     s   z'Mistral3ProcessingInfo.get_hf_processorN)r@   rA   rB   r   r   rH   rH   rH   rI   r      s    r   c                
       s   e Zd Zdedeeef deeef deeef def
 fddZded	eeef deeef fd
dZ	de
d	eeef dedee fddZ  ZS )Mistral3MultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsrZ   c                    s^   t  j||||d}|d}|d ur-|d }t|t|ks!J dd t||D |d< |S )N)r   r   r   r   r?   rY   c                 S   s.   g | ]\}\}}|d d d |d |f qS r   rH   )r[   pr<   r=   rH   rH   rI   r^     s    "zBMistral3MultiModalProcessor._call_hf_processor.<locals>.<listcomp>)rP   _call_hf_processorr   lenzip)rU   r   r   r   r   processed_outputsr?   rY   rV   rH   rI   r     s   

z.Mistral3MultiModalProcessor._call_hf_processor	hf_inputshf_processor_mm_kwargsc                 C   s   t tdtddS )Nr   )r?   image_embeds)dictr   batched)rU   r   r   rH   rH   rI   _get_mm_fields_config!  s   z1Mistral3MultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc           	         s   | j jdi |}| j  }| j  }| }||j |j||j t|j	t
s,J t| dtf fdd}tdg|dgS )Nitem_idxc                    sT    dt}|| } j|j|jd\}}g| g | }|d< t|S )Nr   r   r_   )	get_itemsr   r   get_patch_grid_sizer   r   r&   select_token_id)r   imagesr\   ncolsnrowstokensencoder_infoimage_break_idimage_end_idimage_token_idr   rH   rI   get_replacement=  s   

zHMistral3MultiModalProcessor._get_prompt_updates.<locals>.get_replacementr   )modalitytargetreplacementrH   )r   r   r   get_tokenizer	get_vocabimage_break_tokenr   image_end_token
isinstancer   r   r/   rv   r$   )	rU   r   r   r   r   	hf_config	tokenizervocabr   rH   r   rI   _get_prompt_updates+  s    



z/Mistral3MultiModalProcessor._get_prompt_updates)r@   rA   rB   r   r   r   r   r   r   r   r   r   r   r%   r   rw   rH   rH   rV   rI   r     s8    







r   r   rZ   c                 C   s"   |  t}t|jtsJ t| S r   )r   r   r   r   r   r   )r   r   rH   rH   rI   _build_mistral3_infoT  s   
r   cacher   dummy_inputsr   c                C   s   t | tsJ t| ||dS )Nr   )r   r   r   )r   r   r   rH   rH   rI   _build_mistral3_processor\  s   r   r   c                    sZ   | j }| jj t|trt| S t|ttfr#t fdd|D S t	dt
| d)zDetermine the number of hidden layers to initialize up to in the
    visual encoder.

    Args:
        hf_config: Model config with vision feature layer(s).
    c                 3   s    | ]}t | V  qd S r   )r3   )r[   idxnum_hidden_layersrH   rI   	<genexpr>x  s    z)_get_num_hidden_layers.<locals>.<genexpr>zvision_layer_feature type: z is not supported)r   r   r   r   rv   r3   rG   tuplemax	TypeErrorr9   )r   feature_layersrH   r   rI   _get_num_hidden_layersj  s   

r   ry   )require_post_normr~   r}   r   r~   c                C   s.   | j }t| }t|tsJ t|||||dS )N)r}   num_hidden_layers_overrider   r~   )r   r   r   r   r0   )r   r}   r   r~   r   r   rH   rH   rI   init_vision_tower_for_llava~  s   r   )r   r   c                       sF  e Zd Zg dddgdZeddddd	d
ZededededB fddZ	ddde
deddf fddZdededB fddZdedejeejdf B fddZdedefddZ		d.d ejd!ejd"edB d#ejdB dedejeB fd$d%Zd&ejdejdB fd'd(Zd)eeeejf  dee fd*d+Zdefd,d-Z  ZS )/ Mistral3ForConditionalGeneration)q_projk_projv_proj	gate_projup_proj)qkv_projgate_up_projzlanguage_model.model.zvision_tower.zmulti_modal_projector.zlanguage_model.lm_head.)zmodel.language_model.zmodel.vision_tower.zmodel.multi_modal_projector.zlm_head.)orig_to_new_prefixr   irZ   Nc                 C   s   | drd S td)Nr   z Only image modality is supported)
startswith
ValueError)clsr   r   rH   rH   rI   get_placeholder_str  s   
z4Mistral3ForConditionalGeneration.get_placeholder_strry   )r~   vllm_configr~   c                   s0  t    |jj}|j}|jj}|| _|| _|jjd u r'|jj	dkr'dg|j_|j
d u r5|jjdkr5d|_
| |d- t||dt|dd| _t|jj|jj|j
|j|jj|j|t|dd	| _W d    n1 skw   Y  | | t||jt|d
d| _W d    n1 sw   Y  | jj| _d S )NmistralMistralForCausalLMgelur   Fvision_tower)r}   r   r~   multi_modal_projector)rK   rz   r{   rL   rM   r|   r}   r~   language_model)r  r   r~   )rP   rQ   model_configr   r}   multimodal_configconfigtext_configarchitectures
model_typer{   r   
hidden_act_mark_tower_modelr   r5   r  rx   hidden_sizerL   rM   r|   r  _mark_language_modelr4   r  make_empty_intermediate_tensors)rU   r  r~   r  r}   r
  rV   rH   rI   rQ     sN   




z)Mistral3ForConditionalGeneration.__init__r   c                 K   s8   | dd }| dd }|d u r|d u rd S td|dS )Nr?   r   r8   )r9   r?   )popr7   )rU   r   r?   r   rH   rH   rI   _parse_and_validate_image_input  s   z@Mistral3ForConditionalGeneration._parse_and_validate_image_inputimage_input.c                    s   |d dkr
|d S dd |d D }  |d }t|tjr& ||S  fdd|D } t||}t|dkrFt||}|S |f}|S )	Nr9   r   datac                 S   s    g | ]}|j d  |j d fqS )r_   )rc   )r[   imgrH   rH   rI   r^     s    zIMistral3ForConditionalGeneration._process_image_input.<locals>.<listcomp>r?   c                    s"   g | ]}|j d   jjd  qS )r   rN   )rc   r  rL   )r[   image_featurer]   rH   rI   r^     s    r*   )r  r   rE   rF   r  rm   r   re   )rU   r  rY   rX   feature_sizesr   rH   r]   rI   _process_image_input  s&   

z5Mistral3ForConditionalGeneration._process_image_inputc                 K   s*   | j di |}|d u rg S | |}|S )NrH   )r  r  )rU   r   r  vision_embeddingsrH   rH   rI   embed_multimodal  s
   
z1Mistral3ForConditionalGeneration.embed_multimodal	input_ids	positionsintermediate_tensorsinputs_embedsc                 K   s$   |durd}| j j||||d}|S )a  Run forward pass for Mistral3.

        One key thing to understand is the `input_ids` already accounts for the
        positions of the to-be-inserted image embeddings.

        Concretely, consider a text prompt:
        `"USER: <image>\nWhat's the content of the image?\nASSISTANT:"`.

        Tokenizer outputs:
        `[1, 3148, 1001, 29901, 29871, 32000, 29871, 13, 5618, 29915, 29879,
        278, 2793, 310, 278, 1967, 29973, 13, 22933, 9047, 13566, 29901]`.

        To reserve space in KV cache, we have to insert placeholder tokens
        before they are inputted to the model, so the input processor prepends
        additional image tokens (denoted as `32000`), resulting in:
        `[1, 3148, 1001, 29901, 29871, 32000, ..., 32000, 29871, 13, 5618,
        29915, 29879, 278, 2793, 310, 278, 1967, 29973, 13, 22933, 9047, 13566,
        29901]`.

        We insert 575 tokens so that including the original image token in the
        input, there are a total of 576 (24 * 24) image tokens, which
        corresponds to the number of image tokens inputted to the language
        model, i.e. the number of image tokens outputted by the visual encoder.

        This way, the `positions` and `attn_metadata` are consistent
        with the `input_ids`.

        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a
                batch.
            positions: Position indices for the input tokens.
            intermediate_tensors: Intermediate tensors from prior forward pass.
            inputs_embeds: Optional tensor of input embeddings.

        Info:
            [`Mistral3ImagePixelInputs`][vllm.model_executor.models.mistral3.Mistral3ImagePixelInputs]
        N)r"  )r  model)rU   r  r   r!  r"  r   r   rH   rH   rI   ru     s   -z(Mistral3ForConditionalGeneration.forwardr   c                 C   s   | j |S r   )r  compute_logits)rU   r   rH   rH   rI   r$  R  s   z/Mistral3ForConditionalGeneration.compute_logitsweightsc                 C   s   t | }|j|| jdS )N)mapper)r1   load_weightshf_to_vllm_mapper)rU   r%  loaderrH   rH   rI   r'  X  s   z-Mistral3ForConditionalGeneration.load_weightsc                 C   s   t jddddS )z<
        Get the module prefix in multimodal models
        r  r  r  )r  	connectortower_model)r   from_string_fieldr]   rH   rH   rI   get_mm_mapping\  s
   z/Mistral3ForConditionalGeneration.get_mm_mapping)NN)r@   rA   rB   packed_modules_mappingr2   r(  classmethodr   rv   r  r   rQ   r   r7   r  rE   rF   r   r  r+   r  r'   ru   r$  r   setr'  r   r-  rw   rH   rH   rV   rI   r     s^    	
 4


6
$r   )babcr   collections.abcr   r   r   typingr   r   r   r	   r
   rE   torch.nnrR   transformersr   r   r   r   transformers.models.pixtralr   vllm.configr   vllm.config.multimodalr   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   'vllm.model_executor.layers.quantizationr   )vllm.model_executor.models.module_mappingr   vllm.multimodalr   vllm.multimodal.cacher   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   r   vllm.multimodal.processingr    r!   r"   r#   r$   r%   r&   vllm.sequencer'   vllm.utils.tensor_schemar(   r)   
interfacesr+   r,   r-   r.   pixtralr/   r0   utilsr1   r2   r3   r4   r5   visionr6   r7   ModulerJ   rx   r   r   r   r   r   r   r   r   r   rv   r   r   r   r   register_processorr   rH   rH   rH   rI   <module>   s   $	20 O





