o
    
۾iSr                     @   s  U d dl mZ d dlmZmZmZ d dlmZmZm	Z	m
Z
mZmZ d dlZd dlmZ d dlmZmZmZmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lm Z  d dl!m"Z"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+m,Z,m-Z-m.Z.m/Z/ d dl0m1Z1m2Z2m3Z3m4Z4 d dl5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z< d dl=m>Z> d dl?m@Z@mAZA ddlBmCZC ddlDmEZEmFZFmGZGmHZHmIZI ddlJmKZK ddlLmMZMmNZN ddlOmPZP ddlQmRZRmSZSmTZTmUZUmVZV ddlWmXZXmYZY G dd de@ZZG dd de@Z[G d d! d!e@Z\eZe[B e\B Z]ee^d"< G d#d$ d$ej_Z`G d%d& d&e
ZaG d'd( d(e
ZbG d)d* d*e8Zced+ecd,ZdG d-d. d.e6ed ZeG d/d0 d0ecZfG d1d2 d2e7ed ZgG d3d4 d4egef ZhG d5d6 d6ecZiG d7d8 d8e7ei Zjd9e9d:ecfd;d<Zkdd=d>edd?e6ed d@e)dB d:e7fdAdBZldCead:emfdDdEZnddFdGdCeadHe%dB dIeodB dJepd:eCePB eNB f
dKdLZqe'jrelekeedMG dNdO dOej_eGeHeIeFZsG dPdQ dQefZtG dRdS dSehZue'jreueteedMG dTdU dUesZvdS )V    )abstractmethod)IterableMappingSequence)	AnnotatedFinalLiteralProtocol	TypeAliasTypeVarN)BatchFeatureCLIPVisionConfigLlavaConfigPixtralVisionConfigPretrainedConfigSiglipVisionConfig)LlavaProcessor)PixtralProcessor)
VllmConfig)BaseDummyOptions)
get_act_fn)ColumnParallelLinearRowParallelLinear)QuantizationConfig)MULTIMODAL_REGISTRY)BaseMultiModalProcessorCache)MultiModalDataDictMultiModalFieldConfigMultiModalInputsMultiModalKwargsItemsMultiModalUUIDDict)ImageEmbeddingItemsImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoInputProcessingContextPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)TensorSchemaTensorShape   )CLIPVisionModel)MultiModalEmbeddingsSupportsEagle3SupportsLoRASupportsMultiModal
SupportsPP)MultiModelKeys)PixtralHFEncoderInfoPixtralHFVisionModel)SiglipVisionModel)AutoWeightsLoaderWeightsMapperget_layer_indexinit_vllm_registered_modelmaybe_prefix)get_num_selected_vision_tokensget_vision_encoder_infoc                   @   s>   e Zd ZU dZdZed ed< eej	e
ddddf ed< dS )	LlavaImagePixelInputsa!  
    Dimensions:
        - bn: Batch size * number of images
        - c: Number of channels (3)
        - h: Height
        - w: Width

    Note that `height` or `width` may be different per batch and image,
    in which case the data is passed as a list instead of a batched tensor.
    pixel_valuestypebn   hwN__name__
__module____qualname____doc__rC   r   __annotations__r   torchTensorr.    rP   rP   T/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/llava.pyrA   J   s   
  rA   c                	   @   sP   e Zd ZU dZdZed ed< eej	e
ej	 B eddddddhdf ed	< d
S )PixtralHFImagePixelInputsa  
    Dimensions:
        - bn: Batch size * number of images
        - c: Number of channels
        - h: Height
        - w: Width

    Note that `height` or `width` may be different per batch and image,
    in which case the data is passed as a list instead of a batched tensor.
    pixel_values_pixtralrC   rD   crF   rG   )dynamic_dimsrB   N)rI   rJ   rK   rL   rC   r   rM   r   rN   rO   listr.   rP   rP   rP   rQ   rR   Z   s   
 rR   c                   @   s<   e Zd ZU dZdZed ed< eej	e
dddf ed< dS )	LlavaImageEmbeddingInputsz
    Dimensions:
        - bn: Batch size * number of images
        - ifs: Image feature size
        - hs: Hidden size (must match language model backbone)
    image_embedsrC   rD   ifshsdataNrH   rP   rP   rP   rQ   rW   m   s   
 rW   LlavaImageInputsc                       sV   e Zd Z		ddedededededB def fd	d
Zdej	dej	fddZ
  ZS )LlavaMultiModalProjectorN vision_hidden_sizetext_hidden_sizeprojector_hidden_actmultimodal_projector_biasquant_configprefixc                    sL   t    t||||| dd| _t|| _t||||| dd| _d S )Nz	.linear_1)biasrc   rd   z	.linear_2)super__init__r   linear_1r   actr   linear_2)selfr_   r`   ra   rb   rc   rd   	__class__rP   rQ   rg      s    
	
z!LlavaMultiModalProjector.__init__image_featuresreturnc                 C   s*   |  |\}}| |}| |\}}|S N)rh   ri   rj   )rk   rn   hidden_states_rP   rP   rQ   forward   s   
z LlavaMultiModalProjector.forward)Nr^   )rI   rJ   rK   intstrboolr   rg   rN   rO   rs   __classcell__rP   rP   rl   rQ   r]   ~   s"    r]   c                   @   sF   e Zd ZU ee ed< ee ed< ee ed< eeee B  ed< dS )LlavaLikeConfigvision_configimage_token_indexvision_feature_select_strategyvision_feature_layerN)	rI   rJ   rK   r   r   rM   rt   ru   rV   rP   rP   rP   rQ   rx      s
   
 rx   c                   @   s   e Zd ZU ee ed< dS )LlavaLikeProcessorimage_tokenN)rI   rJ   rK   r   ru   rM   rP   rP   rP   rQ   r}      s   
 r}   c                   @   s   e Zd ZdefddZdd ZededefddZ	de
eed	B f fd
dZdededefddZdefddZdefddZd	S )BaseLlavaProcessingInforo   c                 C   s   | j tS rp   )ctxget_hf_configr   rk   rP   rP   rQ   r         z%BaseLlavaProcessingInfo.get_hf_configc                 C   s   t |  S rp   )r@   r   r   rP   rP   rQ   r@      r   z/BaseLlavaProcessingInfo.get_vision_encoder_infokwargsc                 K      t rp   NotImplementedErrorrk   r   rP   rP   rQ   get_hf_processor   s   z(BaseLlavaProcessingInfo.get_hf_processorNc                 C   s   dd iS )NimagerP   r   rP   rP   rQ   get_supported_mm_limits   s   z/BaseLlavaProcessingInfo.get_supported_mm_limitsimage_widthimage_heightc                C   s&   |   }|  }t|j||d|jS Nr   r   )r   r@   r?   get_num_image_tokensr{   )rk   r   r   	hf_configvision_encoder_inforP   rP   rQ   r      s   z,BaseLlavaProcessingInfo.get_num_image_tokensc                 C   s    |   }|  }}t||dS )N)widthheight)r@   get_image_sizer#   )rk   r   r   r   rP   rP   rQ   !get_image_size_with_most_features   s   z9BaseLlavaProcessingInfo.get_image_size_with_most_featuresc                 C   s   |   \}}| j||dS r   )r   r   )rk   target_widthtarget_heightrP   rP   rQ   get_max_image_tokens   s
   z,BaseLlavaProcessingInfo.get_max_image_tokens)rI   rJ   rK   rx   r   r@   r   objectr}   r   r   ru   rt   r   r   r#   r   r   rP   rP   rP   rQ   r      s    
r   _I)boundc                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )LlavaDummyInputsBuilder	mm_countsro   c                 C   s$   | dd}| j }|j}|| S )Nr   r   )getinfor   r~   )rk   r   
num_images	processorr~   rP   rP   rQ   get_dummy_text   s   
z&LlavaDummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc                 C   sB   | dd}| j \}}|r| dnd }d| j||||diS )Nr   r   )r   r   r   	overrides)r   r   r   _get_dummy_images)rk   r   r   r   r   r   r   image_overridesrP   rP   rQ   get_dummy_mm_data   s   z)LlavaDummyInputsBuilder.get_dummy_mm_datarp   )
rI   rJ   rK   r   ru   rt   r   r   r   r   rP   rP   rP   rQ   r      s    
r   c                   @      e Zd ZdefddZdS )LlavaProcessingInfor   c                 K   s4   | j jtfi |}|jd u r|   }||_|S rp   )r   r   r   
patch_sizer@   get_patch_size)rk   r   hf_processorr   rP   rP   rQ   r      s
   
z$LlavaProcessingInfo.get_hf_processorNrI   rJ   rK   r   r   rP   rP   rP   rQ   r          r   c                	   @   s\   e Zd Zededeeef deeef fddZ	de
deeef dedee fdd	Zd
S )BaseLlavaMultiModalProcessor	hf_inputshf_processor_mm_kwargsro   c                 C   r   rp   r   rk   r   r   rP   rP   rQ   _get_mm_fields_config     z2BaseLlavaMultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc                    s8   j  }|j dtf fdd}td g|dgS )Nitem_idxc                    sN    dttf}t|tr|| }n|| }jj|j|j	d} g| S )Nr   r   )
	get_itemsr!   r"   
isinstanceget_feature_sizer   r   r   r   r   )r   imagesnum_image_tokens
image_sizeimage_token_idr   rk   rP   rQ   get_replacement  s   


zIBaseLlavaMultiModalProcessor._get_prompt_updates.<locals>.get_replacementr   modalitytargetreplacement)r   r   rz   rt   r)   )rk   r   r   r   r   r   rP   r   rQ   _get_prompt_updates  s   
z0BaseLlavaMultiModalProcessor._get_prompt_updatesN)rI   rJ   rK   r   r   r   ru   r   r   r   r$   r   r   r*   r   rP   rP   rP   rQ   r     s$    


r   c                   @   s2   e Zd Zdedeeef deeef fddZdS )LlavaMultiModalProcessorr   r   ro   c                 C      t tdtddS Nr   )rB   rX   dictr   batchedr   rP   rP   rQ   r   3     z.LlavaMultiModalProcessor._get_mm_fields_configN)	rI   rJ   rK   r   r   ru   r   r   r   rP   rP   rP   rQ   r   2  s    

r   c                   @   r   )PixtralHFProcessingInfor   c                 K   s   | j jtfi |S rp   )r   r   r   r   rP   rP   rQ   r   ?  s   z(PixtralHFProcessingInfo.get_hf_processorNr   rP   rP   rP   rQ   r   >  r   r   c                
       s   e Zd Zdedeeef deeef deeef def
 fddZded	eeef deeef fd
dZ	de
d	eeef dedee fddZ  ZS )PixtralHFMultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsro   c                    s^   t  j||||d}|d}|d ur-|d }t|t|ks!J dd t||D |d< |S )N)r   r   r   r   rB   image_sizesc                 S   s.   g | ]\}\}}|d d d |d |f qS rp   rP   ).0prF   rG   rP   rP   rQ   
<listcomp>Y  s    "zCPixtralHFMultiModalProcessor._call_hf_processor.<locals>.<listcomp>)rf   _call_hf_processorr   lenzip)rk   r   r   r   r   processed_outputsrB   r   rl   rP   rQ   r   D  s   

z/PixtralHFMultiModalProcessor._call_hf_processorr   r   c                 C   r   r   r   r   rP   rP   rQ   r   _  r   z2PixtralHFMultiModalProcessor._get_mm_fields_configr   r   c           	         s   | j jdi |}| j  }| j  }| }||j |j||j t|j	t
s,J t| dtf fdd}tdg|dgS )Nr   c                    sT    dt}|| } j|j|jd\}}g| g | }|d< t|S )Nr   r   )r   r"   r   get_patch_grid_sizer   r   r+   select_token_id)r   r   r   ncolsnrowstokensencoder_infoimage_break_idimage_end_idr   r   rP   rQ   r   {  s   

zIPixtralHFMultiModalProcessor._get_prompt_updates.<locals>.get_replacementr   r   rP   )r   r   r   get_tokenizer	get_vocabimage_break_tokenrz   image_end_tokenr   ry   r   r7   rt   r)   )	rk   r   r   r   r   r   	tokenizervocabr   rP   r   rQ   r   i  s    



z0PixtralHFMultiModalProcessor._get_prompt_updates)rI   rJ   rK   ru   r   r   r   r   r   r   r$   r   r   r*   r   rw   rP   rP   rl   rQ   r   C  s8    







r   r   ro   c                 C   s&   |  t}t|jtrt| S t| S rp   )r   r   r   ry   r   r   r   )r   r   rP   rP   rQ   _build_llava_or_pixtral_hf_info  s   
r   cacher   dummy_inputsr   c                C   s<   t | trt| ||dS t | trt| ||dS tt| )Nr   )r   r   r   r   r   r   rC   )r   r   r   rP   rP   rQ   $_build_llava_or_pixtral_hf_processor  s   

r   r   c                    sZ   | j }| jj t|trt| S t|ttfr#t fdd|D S t	dt
| d)zDetermine the number of hidden layers to initialize up to in the
    visual encoder.

    Args:
        hf_config: Model config with vision feature layer(s).
    c                 3   s    | ]}t | V  qd S rp   )r<   )r   idxnum_hidden_layersrP   rQ   	<genexpr>  s    z)_get_num_hidden_layers.<locals>.<genexpr>zvision_layer_feature type: z is not supported)r|   ry   r   r   rt   r<   rV   tuplemax	TypeErrorrC   )r   feature_layersrP   r   rQ   _get_num_hidden_layers  s   

r   r^   )require_post_normrd   rc   r   rd   c                C   sx   | j }t| }t|trt|||||dS t|tr#t|||||dS t|tr1t|||||dS dt	| }t
|)N)rc   num_hidden_layers_overrider   rd   zUnsupported vision config: )ry   r   r   r   r0   r   r9   r   r8   rC   r   )r   rc   r   rd   ry   r   msgrP   rP   rQ   init_vision_tower_for_llava  s8   


r   )r   r   c                       s  e Zd Zg dddgdZeddddd	d
ZededededB fddZ	de
edf ddfddZde
edf fddZdddededdf fddZdededB fddZd eeB eB d!ejeej B deje
ejdf B fd"d#Zd$eeB deje
ejdf B fd%d&Zd'edeje
ejdf B fd(d)Zdedefd*d+Z		d@d,ejdB d-ejd.edB d/ejdB dedejeB fd0d1Z d2ejdejdB fd3d4Z!d5e"e
eejf  de#e fd6d7Z$de%fd8d9Z&d:edefd;d<Z'd=edefd>d?Z(  Z)S )ALlavaForConditionalGeneration)q_projk_projv_proj	gate_projup_proj)qkv_projgate_up_projzlanguage_model.model.zvision_tower.zmulti_modal_projector.zlanguage_model.lm_head.)zmodel.language_model.zmodel.vision_tower.zmodel.multi_modal_projector.zlm_head.)orig_to_new_prefixr   iro   Nc                 C   s   | drdS td)Nr   <image>z Only image modality is supported)
startswith
ValueError)clsr   r  rP   rP   rQ   get_placeholder_str  s   
z1LlavaForConditionalGeneration.get_placeholder_strlayers.c                 C   s   ||   j_d S rp   )get_language_modelmodelaux_hidden_state_layers)rk   r  rP   rP   rQ   set_aux_hidden_state_layers  s   z9LlavaForConditionalGeneration.set_aux_hidden_state_layersc                 C   s"   t |  jj}d|d |d fS )N   rE   )r   r  r  r  )rk   
num_layersrP   rP   rQ   "get_eagle3_aux_hidden_state_layers  s   z@LlavaForConditionalGeneration.get_eagle3_aux_hidden_state_layersr^   )rd   vllm_configrd   c             
      s&  t    |jj}|j}|jj}|| _|| _|jjd u r'|jj	dkr'dg|j_|j
d u r5|jjdkr5d|_
| |d( t||dt|dd| _t|jj|jj|j
|j|t|dd	| _W d    n1 sfw   Y  | | t||jt|d
d| _W d    n1 sw   Y  | jj| _d S )NmistralMistralForCausalLMgelur   Fvision_tower)rc   r   rd   multi_modal_projector)r_   r`   ra   rb   rc   rd   language_model)r  r   rd   )rf   rg   model_configr   rc   multimodal_configconfigtext_configarchitectures
model_typera   ry   
hidden_act_mark_tower_modelr   r>   r  r]   hidden_sizerb   r  _mark_language_modelr=   r  make_empty_intermediate_tensors)rk   r  rd   r  rc   r  rl   rP   rQ   rg     sJ   




z&LlavaForConditionalGeneration.__init__r   c                 K   s   | dd }| dd }|d u r|d u rd S |d ur8| jjjdkr'td|dS | jjj }}td|||ddS |d urM| jjjdkrGtdtd|d	S t	d
)NrB   rX   pixtralrS   )rC   rB   )rF   rG   )rC   rB   resolve_bindingsz)Pixtral-HF does not support image_embeds.)rC   r[   z This line should be unreachable.)
popr  ry   r"  rR   r   rA   r  rW   AssertionError)rk   r   rB   rX   
expected_h
expected_wrP   rP   rQ   _parse_and_validate_image_inputH  s0   z=LlavaForConditionalGeneration._parse_and_validate_image_inputr  rB   c                 C   s   ||| j jdS )N)feature_select_strategy)r  r{   )rk   r  rB   rP   rP   rQ   _image_pixels_to_featuresj  s   z7LlavaForConditionalGeneration._image_pixels_to_featuresinputsc                 C   s   |d }|  | j|S )NrB   )r0  r  )rk   r1  rB   rP   rP   rQ   _process_image_pixelsv  s   z3LlavaForConditionalGeneration._process_image_pixelsimage_inputc                 C   sb   |d dkr
|d S |  |}t|tjr| |S dd |D }| t|}t||}|S )NrC   rX   r[   c                 S   s   g | ]}|j d  qS )r   )shape)r   image_featurerP   rP   rQ   r     s    zFLlavaForConditionalGeneration._process_image_input.<locals>.<listcomp>)r2  r   rN   rO   r  catsplit)rk   r3  rn   feature_sizesrX   rP   rP   rQ   _process_image_input~  s   

z2LlavaForConditionalGeneration._process_image_inputc                 K   s&   | j di |}|d u rg S | |S )NrP   )r.  r9  )rk   r   r3  rP   rP   rQ   embed_multimodal  s   
z.LlavaForConditionalGeneration.embed_multimodal	input_ids	positionsintermediate_tensorsinputs_embedsc                 K   s$   |durd}| j j||||d}|S )a  Run forward pass for LLaVA-1.5.

        One key thing to understand is the `input_ids` already accounts for the
        positions of the to-be-inserted image embeddings.

        Concretely, consider a text prompt:
        `"USER: <image>\nWhat's the content of the image?\nASSISTANT:"`.

        Tokenizer outputs:
        `[1, 3148, 1001, 29901, 29871, 32000, 29871, 13, 5618, 29915, 29879,
        278, 2793, 310, 278, 1967, 29973, 13, 22933, 9047, 13566, 29901]`.

        To reserve space in KV cache, we have to insert placeholder tokens
        before they are inputted to the model, so the input processor prepends
        additional image tokens (denoted as `32000`), resulting in:
        `[1, 3148, 1001, 29901, 29871, 32000, ..., 32000, 29871, 13, 5618,
        29915, 29879, 278, 2793, 310, 278, 1967, 29973, 13, 22933, 9047, 13566,
        29901]`.

        We insert 575 tokens so that including the original image token in the
        input, there are a total of 576 (24 * 24) image tokens, which
        corresponds to the number of image tokens inputted to the language
        model, i.e. the number of image tokens outputted by the visual encoder.

        This way, the `positions` and `attn_metadata` are consistent
        with the `input_ids`.

        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a
                batch.
            positions: Position indices for the input tokens.
            intermediate_tensors: Intermediate tensors from prior forward pass.
            inputs_embeds: Optional tensor of input embeddings.

        Info:
            [`LlavaImageInputs`][vllm.model_executor.models.llava.LlavaImageInputs]
        N)r>  )r  r  )rk   r;  r<  r=  r>  r   rq   rP   rP   rQ   rs     s   -z%LlavaForConditionalGeneration.forwardrq   c                 C   s   | j |S rp   )r  compute_logits)rk   rq   rP   rP   rQ   r?    s   z,LlavaForConditionalGeneration.compute_logitsweightsc                 C   s   t | }|j|| jdS )N)mapper)r:   load_weightshf_to_vllm_mapper)rk   r@  loaderrP   rP   rQ   rB    s   z*LlavaForConditionalGeneration.load_weightsc                 C   s   t jddddS )z<
        Get the module prefix in multimodal models
        r  r  r  )r  	connectortower_model)r6   from_string_fieldr   rP   rP   rQ   get_mm_mapping  s
   z,LlavaForConditionalGeneration.get_mm_mappingr   c                 C      |S rp   rP   )rk   r   rP   rP   rQ   get_num_mm_encoder_tokens  r   z7LlavaForConditionalGeneration.get_num_mm_encoder_tokensnum_vision_tokensc                 C   rI  rp   rP   )rk   rK  rP   rP   rQ   get_num_mm_connector_tokens  r   z9LlavaForConditionalGeneration.get_num_mm_connector_tokensNN)*rI   rJ   rK   packed_modules_mappingr;   rC  classmethodru   rt   r  r   r  r  r   rg   r   r\   r.  r0   r9   r8   rN   rO   rV   r0  rA   rR   r2  r9  r1   r:  r,   rs   r?  r   setrB  r6   rH  rJ  rL  rw   rP   rP   rl   rQ   r     s    	
 2
"




6
$

r   c                   @   r   )MantisProcessingInfor   c                 K   sB   |   }|  }|d|  |d|j | jjtfi |S )Nr   r{   )r   r@   
setdefaultr   r{   r   r   r   )rk   r   r   vision_inforP   rP   rQ   r     s   z%MantisProcessingInfo.get_hf_processorNr   rP   rP   rP   rQ   rQ    r   rQ  c                       s\   e Zd Z		d
deee B dedeeef deeef dB de	dB de
f fdd	Z  ZS )MantisMultiModalProcessorNr   r   r   tokenization_kwargsmm_uuidsro   c                    s   | j  }|j}| j jddd t j|||||d}| }	|d }
|d }dtf fdd}| t	d	|g  |d
g|	}| 
|d |\}}| |||
}| ||}| ||	 dd | D }td||
||dS )Nr   r   )rV  r   	mm_hashesr   c                    s    d d| d  dd  dgS )Nr^   z(image r/   z	: <Image>r	  z	</Image>))join)r   r   rP   rQ   get_replacement_mantis   s   z?MantisMultiModalProcessor.apply.<locals>.get_replacement_mantisr   r   prompt_token_idsc                 S   s    i | ]\}}|d d |D qS )c                 S   s   g | ]}|  qS rP   )to_range)r   itemrP   rP   rQ   r   B  s    z>MantisMultiModalProcessor.apply.<locals>.<dictcomp>.<listcomp>rP   )r   r   placeholdersrP   rP   rQ   
<dictcomp>A  s    z3MantisMultiModalProcessor.apply.<locals>.<dictcomp>
multimodal)rC   r[  r   rW  mm_placeholders)r   r   rz   r   rf   applyget_all_countsrt   _bind_and_group_updatesr)   _apply_prompt_updates_get_mm_prompt_updates_find_mm_placeholders_validate_mm_placeholdersitemsr   )rk   r   r   r   rU  rV  r   r   resultmm_item_countsr   rW  rZ  mantis_mm_repls
prompt_idsrr   
orig_replsra  mm_placeholder_rangesrl   rY  rQ   rb    s^   
	zMantisMultiModalProcessor.applyrM  )rI   rJ   rK   ru   rV   rt   r$   r   r   r    r   rb  rw   rP   rP   rl   rQ   rT     s     

rT  c                   @   s   e Zd ZdS )MantisForConditionalGenerationN)rI   rJ   rK   rP   rP   rP   rQ   rp  Q  s    rp  )wabcr   collections.abcr   r   r   typingr   r   r   r	   r
   r   rN   torch.nnnntransformersr   r   r   r   r   r   transformers.models.llavar   transformers.models.pixtralr   vllm.configr   vllm.config.multimodalr   %vllm.model_executor.layers.activationr   !vllm.model_executor.layers.linearr   r   'vllm.model_executor.layers.quantizationr   vllm.multimodalr   vllm.multimodal.cacher   vllm.multimodal.inputsr   r   r   r   r    vllm.multimodal.parser!   r"   r#   r$   vllm.multimodal.processingr%   r&   r'   r(   r)   r*   r+   vllm.sequencer,   vllm.utils.tensor_schemar-   r.   clipr0   
interfacesr1   r2   r3   r4   r5   module_mappingr6   r(  r7   r8   siglipr9   utilsr:   r;   r<   r=   r>   visionr?   r@   rA   rR   rW   r\   rM   Moduler]   rx   r}   r   r   r   r   r   r   r   r   r   r   rt   r   rv   ru   r   register_processorr   rQ  rT  rp  rP   rP   rP   rQ   <module>   s     $	

#-,O



)
 }Q