o
    
۾i`                     @   s  d Z ddlmZmZ ddlmZ ddlmZmZ ddl	Z	ddl
mZ ddlmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZm Z  ddl!m"Z" ddl#m$Z$m%Z%m&Z& ddl'm(Z(m)Z) ddl*m+Z+m,Z,m-Z-m.Z. ddl/m0Z0 ddl1m2Z2 ddl3m4Z4m5Z5 ddl6m7Z7m8Z8m9Z9 dZ:dZ;g dZ<dddddZ=ddd d dZ>G d!d" d"e4Z?G d#d$ d$e4Z@G d%d& d&e	jjAZBG d'd( d(e-ZCG d)d* d*e+eC ZDG d+d, d,e,eC ZEe"jFeEeCeDd-G d.d/ d/ejAe8e9ZGdS )0zPyTorch Ovis model.    )IterableMapping)partial)	AnnotatedLiteralN)BaseImageProcessorBatchFeaturePretrainedConfig)
VllmConfig)BaseDummyOptions)ReplicatedLinear)QuantizationConfig)VisualEmbedding)Siglip2NavitModel)AutoWeightsLoader
flatten_bninit_vllm_registered_modelmaybe_prefix)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacement)IntermediateTensors)Ovis2_5Processor)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsMultiModal
SupportsPPz<image>z<video>)iiiiz	<unused0>z<|reserved_special_token_0|>z<|image_pad|>)gemma2llamaqwen2qwen3   i igP c                   @   |   e Zd ZU dZed ed< eeje	ddf ed< eeje	df ed< ee
e e	d	f ed
< eeje	d	df ed< dS )Ovis2_5ImagePatchInputsa  
    Dimensions:
        - bnp: Batch size * number of images * number of patches
        - patch_size: patch_size_x * patch_size_y * num_channels
        - patch_indicators: Batch size * (number of patches + 1)
        - bn: Batch size * number of images
    image_patchestypebnp
patch_size	flat_datapatch_indicatorsindicator_tokensbnpatches_per_item   gridsN__name__
__module____qualname____doc__r   __annotations__r   torchTensorr!   listint rB   rB   V/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/ovis2_5.pyr,   >      
 r,   c                   @   r+   )Ovis2_5VideoPatchInputsa  
    Dimensions:
        - bnp: Batch size * number of videos * number of patches
        - patch_size: patch_size_x * patch_size_y * num_channels
        - patch_indicators: Batch size * (number of patches + 1)
        - bn: Batch size * number of videos
    video_patchesr.   r/   r0   r1   r2   r3   r4   r5   r6   r7   Nr8   rB   rB   rB   rC   rE   O   rD   rE   c                	       s   e Zd ZdZ		ddedededB def fdd	Z		ddededB defd
dZ	e
dejfddZe
dejfddZdejdejfddZdejdejdejfddZdejdejdejfddZ  ZS )VisualTokenizerz
    VIT
    N configvisual_vocab_sizequant_configprefixc                    sl   t    || _| j||| dd| _|tt }tj	t
| jj| jjd  |dddtj|| _d S )Nz.vitrI   rK   rL      F)biasreturn_bias)super__init__rI   _init_backbonevitlenINDICATOR_IDSr>   nn
Sequentialr   hidden_sizehidden_stride	LayerNormhead)selfrI   rJ   rK   rL   head_dim	__class__rB   rC   rR   e   s"   


zVisualTokenizer.__init__c                 C   s*   |j }|dkrt|||dS td| )Nsiglip2_navitrM   z)Unsupported visual tokenizer model_type: )
model_typer   
ValueError)r]   rI   rK   rL   rb   rB   rB   rC   rS      s   zVisualTokenizer._init_backbonereturnc                 C      t | j jS N)nextr\   
parametersdtyper]   rB   rB   rC   ri         zVisualTokenizer.dtypec                 C   re   rf   )rg   r\   rh   devicerj   rB   rB   rC   rl      rk   zVisualTokenizer.devicelogitsc                 C   s   t j|dt jd|j}|S )N)dimri   )r>   softmaxfloat32tori   )r]   rm   tokensrB   rB   rC   tokenize   s   zVisualTokenizer.tokenizepixel_values	grid_thwsc                 C   s2   |  ||}|j\}}||| jjd  d}|S )NrN   rn   )rT   shapereshaperI   rZ   )r]   ru   rv   featuresseq_len_rB   rB   rC   encode   s   
zVisualTokenizer.encodec                 C   sB   |  ||}| |}| |}tjjj|dttfddd}|S )Nr   constant)modevalue)	r|   r\   rt   r>   rW   
functionalpadrU   rV   )r]   ru   rv   ry   rm   rs   rB   rB   rC   forward   s   


zVisualTokenizer.forward)NrH   )r9   r:   r;   r<   r	   rA   r   strrR   rS   propertyr>   ri   rl   r?   rt   r|   r   __classcell__rB   rB   r_   rC   rG   `   sR    


rG   c                   @   s  e Zd Zdd Zdd ZdefddZdefdd	Zde	ee
d
B f fddZdefddZddde
de
de
deee
f fddZde
fddZde
de
fddZde
de	ee
f de
fddZde
de
de
ded
B de
f
d d!Zde
de	ee
f de
fd"d#Zd
S )$Ovis2_5ProcessingInfoc                 C   s
   | j  S rf   )ctxget_hf_configrj   rB   rB   rC   r         
z#Ovis2_5ProcessingInfo.get_hf_configc                 K   s*   |   j}| jjt|  |j|j|jdS )N)image_pad_tokenr0   rZ   temporal_patch_size)	r   
vit_configr   get_hf_processorr   get_image_pad_tokenr0   rZ   r   )r]   kwargsr   rB   rB   rC   r      s   
z&Ovis2_5ProcessingInfo.get_hf_processorrd   c                 C   s   |    }|j}t|S rf   )r   get_text_configrb   IMAGE_PAD_TOKEN_MAPget)r]   hf_text_configtext_model_typerB   rB   rC   r      s   
z)Ovis2_5ProcessingInfo.get_image_pad_tokenc                 C   s
   |   jS rf   )r   image_processorrj   rB   rB   rC   get_image_processor   r   z)Ovis2_5ProcessingInfo.get_image_processorNc                 C   s
   d ddS )Nr"   imagevideorB   rj   rB   rB   rC   get_supported_mm_limits   r   z-Ovis2_5ProcessingInfo.get_supported_mm_limitsc                 C   s   t dddS )Ni   )widthheight)r   rj   rB   rB   rC   !get_image_size_with_most_features   s   z7Ovis2_5ProcessingInfo.get_image_size_with_most_featuresr"   )
num_framesimage_widthimage_heightr   c                C   sZ   |   }|j}|j}|j}|| |  }t|| d}	|| }
|| }|	|
 | }|}|S )Nr"   )r   r   r0   r   max)r]   r   r   r   	hf_configr   r0   r   padded_num_framesgrid_tgrid_hgrid_wnum_patchesnum_vision_tokensrB   rB   rC   get_num_image_tokens   s   z*Ovis2_5ProcessingInfo.get_num_image_tokensc                 C   s   |   \}}| j||dS )N)r   r   )r   r   )r]   target_widthtarget_heightrB   rB   rC   get_max_image_tokens   s   z*Ovis2_5ProcessingInfo.get_max_image_tokens
max_tokensc                 C   s@   |   \}}d}	 |d }| j|||d d}||kr	 |S |}q	)Nr   Tr"   r   r   r   r   )r   get_num_video_tokens)r]   r   r   r   r   next_num_framesnext_max_tokensrB   rB   rC   _get_max_video_frames   s   z+Ovis2_5ProcessingInfo._get_max_video_framesrz   	mm_countsc                 C   sJ   | dd}| dd}|  | }| || }|t|d }t|dS )Nr   r   r   r"   )r   r   r   r   )r]   rz   r   
max_images
max_videosmax_image_tokensmax_total_framesmax_frames_per_videorB   rB   rC   !get_num_frames_with_most_features   s   
z7Ovis2_5ProcessingInfo.get_num_frames_with_most_featuresr   c                C   s   | j |||d}|S )N)r   r   r   )r   )r]   r   r   r   r   num_video_tokensrB   rB   rC   r     s   z*Ovis2_5ProcessingInfo.get_num_video_tokensc                 C   s&   |   \}}| j||| ||d dS )Nr   )r   r   r   )r]   rz   r   r   r   rB   rB   rC   get_max_video_tokens  s   
z*Ovis2_5ProcessingInfo.get_max_video_tokens)r9   r:   r;   r   r   r   r   r   r   r   rA   r   r   r   tupler   r   r   r   r   r   rB   rB   rB   rC   r      sX    







r   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )Ovis2_5DummyInputsBuilderr   rd   c                 C   s(   | dd}| dd}t| t|  S )Nr   r   r   )r   IMAGE_TOKENVIDEO_TOKEN)r]   r   
num_images
num_videosrB   rB   rC   get_dummy_text'  s   z(Ovis2_5DummyInputsBuilder.get_dummy_textNrz   
mm_optionsc                 C   s   | dd}| dd}| j \}}| j||}|r!| dnd }	|r*| dnd }
| j||||	d| j|||||
dd}|S )Nr   r   r   )r   r   r   	overrides)r   r   r   r   r   r   )r   infor   r   _get_dummy_images_get_dummy_videos)r]   rz   r   r   r   r   r   r   target_num_framesimage_overridesvideo_overridesmm_datarB   rB   rC   get_dummy_mm_data,  s.   z+Ovis2_5DummyInputsBuilder.get_dummy_mm_datarf   )
r9   r:   r;   r   r   rA   r   r   r   r   rB   rB   rB   rC   r   &  s    	
r   c                
       s   e Zd Zdee dee fddZdedeeef deeef deeef de	f
 fd	d
Z
dee dee fddZde	deeef deeef fddZdedeeef dedee fddZ  ZS )Ovis2_5MultiModalProcessorvisual_indicatorsrd   c                    s"   | j  }|j  fdd|D S )z|
        Filter image indicators placeholders and convert them to corresponding
        tokens in visual tokenizer.
        c                    s0   g | ]}|d k r t t t|d  d qS )ii,  r"   )rU   rV   abs.0xvte_vocab_sizerB   rC   
<listcomp>Z  s
    zQOvis2_5MultiModalProcessor.visual_indicators_to_visual_tokens.<locals>.<listcomp>)r   r   rJ   )r]   r   r   rB   r   rC   "visual_indicators_to_visual_tokensP  s
   

z=Ovis2_5MultiModalProcessor.visual_indicators_to_visual_tokenspromptr   	mm_kwargs
tok_kwargsc           
         s   |sj  }|j|dd}tt|gdddS t j||||d}j   d|v rF fdd	|d
 D }fdd	|D }	t	|	|d< d|v re fdd	|d D }fdd	|D }	t	|	|d< |S )NF)add_special_tokens)	input_idspt)tensor_type)r   r   r   r   videosc                       g | ]}  d dqS )r"   r"   r"   Tconstruct_visual_indicatorsr   gridhf_processorrB   rC   r   v      
zAOvis2_5MultiModalProcessor._call_hf_processor.<locals>.<listcomp>video_gridsc                       g | ]}  |qS rB   r   r   	indicatorrj   rB   rC   r   z      video_indicator_tokensimagesc                    r   )r   Fr   r   r   rB   rC   r     r   r7   c                    r   rB   r   r   rj   rB   rC   r     r   r3   )
r   get_tokenizerr|   r   dictrQ   _call_hf_processorr   r>   tensor)
r]   r   r   r   r   	tokenizer
prompt_idsprocessed_outputsr   r3   r_   )r   r]   rC   r   `  s8   





z-Ovis2_5MultiModalProcessor._call_hf_processorprompt_tokensc                 C   s   |S rf   rB   )r]   r   rB   rB   rC   _apply_hf_processor_tokens_only  s   z:Ovis2_5MultiModalProcessor._apply_hf_processor_tokens_only	hf_inputshf_processor_mm_kwargsc              	   C   s8   t tdtdtdtdtdtddS )Nr   r   )ru   r7   r3   video_pixel_valuesr   r   )r   r   batched)r]   r   r   rB   rB   rC   _get_mm_fields_config  s   z0Ovis2_5MultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc                    s&   dt ffdd  fdddD S )Nmodalityc                    sV   |dkr d |  }|d j }n|dkr d |  }|d j }j }||d S )Nr   r7   r   r   r   )datar   r   construct_visual_placeholders)item_idxr   out_itemr   r   )r   r]   rB   rC   get_replacement_ovis  s   

zLOvis2_5MultiModalProcessor._get_prompt_updates.<locals>.get_replacement_ovisc              	      s.   g | ]}t ||d krtntt |ddqS )r   )r   )r   targetreplacement)r   r   r   r   )r   r   )r  rB   rC   r     s    
zBOvis2_5MultiModalProcessor._get_prompt_updates.<locals>.<listcomp>r   )r   )r]   r   r   r   rB   )r  r   r]   rC   _get_prompt_updates  s   
z.Ovis2_5MultiModalProcessor._get_prompt_updates)r9   r:   r;   r@   rA   r   r   r   objectr   r   r   r   r   r   r   r   r  r   rB   rB   r_   rC   r   O  sL    



,




r   )r   dummy_inputsc                       s.  e Zd ZededededB fddZddd	ed
ef fddZde	de
dB fddZde	dedB fddZde
eB defddZde	defddZde	defddZ		d%dejdB dejdedB dejdB de	dejeB fddZdejdejdB fd d!Zd"eeeejf  dee fd#d$Z  ZS )&Ovis2_5r   ird   Nc                 C   s$   | drtS | drtS td)Nr   r   z)Only image or video modality is supported)
startswithr   r   rc   )clsr   r  rB   rB   rC   get_placeholder_str  s
   

zOvis2_5.get_placeholder_strrH   )rL   vllm_configrL   c                   s   t    |jj}|j}|| _| | t||j	t
|dd| _W d    n1 s-w   Y  | |ddh t|j|j|| dd| _t|j|j| _W d    n1 s[w   Y  | j j}t| | _|  j| _d S )Nllm)r  rL   r   r   z.visual_tokenizer)rI   rJ   rK   rL   )rQ   rR   model_configr   rK   rI   _mark_language_modelr   with_hf_configtext_configr   r  _mark_tower_modelrG   r   rJ   visual_tokenizerr   rY   vter   rb   IMAGE_PAD_TOKEN_ID_MAPimage_pad_token_idget_language_modelmake_empty_intermediate_tensors)r]   r  rL   rI   rK   r   r_   rB   rC   rR     s.   


	
zOvis2_5.__init__r   c              	         | dd }| dd }| dd }|d u r|d u rd S |d urb|d urbt|tjtfs5tdt| t|tjtfsFtdt| tdt|dd fd	d
|D t|ddt|dddS t	d)Nru   r3   r7   *Incorrect type of pixel values. Got type: .Incorrect type of indicator_tokens. Got type: r-   Tconcatc                    $   g | ]}|j d   jjjd  qS r   rN   rw   rI   r   rZ   r   rj   rB   rC   r          z;Ovis2_5._parse_and_validate_image_input.<locals>.<listcomp>r.   r1   r5   r3   r7    This line should be unreachable.)
pop
isinstancer>   r?   r@   rc   r.   r,   r   AssertionErrorr]   r   ru   r3   r7   rB   rj   rC   _parse_and_validate_image_input  4   



z'Ovis2_5._parse_and_validate_image_inputc              	      r  )Nr   r   r   r  r  rF   Tr   c                    r"  r#  r$  r   rj   rB   rC   r   "  r%  z;Ovis2_5._parse_and_validate_video_input.<locals>.<listcomp>r&  r'  )
r(  r)  r>   r?   r@   rc   r.   rE   r   r*  r+  rB   rj   rC   _parse_and_validate_video_input
  r-  z'Ovis2_5._parse_and_validate_video_inputvisual_inputc              	   C   s  |d }|d }|d }|d }t tdd |}| jj}| |||}| |}	| |}
|	j|dd}|
|}g }t||D ]>\}}g }|d}t	|j
d D ]}|tj|||d	  || gdd qU|||d	 d   |tj|dd qCt|S )
Nr1   r5   r3   r7   c                 S   s   | dkrdS | d S )Nr"   rN   rB   )r   rB   rB   rC   <lambda>5  s    z/Ovis2_5._process_visual_input.<locals>.<lambda>r   )ro   r"   )r@   mapr  ri   rr   r  splitzip	unsqueezerangerw   appendr>   catr   )r]   r/  image_patches_flatpatches_per_imager3   rv   indicator_per_imagetarget_dtypevisual_tokensvisual_embedsindicator_embedsvisual_embeds_per_imageindicator_embeds_per_imagevision_embeddingsr   visualvision_embeddings_per_imager  rB   rB   rC   _process_visual_input,  s8   




 zOvis2_5._process_visual_inputc                 K   sZ   i }|D ]&}|dv rd|vr| j di ||d< |dv r*d|vr*| jdi ||d< q|S )N)ru   r3   r7   r   )r   r   r   r   rB   )r,  r.  )r]   r   
modalities	input_keyrB   rB   rC   %_parse_and_validate_multimodal_inputsQ  s   z-Ovis2_5._parse_and_validate_multimodal_inputsc           	      K   sv   | j di |}|sg S d}|D ](}|dkr%|d }| |}|t|7 }|dkr8|d }| |}|t|7 }q|S )NrB   r   r   )rG  rD  r   )	r]   r   rE  multimodal_embeddingsr   image_inputimage_embeddingsvideo_inputvideo_embeddingsrB   rB   rC   embed_multimodale  s   

zOvis2_5.embed_multimodalr   	positionsintermediate_tensorsinputs_embedsc                 K   s"   |d urd }| j ||||d}|S )N)r   rN  rO  rP  )r  )r]   r   rN  rO  rP  r   hidden_statesrB   rB   rC   r   y  s   zOvis2_5.forwardrQ  c                 C   s   | j |S rf   )r  compute_logits)r]   rQ  rB   rB   rC   rR    s   zOvis2_5.compute_logitsweightsc                 C   s   t | }||S rf   )r   load_weights)r]   rS  loaderrB   rB   rC   rT    s   
zOvis2_5.load_weights)NN)r9   r:   r;   classmethodr   rA   r  r
   rR   r	  r,   r,  rE   r.  r#   rD  r   rG  rM  r>   r?   r   r   rR  r   r   setrT  r   rB   rB   r_   rC   r    sT    
"
"
%

,r  )Hr<   collections.abcr   r   	functoolsr   typingr   r   r>   torch.nnrW   transformersr   r   r	   vllm.configr
   vllm.config.multimodalr   !vllm.model_executor.layers.linearr   'vllm.model_executor.layers.quantizationr   vllm.model_executor.models.ovisr   'vllm.model_executor.models.siglip2navitr    vllm.model_executor.models.utilsr   r   r   r   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   vllm.multimodal.processingr   r   r   r   vllm.sequencer   *vllm.transformers_utils.processors.ovis2_5r   vllm.utils.tensor_schemar    r!   
interfacesr#   r$   r%   r   r   rV   r   r  r,   rE   ModulerG   r   r   r   register_processorr  rB   rB   rB   rC   <module>   s^   Vp)m