o
    ivO                     @   s&  d Z ddlZddlmZmZ ddlmZmZ ddlZddl	m
Z
 ddlmZ ddlmZmZmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z) ddl*m+Z+m,Z, ddl-m.Z.m/Z/m0Z0m1Z1 ddl2m3Z3 ddl4m5Z5 ddl6m7Z7m8Z8 ddl9m:Z:m;Z;m<Z< dZ=g dZ>ddddZ?ddd dZ@d!ejd"eAfd#d$ZBG d%d& d&ej
jCZDG d'd( d(e7ZEG d)d* d*ej
jFZGG d+d, d,e0ZHG d-d. d.e.eH ZIG d/d0 d0e/eH ZJe%jKeJeHeId1G d2d3 d3e
jCe;e<ZLdS )4zPyTorch Ovis model.    N)IterableMapping)	AnnotatedLiteral)Tensor)gumbel_softmaxpadsoftmax)BatchFeaturePretrainedConfig)
VllmConfig)BaseDummyOptions)ReplicatedLinear)QuantizationConfig)
AIMv2Model)SiglipVisionModel)AutoWeightsLoader
flatten_bninit_vllm_registered_modelmaybe_prefix)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacement)IntermediateTensors)OvisProcessor)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsMultiModal
SupportsPPz<image>)iiiiiz	<unused0>z<|reserved_special_token_0|>z<|image_pad|>)gemma2llamaqwen2   i igP y_softdimc                 C   s(   | j |dd}tj| tjd||dS )NT)keepdim)memory_formatg      ?)argmaxtorch
zeros_likelegacy_contiguous_formatscatter_)r,   r-   index r6   U/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/model_executor/models/ovis.py	st_argmaxN   s   
r8   c                	       s   e Zd Z		ddededB def fddZ		ddededB dedejfd	d
Z	e
dejfddZe
dejfddZdejdejfddZdejdejfddZdejdejfddZ  ZS )VisualTokenizerN configquant_configprefixc                    sn   t    || _| j||| dd| _|jtt }tj	
t|jj|j |j |dddtj	|| _d S )Nz	.backboner;   r<   r=   F)biasreturn_bias)super__init__r;   _init_backbonebackbone
vocab_sizelenIMAGE_INDICATOR_IDSr1   nn
Sequentialr   backbone_confighidden_sizehidden_stride	LayerNormhead)selfr;   r<   r=   head_dim	__class__r6   r7   rB   W   s*   


zVisualTokenizer.__init__returnc                 C   sH   |j j}|dkrt|j |d|dS |dkrt|j ||dS td| )Naimv2F)r;   r<   require_post_normr=   siglip_vision_modelr>   z)Unsupported visual tokenizer model_type: )rJ   
model_typer   r   
ValueError)rO   r;   r<   r=   rW   r6   r6   r7   rC   r   s   zVisualTokenizer._init_backbonec                 C      t | j jS N)nextrN   
parametersdtyperO   r6   r6   r7   r]         zVisualTokenizer.dtypec                 C   rY   rZ   )r[   rN   r\   devicer^   r6   r6   r7   r`      r_   zVisualTokenizer.devicelogitsc                 C   sl   | j jdkrt|dd}|S | j jdkrt|| j jdd}|S | j jdkr-t|dd}|S td| j j )	Nr	   r-   gumbel_argmaxT)tauhardr8   zLInvalid `max_type`, expected softmax or gumbel_argmax or st_argmax, but got )r;   tokenize_functionr	   r   re   r8   rX   )rO   ra   tokensr6   r6   r7   tokenize   s   
zVisualTokenizer.tokenizepixel_valuesc                 C   s  |  |}| jjr|d d dd d d f }| jjdkr|j\}}}t|d }|d |ks2J d|||||}| jj|| jj  | jj }t|ddd|d|fdd}||7 }|||| jj | jj|| jj | jj|}|dddddd	}|	d}||d
| jj| jj | }|S )Nr$   g      ?   z5The token sequence length should be a perfect square.r   constant         rb   )
rD   r;   drop_cls_tokenrL   shapeintreshaper   permuteflatten)rO   rj   featuresnLdsqrt_lplr6   r6   r7   encode   s:   


	
zVisualTokenizer.encodec                 C   s@   |  |}| |}| |}tjjj|dttfddd}|S )z8[BatchSize, ImageShape] -> [BatchSize, Token, VocabSize]r   rl   )modevalue)	r|   rN   ri   r1   rH   
functionalr   rF   rG   )rO   rj   rv   ra   rh   r6   r6   r7   forward   s   



zVisualTokenizer.forward)Nr:   )__name__
__module____qualname__r   r   strrB   rH   ModulerC   propertyr1   r]   r`   r   ri   r|   r   __classcell__r6   r6   rQ   r7   r9   V   s8    
)r9   c                   @   sh   e Zd ZU dZed ed< eeje	ddddf ed< eeje	d	f ed
< ee
e e	df ed< dS )OvisImagePatchInputsa  
    Dimensions:
        - bnp: Batch size * number of images * number of patches
        - h: Height of each patch
        - w: Width of each patch
        - patch_indicators: Batch size * (number of patches + 1)
        - bn: Batch size * number of images
    image_patchestypebnprm   hw	flat_datapatch_indicatorsindicator_tokensbnpatches_per_imageN)r   r   r   __doc__r   __annotations__r   r1   r   r#   listrr   r6   r6   r6   r7   r      s   
 	r   c                       sJ   e Zd Z fddZdedef fddZedd Zed	d
 Z  Z	S )VisualEmbeddingc                    s   t  j|i | d S rZ   )rA   rB   )rO   argskwargsrQ   r6   r7   rB      s   zVisualEmbedding.__init__visual_tokensrS   c                    s8   |j tjtjtjtjtjfv rt |S t	|| j
S rZ   )r]   r1   int8int16int32int64longrA   r   matmulweight)rO   r   rQ   r6   r7   r      s   zVisualEmbedding.forwardc                 C      | j jS rZ   )r   r`   r^   r6   r6   r7   r`         zVisualEmbedding.devicec                 C   r   rZ   )r   r]   r^   r6   r6   r7   r]      r   zVisualEmbedding.dtype)
r   r   r   rB   r   r   r   r`   r]   r   r6   r6   rQ   r7   r      s    
r   c                   @   s^   e Zd ZdefddZdefddZdefddZde	eed	B f fd
dZ
defddZd	S )OvisProcessingInfor   c                 K   s"   | j jtf|  |  d|S )N)image_pad_tokenimage_segment_len)ctxget_hf_processorr!   get_image_pad_tokenget_image_segment_len)rO   r   r6   r6   r7   r     s   z#OvisProcessingInfo.get_hf_processorrS   c                 C   s^   |   j}|jj}|jj}|j}t|| }|| dks'J d| d| || d d S )Nr   zpatch_grid_length z# is not divisible by hidden_stride rk   r$   )get_hf_configvisual_tokenizer_configrJ   
image_size
patch_sizerL   mathceil)rO   r   r   r   rL   patch_grid_lengthr6   r6   r7   r     s   
z(OvisProcessingInfo.get_image_segment_lenc                 C   s   |    }|j}t|S rZ   )r   get_text_configrW   IMAGE_PAD_TOKEN_MAPget)rO   hf_text_configtext_model_typer6   r6   r7   r     s   
z&OvisProcessingInfo.get_image_pad_tokenNc                 C   s   dd iS )Nimager6   r^   r6   r6   r7   get_supported_mm_limits  s   z*OvisProcessingInfo.get_supported_mm_limitsc                 C   s8   |    \}}|  jj}t|| d || d dS )N	   )widthheight)r   get_image_sizer   r   rL   r   )rO   r   r   hsr6   r6   r7   !get_image_size_with_most_features   s   z4OvisProcessingInfo.get_image_size_with_most_features)r   r   r   objectr   rr   r   r   r   r   r   r   r   r6   r6   r6   r7   r     s    r   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )OvisDummyInputsBuilder	mm_countsrS   c                 C   s   | dd}t| S )Nr   r   )r   IMAGE_TOKEN)rO   r   
num_imagesr6   r6   r7   get_dummy_text)  s   z%OvisDummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc           	      C   sF   | dd}| j \}}|r| dnd }d| j||||di}|S )Nr   r   )r   r   r   	overrides)r   infor   _get_dummy_images)	rO   r   r   r   r   target_widthtarget_heightimage_overridesmm_datar6   r6   r7   get_dummy_mm_data-  s   z(OvisDummyInputsBuilder.get_dummy_mm_datarZ   )
r   r   r   r   r   rr   r   r   r   r   r6   r6   r6   r7   r   (  s    
r   c                
       s   e Zd Zdee dee fddZdedeeef deeef deeef de	f
 fd	d
Z
dee dee fddZde	deeef deeef fddZdedeeef dedee fddZ  ZS )OvisMultiModalProcessorimage_indicatorsrS   c                    s$   | j  }|jj  fdd|D S )a  
        Filter image indicators placeholders and convert them to corresponding
        tokens in visual tokenizer.
        For example, [-301, -300, -302, -300, -303, -300, -304, -300, -305]
        should return [vocab_size-1, vocab_size-2, ..., vocab_size-5]
        c                    s    g | ]}|d k r | d qS )ii,  r6   .0xvte_vocab_sizer6   r7   
<listcomp>R  s     zMOvisMultiModalProcessor.image_indicators_to_visual_tokens.<locals>.<listcomp>)r   r   r   rE   )rO   r   	hf_configr6   r   r7   !image_indicators_to_visual_tokensE  s   

z9OvisMultiModalProcessor.image_indicators_to_visual_tokenspromptr   	mm_kwargs
tok_kwargsc           
         s   |sj  }|j|dd}tt|gdddS t j||||d}j    fdd|d	 D }fd
d|D }	t	|	|d< |S )NF)add_special_tokens)	input_idspt)tensor_type)r   r   r   r   c                       g | ]}  |qS r6   )construct_image_indicators)r   grid)hf_processorr6   r7   r   i      z>OvisMultiModalProcessor._call_hf_processor.<locals>.<listcomp>gridsc                    r   r6   )r   )r   	indicatorr^   r6   r7   r   m  r   r   )
r   get_tokenizerr|   r
   dictrA   _call_hf_processorr   r1   tensor)
rO   r   r   r   r   	tokenizer
prompt_idsprocessed_outputsr   r   rQ   )r   rO   r7   r   T  s&   



z*OvisMultiModalProcessor._call_hf_processorprompt_tokensc                 C   s   |S rZ   r6   )rO   r   r6   r6   r7   _apply_hf_processor_tokens_onlyt  s   z7OvisMultiModalProcessor._apply_hf_processor_tokens_only	hf_inputshf_processor_mm_kwargsc                 C   s    t tdtdtddS )Nr   )rj   r   r   )r   r   batched)rO   r   r   r6   r6   r7   _get_mm_fields_configz  s
   z-OvisMultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc                    s$   dt f fdd}tdt|dgS )Nitem_idxc                    s*    d |  }|d j }j }||S )Nr   r   )datar   r   construct_image_placeholders)r   out_itemr   r   r   rO   r6   r7   get_replacement_ovis  s   


zIOvisMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_ovisr   )modalitytargetreplacement)rr   r   r   )rO   r   r   r   r   r6   r   r7   _get_prompt_updates  s   z+OvisMultiModalProcessor._get_prompt_updates)r   r   r   r   rr   r   r   r   r   r
   r   r   r   r   r   r   r   r   r   r6   r6   rQ   r7   r   D  sL    



 




r   )r   dummy_inputsc                       s  e Zd ZededededB fddZddd	ed
ef fddZde	de
dB fddZde
defddZde	defddZ		d!dejdB dejdedB dejdB de	dejeB fddZdejdejdB fddZdeeeejf  dee fdd Z  ZS )"Ovisr   irS   Nc                 C   s   | drtS td)Nr   z Only image modality is supported)
startswithr   rX   )clsr   r  r6   r6   r7   get_placeholder_str  s   
zOvis.get_placeholder_strr:   )r=   vllm_configr=   c                   s   t    |jj}|j}|| _| | t||	 t
|dd| _W d    n1 s.w   Y  | |d t|j|| dd| _t| jjj| jj| _W d    n1 s[w   Y  | j	 j}t| | _|  j| _d S )Nllm)r  r=   r   z.visual_tokenizerr>   )rA   rB   model_configr   r<   r;   _mark_language_modelr   with_hf_configr   r   r  _mark_tower_modelr9   r   visual_tokenizerr   rE   rK   vterW   IMAGE_PAD_TOKEN_ID_MAPimage_pad_token_idget_language_modelmake_empty_intermediate_tensors)rO   r  r=   r;   r<   r   rQ   r6   r7   rB     s0   



zOvis.__init__r   c                 K   s   | dd }| dd }|d u r|d u rd S |d urU|d urUt|tjtfs/tdt| t|tjtfs@tdt| tdt|dddd	 |D t|ddd
S t	d)Nrj   r   z*Incorrect type of pixel values. Got type: z.Incorrect type of indicator_tokens. Got type: r   T)concatc                 S   s   g | ]}|j d  qS )r   )rq   r   r6   r6   r7   r     s    z8Ovis._parse_and_validate_image_input.<locals>.<listcomp>)r   r   r   r   z This line should be unreachable.)
pop
isinstancer1   r   r   rX   r   r   r   AssertionError)rO   r   rj   r   r6   r6   r7   _parse_and_validate_image_input  s,   

z$Ovis._parse_and_validate_image_inputimage_inputc              	   C   s   |d }|d }|d }t tdd |}| jj}| ||}| |}| |}	|	|}
|j|dd}g }t|
|D ]9\}}g }t|j	d D ]}|
tj|||d  || gdd qK|
||d d   |
tj|dd q>t|S )	Nr   r   r   c                 S   s   | dkr| d S | d S )Nr$   rk   r6   )r   r6   r6   r7   <lambda>  s    z+Ovis._process_image_input.<locals>.<lambda>r   rc   r$   )r   mapr  r]   tor  splitziprangerq   appendr1   cattuple)rO   r  image_patches_flatr   r   indicator_per_imagetarget_dtyper   visual_embedsindicator_embedsindicator_embeds_per_imagevisual_embeds_per_imagevision_embeddingsr   visualvision_embeddings_per_imager  r6   r6   r7   _process_image_input  s0   


 zOvis._process_image_inputc                 K   s*   | j di |}|d u rg S | |}|S )Nr6   )r  r*  )rO   r   r  image_featuresr6   r6   r7   embed_multimodal  s
   
zOvis.embed_multimodalr   	positionsintermediate_tensorsinputs_embedsc                 K   s"   |d urd }| j ||||d}|S )N)r   r-  r.  r/  )r  )rO   r   r-  r.  r/  r   hidden_statesr6   r6   r7   r     s   zOvis.forwardr0  c                 C   s   | j |S rZ   )r  compute_logits)rO   r0  r6   r6   r7   r1  #  s   zOvis.compute_logitsweightsc                 C   s   t | }||S rZ   )r   load_weights)rO   r2  loaderr6   r6   r7   r3  )  s   
zOvis.load_weights)NN)r   r   r   classmethodr   rr   r  r   rB   r   r   r  r%   r*  r,  r1   r   r    r   r1  r   r  setr3  r   r6   r6   rQ   r7   r     sH    

!

,r   )Mr   r   collections.abcr   r   typingr   r   r1   torch.nnrH   r   torch.nn.functionalr   r   r	   transformersr
   r   vllm.configr   vllm.config.multimodalr   !vllm.model_executor.layers.linearr   'vllm.model_executor.layers.quantizationr    vllm.model_executor.models.aimv2r   !vllm.model_executor.models.siglipr    vllm.model_executor.models.utilsr   r   r   r   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   vllm.multimodal.processingr   r   r   r   vllm.sequencer    'vllm.transformers_utils.processors.ovisr!   vllm.utils.tensor_schemar"   r#   
interfacesr%   r&   r'   r   rG   r   r  rr   r8   r   r9   r   	Embeddingr   r   r   r   register_processorr   r6   r6   r6   r7   <module>   s`    &W