o
    
۾i@                     @   s<  U d dl Z d dlmZmZmZ d dlmZmZmZm	Z	m
Z
 d dlZd dlmZ d dlmZmZmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZm Z  d dl!m"Z"m#Z#m$Z$m%Z% d dl&m'Z'm(Z( d dl)m*Z* d dl+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2m3Z3 ddl4m5Z5m6Z6 ddl7m8Z8m9Z9m:Z: ddl;m<Z< ddl=m>Z>m?Z?m@Z@mAZA dZBG dd de,ZCG dd de,ZDG dd de,ZEeDeEB ZFe
eGd< eFeCB ZHe
eGd< G dd  d e9e	ZIG d!d" d"e:ZJG d#d$ d$e5eJ ZKG d%d& d&e8eJ ZLG d'd( d(ejMZNejOeLeJeKd)G d*d+ d+ejMe2e3ZPdS ),    N)IterableMappingSequence)	AnnotatedFinalLiteralProtocol	TypeAlias)BatchFeatureLlavaOnevisionConfigLlavaOnevisionProcessor)get_anyres_image_grid_shapeunpad_image)
VllmConfig)BaseDummyOptions)
get_act_fn)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)	ImageSizeMultiModalDataItemsVideoEmbeddingItemsVideoProcessorItems)PromptReplacementPromptUpdate)IntermediateTensors)TensorSchemaTensorShape   )CLIPVisionModel)MultiModalEmbeddingsSupportsMultiModal
SupportsPP)LlavaDummyInputsBuilderinit_vision_tower_for_llava) BaseLlavaNextMultiModalProcessorLlavaNextLikeConfigLlavaNextProcessingInfo)SiglipVisionModel)AutoWeightsLoaderWeightsMapperinit_vllm_registered_modelmaybe_prefix   c                
   @   sP   e Zd ZU dZdZed ed< eej	e
ej	 B eddddddhd	f ed< d
S )LlavaOnevisionVideoPixelInputsal  
    Dimensions:
        - bn: Batch size * number of videos
        - f: Number of frames
        - c: Number of channels (3)
        - h: Height
        - w: Width

        Note that `f` may be different for each batch, and 'num_frames'
        may be different for each video, in which case the data is passed as a
        list instead of a batched tensor.
    pixel_values_videostypebnf   hwdynamic_dimsN__name__
__module____qualname____doc__r1   r   __annotations__r   torchTensorlistr    rB   rB   ^/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/llava_onevision.pyr/   7   s   
 r/   c                
   @   sl   e Zd ZU dZdZed ed< eej	e
ej	 B eddddddhd	f ed< eej	d
B eddf ed< d
S )LlavaOnevisionImagePixelInputsaU  
    Dimensions:
        - bn: Batch size * number of images
        - np: Number of patches (1 + num_patches)
        - c: Number of channels (3)
        - h: Height
        - w: Width

        Note that `num_patches` may be different per batch and image,
        in which case the data is passed as a list instead of a batched tensor.
    pixel_valuesr1   r2   npr4   r5   r6   r7   N   image_sizesr9   rB   rB   rB   rC   rD   M   s   
  rD   c                   @   s<   e Zd ZU dZdZed ed< eej	e
dddf ed< dS )	"LlavaOnevisionImageEmbeddingInputsz
    Dimensions:
        - bn: Batch size * number of images
        - ifs: Image feature size
        - hs: Hidden size (must match language model backbone)
    image_embedsr1   r2   ifshsdataN)r:   r;   r<   r=   r1   r   r>   r   r?   r@   r   rB   rB   rB   rC   rI   d   s   
 
rI   LlavaOnevisionImageInputsLlavaOnevisionMultiInputsc                   @   s   e Zd ZU ee ed< dS )LlavaOnevisionLikeConfigvideo_token_indexN)r:   r;   r<   r   intr>   rB   rB   rB   rC   rP   }   s   
 rP   c                   @   s   e Zd ZdefddZdefddZdeee	dB f fdd	Z
d
e	de	de	de	de	dee	e	f fddZdefddZde	de	de	fddZde	de	de	de	fddZde	de	fddZde	deee	f de	fdd Zde	deee	f de	fd!d"ZdS )#LlavaOnevisionProcessingInforeturnc                 C   s   | j tS N)ctxget_hf_configr   selfrB   rB   rC   rW      s   z*LlavaOnevisionProcessingInfo.get_hf_configkwargsc                 K   s   | j jtfi |S rU   )rV   get_hf_processorr   )rY   rZ   rB   rB   rC   r[      s   z-LlavaOnevisionProcessingInfo.get_hf_processorNc                 C   s
   d d dS )NimagevideorB   rX   rB   rB   rC   get_supported_mm_limits   s   
z4LlavaOnevisionProcessingInfo.get_supported_mm_limitsoriginal_heightoriginal_widthnpatchesnum_patch_heightnum_patch_widthc                C   s   || }|| }|| }|| }	||	kr,t t|||  d}
||
 d }|d|  }nt t|||  d}|| d }|d|  }|| }|}t|| d|d   }|dkrlt || }t || }|| }|}||fS )N   rG   	   皙?)rR   roundmathsqrt)rY   r`   ra   rb   rc   rd   current_heightcurrent_widthaspect_ratiocurrent_aspect_ratio
new_heightpadding	new_widthunpadded_featuresnewline_featuresratioheight_factorwidth_factorrB   rB   rC   _get_num_unpadded_features   s0   	z7LlavaOnevisionProcessingInfo._get_num_unpadded_featuresc                 C   s   t dddS )Ni  i  )widthheight)r   rX   rB   rB   rC   !get_image_size_with_most_features   s   z>LlavaOnevisionProcessingInfo.get_image_size_with_most_featuresimage_widthimage_heightc                C   s:   |   }t|dd}|  }| }t|| }|| S )Nspatial_pool_striderG   )rW   getattrget_vision_encoder_infoget_patch_grid_lengthri   ceil)rY   r{   r|   	hf_configr}   vision_encoder_infopatch_grid_lengthpooled_grid_lengthrB   rB   rC   _get_num_frame_tokens   s   z2LlavaOnevisionProcessingInfo._get_num_frame_tokens
num_framesc                C   s   | j ||d}|| d S )N)r{   r|   r   )r   )rY   r{   r|   r   num_frame_tokensrB   rB   rC   get_num_video_tokens   s
   z1LlavaOnevisionProcessingInfo.get_num_video_tokens
max_tokensc                 C   s>   |   \}}d}	 |d }| j|||d}||kr	 |S |}q	)Nr   Tr   r{   r|   r   )rz   r   )rY   r   target_widthtarget_heightr   next_num_framesnext_max_tokensrB   rB   rC   _get_max_video_frames   s   z2LlavaOnevisionProcessingInfo._get_max_video_framesseq_len	mm_countsc                 C   s4   | dd}| |}t|t|d t}t|dS )Nr^   r   r   )getr   minmax_MAX_FRAMES_PER_VIDEO)rY   r   r   
max_videosmax_total_framesmax_frames_per_videorB   rB   rC   !get_num_frames_with_most_features   s   

z>LlavaOnevisionProcessingInfo.get_num_frames_with_most_featuresc                 C   s$   |   \}}| j||| ||dS )Nr   )rz   r   r   )rY   r   r   r   r   rB   rB   rC   get_max_video_tokens   s   
z1LlavaOnevisionProcessingInfo.get_max_video_tokens)r:   r;   r<   rP   rW   objectr[   r   strrR   r_   tuplerw   r   rz   r   r   r   r   r   rB   rB   rB   rC   rS      sb    

(




rS   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS ) LlavaOnevisionDummyInputsBuilderr   rT   c                 C   s>   | dd}| dd}| j }|j}|j}|| ||  S )Nr]   r   r^   )r   infor[   image_tokenvideo_token)rY   r   
num_images
num_videos	processorr   r   rB   rB   rC   get_dummy_text	  s   
z/LlavaOnevisionDummyInputsBuilder.get_dummy_textNr   
mm_optionsc                 C   s   | dd}| dd}| j \}}| j||}|r!| dnd }	|r*| dnd }
| j||||	d| j|||||
ddS )Nr]   r   r^   )rx   ry   r   	overrides)rx   ry   r   r   r   r\   )r   r   rz   r   _get_dummy_images_get_dummy_videos)rY   r   r   r   r   r   r   r   target_num_framesimage_overridesvideo_overridesrB   rB   rC   get_dummy_mm_data  s,   z2LlavaOnevisionDummyInputsBuilder.get_dummy_mm_datarU   )
r:   r;   r<   r   r   rR   r   r   r   r   rB   rB   rB   rC   r     s    
r   c                
       s   e Zd Zdedeeef deeef fddZdedeeef deeef d	eeef def
 fd
dZ	dede
deeef deeef def
 fddZde
deeef dedee f fddZ  ZS )!LlavaOnevisionMultiModalProcessor	hf_inputshf_processor_mm_kwargsrT   c                 C   s(   t tdtdtdtddS )Nr]   r^   )rE   rH   rJ   r0   )dictr   batched)rY   r   r   rB   rB   rC   _get_mm_fields_config8  s   z7LlavaOnevisionMultiModalProcessor._get_mm_fields_configpromptmm_data	mm_kwargs
tok_kwargsc                    s  t |}|dg }t|tsJ |st j||||dS | j }|j}|j	}t j|i ||d}	|dg }
t|
ts?J |
r[t j|t
|
 d|
i||d}dd | D }ni }g }|D ]}t j|d|i||d}||d d  qad|i}t |	fi ||}t|S )Nvideos)r   r   r   r   imagesc                 S   s   i | ]\}}|d v r||qS ))rE   rH   rB   ).0kvrB   rB   rC   
<dictcomp>o  s
    zHLlavaOnevisionMultiModalProcessor._call_hf_processor.<locals>.<dictcomp>r0   r   )r   pop
isinstancerA   super_call_hf_processorr   r[   r   r   lenitemsappendr
   )rY   r   r   r   r   r   r   r   r   text_outputsr   processor_outputsimage_outputsr0   r^   item_outputsvideo_outputscombined_outputs	__class__rB   rC   r   D  sd   

z4LlavaOnevisionMultiModalProcessor._call_hf_processorprompt_textmm_itemstokenization_kwargsc                    s*   t  j||||d}|o|jddddkS )N)r   r   r   r   r^   F)strictr   )r   _hf_processor_applies_updates	get_count)rY   r   r   r   r   base_resultr   rB   rC   r     s   z?LlavaOnevisionMultiModalProcessor._hf_processor_applies_updatesout_mm_kwargsc                    sP   t  j ||d}j }|jdtf fdd}g |tdg|dS )N)r   r   r   item_idxc                    sV     dttf}t|tr|| }n|| }jj|j|j	|
| d}g| S )Nr^   r   )	get_itemsr   r   r   get_feature_sizeget_frame_sizer   r   rx   ry   get_num_frames)r   r   num_video_tokens
image_sizer   rY   video_token_idrB   rC   get_video_replacement  s   


zTLlavaOnevisionMultiModalProcessor._get_prompt_updates.<locals>.get_video_replacementr^   )modalitytargetreplacement)r   _get_prompt_updatesr   rW   rQ   rR   r   )rY   r   r   r   image_replsr   r   r   r   rC   r     s"   
z5LlavaOnevisionMultiModalProcessor._get_prompt_updates)r:   r;   r<   r
   r   r   r   r   r   r   r   boolr   r   r   r   r   __classcell__rB   rB   r   rC   r   5  sN    





G


r   c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )!LlavaOnevisionMultiModalProjectorconfigc                    sR   t    tj|jj|jj|jd| _t	|j
| _tj|jj|jj|jd| _d S )N)bias)r   __init__nnLinearvision_confighidden_sizetext_configmultimodal_projector_biaslinear_1r   projector_hidden_actactlinear_2)rY   r   r   rB   rC   r     s   
z*LlavaOnevisionMultiModalProjector.__init__image_featuresrT   c                 C   s"   |  |}| |}| |}|S rU   )r   r   r   )rY   r   hidden_statesrB   rB   rC   forward  s   


z)LlavaOnevisionMultiModalProjector.forward)	r:   r;   r<   r   r   r?   r@   r   r   rB   rB   r   rC   r     s    r   )r   dummy_inputsc                       s  e Zd ZedddddddZeded	ed
edB fddZddde	ded
df fddZ
ded
edB fddZded
edB fddZded
efddZdeeB dejd
ejfddZddd d!ejd"ejd#ed
ejfd$d%Zd&ed
ejeej B fd'd(Zd)ed
ejeej B fd*d+ZdeeB dejd
ejfd,d-Zd&efd.d/ZdCd1ejd2efd3d4Zded
efd5d6Z 		dDd7ejdB d8ejd9e!dB d:ejdB ded
eje!B fd;d<Z"d=ejd
ejdB fd>d?Z#d@e$e%eejf  d
e&e fdAdBZ'  Z(S )E&LlavaOnevisionForConditionalGenerationzlanguage_model.model.zvision_tower.zmulti_modal_projector.image_newlinezlanguage_model.lm_head.)zmodel.language_model.zmodel.vision_tower.zmodel.multi_modal_projector.zmodel.image_newlinezlm_head.)orig_to_new_prefixr   irT   Nc                 C   s$   | drdS | drdS td)Nr]   z<image>r^   z<video>z)Only image or video modality is supported)
startswith
ValueError)clsr   r   rB   rB   rC   get_placeholder_str  s
   

z:LlavaOnevisionForConditionalGeneration.get_placeholder_str )prefixvllm_configr  c                   s   t    |jj}|j}|jj}|| _|| _| |ddh$ t||dt	|dd| _
tt|jj| _t|| _W d    n1 sEw   Y  | | t||jt	|dd| _W d    n1 sfw   Y  | jjj| _d S )Nr]   r^   Fvision_tower)quant_configrequire_post_normr  language_model)r  r   r  )r   r   model_configr   r  multimodal_configr   _mark_tower_modelr%   r-   r  r   	Parameterr?   emptyr   r   r   r   multi_modal_projector_mark_language_modelr,   r  modelmake_empty_intermediate_tensors)rY   r  r  r   r  r	  r   rB   rC   r     s6   

z/LlavaOnevisionForConditionalGeneration.__init__rZ   c                 K   s~   | dd }| dd }| dd }|d u r|d u rd S |d ur1td||| jjj| jjjddS |d ur;td|dS td)NrE   rH   rJ   r5   r6   )r1   rE   rH   resolve_bindings)r1   rM   z This line should be unreachable.)r   rD   r   r   r   rI   AssertionError)rY   rZ   rE   rH   rJ   rB   rB   rC   _parse_and_validate_image_input  s(   
zFLlavaOnevisionForConditionalGeneration._parse_and_validate_image_inputc                 K   s8   | dd}|du rdS td|| jjj| jjjddS )z
        A legal video input should have the following dimensions:
        {
            "pixel_values_videos" :
                list[b, Tensor(nb_frames, nb_channels, height, width)]
        }
        r0   Nr  )r1   r0   r  )r   r/   r   r   r   )rY   rZ   r0   rB   rB   rC   _parse_and_validate_video_input4  s   
zFLlavaOnevisionForConditionalGeneration._parse_and_validate_video_inputc                 K   sZ   i }|D ]&}|dv rd|vr| j di ||d< |dv r*d|vr*| jdi ||d< q|S )N)rE   rJ   r]   )r0   video_embedsr^   rB   )r  r  )rY   rZ   mm_input_by_modality	input_keyrB   rB   rC   %_parse_and_validate_multimodal_inputsK  s   

zLLlavaOnevisionForConditionalGeneration._parse_and_validate_multimodal_inputsr  rE   c                 C   s   ||| j jdS N)feature_select_strategy)r   vision_feature_select_strategy)rY   r  rE   rB   rB   rC   _image_pixels_to_featuresb  s   z@LlavaOnevisionForConditionalGeneration._image_pixels_to_featuresanyres_max_9)r   vision_aspect_ratior   patch_embeddingsstrategyc                C   sB  |dkr
| ddS |dr| jjj| jjj  }}|d }|| |jd kr-td|jd dkr|dd  }	| \}
}t	|
|f| jj
| jjj\}}|| }|	d | ||||d}	d|v r|	ddd	dd
  dd	 d	d
}	t|	|
|f}	t|d}|	j\}}}t|| ||d	   }|dkr|	d  }	tjj|	t|| t|| gddd }	|d urtj|	|d d d d f jg |	jd d dR  |	jfdd}	|	 dd	dd}	n|	dd	dd
d  dd
}	tj||	fdd}|S d|v rtj|| jd  |jfdd}|S |}|S td| )Nflatr   r   spatialz<The number of patches is not consistent with the image size.unpad   rG   r4   anyres_max_rg   bilinear)modedimz!Unexpected patch merge strategy: )flattenr   r   r   r   
patch_sizeshaper   tolistr   image_grid_pinpointsviewpermute
contiguousr   rR   removeprefixri   rj   r   
functionalinterpolater?   catexpandtodevice	transposer   )rY   r   r   r   r  r!  ry   rx   base_patch_embedsother_patch_embedsorig_height
orig_widthrc   rd   num_patchesmax_num_patcheschannelscurr_height
curr_widthrt   merged_patch_embeddingsrB   rB   rC   _merge_image_patch_embeddingso  s   	
	

zDLlavaOnevisionForConditionalGeneration._merge_image_patch_embeddingsinputsc                    s   |d }t |tjr7|j\}}}}}||| |||}  j|}	 |	}
|
j||g|
jdd  R  S dd |D }t|}  j|}	 fddt	|	|D S )NrE   r   c                 S   s   g | ]}|j d  qS )r   )r.  )r   r   rB   rB   rC   
<listcomp>  s    zPLlavaOnevisionForConditionalGeneration._process_image_pixels.<locals>.<listcomp>c                    s   g | ]}  |qS rB   )r  )r   r   rX   rB   rC   rH    s    )
r   r?   r@   r.  r1  r  r  r  r7  split)rY   rG  rE   br@  cr5   r6   stacked_pixel_valuesstacked_image_featuresstacked_patch_embeddingsnum_patches_per_batchrB   rX   rC   _process_image_pixels  s2   

z<LlavaOnevisionForConditionalGeneration._process_image_pixelsimage_inputc                    s   |d dkr
|d S  |}|dd u r6t|d }jj}|j  t fddt|D fddt	|D S )	Nr1   rJ   rM   rH   rE   c                    s   g | ]} gqS rB   rB   )r   _)default_heightdefault_widthrB   rC   rH  	      zOLlavaOnevisionForConditionalGeneration._process_image_input.<locals>.<listcomp>c                    s(   g | ]\}}j  | |jd dqS )spatial_unpad)r   r!  )rF  r   )r   r   patch_features_batch)rH   rY   rB   rC   rH    s    )
rP  r   r   r   r   r   r?   	as_tensorrange	enumerate)rY   rQ  r   
batch_sizer   rB   )rS  rT  rH   rY   rC   _process_image_input  s   


z;LlavaOnevisionForConditionalGeneration._process_image_inputc                 C   s(   ||| j jd}| |}| |}|S r  )r   r  r  apply_pooling)rY   r  rE   video_featuresrB   rB   rC   _video_pixels_to_features  s   

z@LlavaOnevisionForConditionalGeneration._video_pixels_to_featuresc           
         s   |d }t |tjrG|j\}}}}}||| |||}| | j|  || jd  d | jd d d d f 	|ddtj
 fddS dd |D }	t
|}| | j| | jd d d d f  fddt|	t |	D S )Nr0   r   r$  r*  c                 S   s   g | ]}t |qS rB   )r   )r   r^   rB   rB   rC   rH  9  rU  zPLlavaOnevisionForConditionalGeneration._process_video_pixels.<locals>.<listcomp>c              	      s6   g | ]\}}t j|d | jd   dfd dqS )r   r$  r*  )r?   r7  reshaper.  )r   	num_frameembedsembeddings_flatr   rB   rC   rH  B  s    )r   r?   r@   r.  r1  r_  r  r`  r   r8  r7  ziprI  )
rY   rG  video_pixelstotal_videosframesrK  r5   r6   video_pixels_flatframes_per_videorB   rc  rC   _process_video_pixels%  s4   

z<LlavaOnevisionForConditionalGeneration._process_video_pixelsrG   r   stridec                 C   s   | j j}|j|j  }}|j\}}}||||d}|dddd}|jdd  \}}t|| t|| g}	t	j
j||	dd}
|
dddd}
|
|d|}
|
S )Nr$  r   r4   r   rG   r(  )sizer)  )r   r   r   r-  r.  r1  r2  ri   r   r   r5  r6  )rY   r   rl  r   ry   rx   batch_framesrR  r+  scaled_shapeimage_featurerB   rB   rC   r]  P  s   z4LlavaOnevisionForConditionalGeneration.apply_poolingc                 K   sn   | j di |}|sg S d}|D ]$}|| }|dkr%| |}|t|7 }|dkr4| |}|t|7 }q|S )NrB   r]   r^   )r  r\  r   rk  )rY   rZ   r  multimodal_embeddingsr   multimodal_inputimage_embeddingsvideo_embeddingsrB   rB   rC   embed_multimodala  s   

z7LlavaOnevisionForConditionalGeneration.embed_multimodal	input_ids	positionsintermediate_tensorsinputs_embedsc                 K   s$   |durd}| j j||||d}|S )zRun forward pass for LlaVA-Onevision.
        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a
                batch.
            pixel_values_videos: Pixels in each frames for each input videos.
        N)ry  )r  r  )rY   rv  rw  rx  ry  rZ   r   rB   rB   rC   r   x  s   z.LlavaOnevisionForConditionalGeneration.forwardr   c                 C   s   | j |S rU   )r  compute_logits)rY   r   rB   rB   rC   rz    s   z5LlavaOnevisionForConditionalGeneration.compute_logitsweightsc                 C   s   t | }|j|| jdS )N)mapper)r*   load_weightshf_to_vllm_mapper)rY   r{  loaderrB   rB   rC   r}    s   z3LlavaOnevisionForConditionalGeneration.load_weights)rG   )NN))r:   r;   r<   r+   r~  classmethodr   rR   r   r   r   r   rN   r  r/   r  r   r  r    r)   r?   r@   r  rF  rD   rA   rP  r\  r_  rk  r]  r!   ru  r   r   rz  r   r   setr}  r   rB   rB   r   rC   r     s     !



j
!

+

,r   )Qri   collections.abcr   r   r   typingr   r   r   r   r	   r?   torch.nnr   transformersr
   r   r   <transformers.models.llava_onevision.modeling_llava_onevisionr   r   vllm.configr   vllm.config.multimodalr   %vllm.model_executor.layers.activationr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   r   r   vllm.multimodal.processingr   r   vllm.sequencer   vllm.utils.tensor_schemar   r   clipr    
interfacesr!   r"   r#   llavar$   r%   
llava_nextr&   r'   r(   siglipr)   utilsr*   r+   r,   r-   r   r/   rD   rI   rN   r>   rO   rP   rS   r   r   Moduler   register_processorr   rB   rB   rB   rC   <module>   s^   


 

/ 