o
    iE                     @   st  d dl mZmZ d dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZmZ d dlmZmZmZ d d	lmZmZmZmZ d d
lmZ ddlmZ ddlmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' de(de(de)de)de*e(e(f f
ddZ+de(de(de*e(e(f dB de,e*e(e(f  fddZ-de(de(de,e*e(e(f  de(de)de*e(e(e(e*e(e(f f fddZ.d ejde,e*e(e(f  de(de)de*e,ej e*e(e(f f f
d!d"Z/d ejd#e(de(de(de)de*e(e(f dB de*ej0e*e(e(f f fd$d%Z1d ejd#e(de(de(de)d&e)dej0fd'd(Z2G d)d* d*e#Z3G d+d, d,e"Z4G d-d. d.e!e4 Z5ej6e5e4e d/G d0d1 d1e$Z7dS )2    )MappingSequenceN)Image)PretrainedConfig)QuantizationConfig)MULTIMODAL_REGISTRY)MultiModalKwargsItemsMultiModalUUIDDict)ImageEmbeddingItemsImageProcessorItemsMultiModalDataItems)MultiModalProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)TokenizerLike   )InternVisionModel)IMG_CONTEXTIMG_END	IMG_STARTBaseInternVLDummyInputsBuilderBaseInternVLMultiModalProcessorBaseInternVLProcessingInfoBaseInternVLProcessorInternVLChatModelbuild_transformfind_closest_aspect_ratioget_internvl_target_ratiosmin_dynamic_patchmax_dynamic_patchdynamic_image_sizeuse_thumbnailreturnc                 C   s4   |r| nd} |r
|nd}|r|dkr|d7 }| |fS )Nr    r   r    r!   r"   r$   r$   V/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/model_executor/models/h2ovl.pyresolve_h2ovl_min_max_num1   s
   r'   min_nummax_numprior_aspect_ratioc                   s(   t | |} d ur fdd|D }|S )Nc                    s8   g | ]} d  |d   d kr d |d  d kr|qS )r   r   r$   ).0ratior*   r$   r&   
<listcomp>K   s    z+get_h2ovl_target_ratios.<locals>.<listcomp>)r   )r(   r)   r*   target_ratiosr$   r-   r&   get_h2ovl_target_ratiosA   s   

r0   
orig_widthorig_heightr/   
image_sizec           
      C   sb   | | }t ||| ||d}||d  }||d  }|d |d  }	|r+|	dkr+|	d7 }	|	|||fS )N)widthheightr3   r   r   )r   )
r1   r2   r/   r3   r"   aspect_ratiotarget_aspect_ratiotarget_widthtarget_heightblocksr$   r$   r&   calculate_h2ovl_targetsV   s   	r;   imagec                C   s   | j \}}t||||dd\}}}}	| ||f}
g }t|D ].}|||  | |||  | |||  d | |||  d | f}|
|}|| qt||ksVJ |rjt|dkrj| ||f}|| ||	fS )NF)r1   r2   r/   r3   r"   r   )sizer;   resizerangecropappendlen)r<   r/   r3   r"   r1   r2   r:   r8   r9   r7   resized_imgprocessed_imagesibox	split_imgthumbnail_imgr$   r$   r&   dynamic_preprocess_h2ovlw   s8   



rI   
input_sizec          
         sL   t |||d}t|d t| |||d\}}t fdd|D }	|	|fS )Nr-   )rJ   )r3   r"   r/   c                    s   g | ]} |qS r$   r$   r+   r<   	transformr$   r&   r.      s    z%_preprocess_image.<locals>.<listcomp>)r0   r   rI   torchstack)
r<   rJ   r(   r)   r"   r*   r/   imagesr7   pixel_valuesr$   rL   r&   _preprocess_image   s   	

rR   use_msacc                C   s~   |r1t | |d|dd d\}}t | |d|d|d\}}	t|d d |d d |dd  gd}
|
S t | ||||d d\}
}	|
S )Nr   T)rJ   r(   r)   r"   r*      r   )rR   rN   cat)r<   rJ   r(   r)   r"   rS   pixel_values1aspect_ratio1pixel_values2_rQ   r$   r$   r&   image_to_pixel_values_h2ovl   s<   

	
	"
	r[   c                       s  e Zd ZddddddedededB dedB dedB dedB d	df fd
dZed	efddZ	dededB d	e
e fddZddddddedB dedB dedB dedB d	eeef f
ddZddddddddedB dedB dedB dedB deeef dB dedB d	eeeef  fddZdddedededB d	efddZ			d#d eej dedB dedB dedB d	eej f
d!d"Z  ZS )$H2OVLProcessorN)r   r    r!   rS   config	tokenizerr   r    r!   rS   r#   c                   s<   t  j|||||d |d u r|j}t|tsJ || _d S )N)r   r    r!   )super__init__rS   
isinstancebool)selfr]   r^   r   r    r!   rS   	__class__r$   r&   r`      s   

zH2OVLProcessor.__init__c                 C   s   | j  t S N)r^   	get_vocabr   )rc   r$   r$   r&   image_token_id	  s   zH2OVLProcessor.image_token_idfeature_sizenum_patchesc                 C   s    t | }t| t }t|t S rf   )r   r   r   r   select_text)rc   ri   rj   repl_features	repl_fullr$   r$   r&   get_image_repl  s   zH2OVLProcessor.get_image_replr%   r"   c                C   sX   |d u r| j n|}|d u r| jn|}|d u r| jn|}|d u r"| jn|}t||||dS )Nr%   )r   r    r!   r"   r'   )rc   r   r    r!   r"   r$   r$   r&   resolve_min_max_num  s   	z"H2OVLProcessor.resolve_min_max_num)r   r    r!   r"   r*   override_min_numr*   rp   c          	      C   s0   | j ||||d\}}|d ur|}t|||dS )Nr%   r-   )ro   r0   )	rc   r   r    r!   r"   r*   rp   r(   r)   r$   r$   r&   resolve_target_ratios3  s   

z$H2OVLProcessor.resolve_target_ratiosrS   image_widthimage_heightc                C   s   |d u r| j n|}| j}|r@| jddd}t||| j|dd\}}}}| jd|dd}	t||| j|	dd\}
}}}||
 d }n| jdd}t||| j||d\}}}}|| j S )	NFr   )r"   rp   T)r1   r2   r3   r/   r"   rT   )r"   r*   rp   )r"   )rS   r"   rq   r;   r3   num_image_token)rc   rs   rt   rS   r"   target_ratios_1num_patches_1rZ   aspect_ratio_1target_ratios_2num_patches_2rj   r/   r$   r$   r&   get_num_image_tokensL  sL   
z#H2OVLProcessor.get_num_image_tokensrP   c                    sD   t |dkr	jndj|||dd\  fdd|D S )Nr   Fr%   c              
      s$   g | ]}t |j jd qS ))rJ   r(   r)   r"   rS   )r[   r3   r"   rK   r)   r(   rc   rS   r$   r&   r.     s    	z>H2OVLProcessor._images_to_pixel_values_lst.<locals>.<listcomp>)rB   rS   ro   )rc   rP   r   r    r!   r$   r|   r&   _images_to_pixel_values_lst  s   
	z*H2OVLProcessor._images_to_pixel_values_lst)NNN)__name__
__module____qualname__r   r   intrb   r`   propertyrh   r   strrn   tuplero   listrq   r{   r   rN   Tensorr}   __classcell__r$   r$   rd   r&   r\      s    	


	

7r\   c                   @   sJ   e Zd ZdedefddZdddeded	edB d
edB def
ddZdS )H2OVLProcessingInfokwargsr#   c                 K   s"   | j jtf|  |  d|S )N)r]   r^   )ctxinit_processorr\   get_hf_configget_tokenizer)rc   r   r$   r$   r&   get_hf_processor  s   z$H2OVLProcessingInfo.get_hf_processorNrr   rs   rt   	processorrS   c                C   s    |d u r|   }|j|||dS )N)rs   rt   rS   )r   r{   )rc   rs   rt   r   rS   r$   r$   r&   r{     s   z(H2OVLProcessingInfo.get_num_image_tokens)	r~   r   r   objectr\   r   r   rb   r{   r$   r$   r$   r&   r     s    r   c                       s   e Zd Zdedeeef dedee	 fddZ
	ddeee B d	edeeef d
eeef dedB deee eef f fddZ  ZS )H2OVLMultiModalProcessormm_itemshf_processor_mm_kwargsout_mm_kwargsr#   c                    s   j jd	i | | }d|v r"|d ttjsJ  nd|v r0d gt|d  ng tdtf fdd}t	dd|dgS )
Nimage_num_patchesimage_embedsitem_idxc                    s~    dttf}t|tr|| }n|| }jj|j|j	 dkr&d ndd}|  }|d ur9t|t
s9J  ||S )Nr<   r   F)rs   rt   r   rS   )	get_itemsr
   r   ra   get_feature_sizeget_image_sizeinfor{   r4   r5   r   rn   )r   rP   ri   r3   rj   hf_processorr   r   
num_imagesrc   r$   r&   get_replacement_internvl  s    

zNH2OVLMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_internvlr<   z<image>)modalitytargetreplacementr$   )
r   r   get_datara   rN   r   tolistrB   r   r   )rc   r   r   r   out_mm_datar   r$   r   r&   _get_prompt_updates  s"   
z,H2OVLMultiModalProcessor._get_prompt_updatesNpromptmm_data_itemstokenization_kwargsmm_uuidsc                    s<   |j ddddkr| j|||||dS t j|||||dS )Nr<   F)strictr   )r   r   r   r   r   )	get_count_apply_hf_processorr_   _cached_apply_hf_processor)rc   r   r   r   r   r   rd   r$   r&   r     s   z3H2OVLMultiModalProcessor._cached_apply_hf_processorrf   )r~   r   r   r   r   r   r   r   r   r   r   r   r   r	   r   r   rb   r   r   r$   r$   rd   r&   r     s0    

:


r   )r   dummy_inputsc                   @   s*   e Zd ZdededB dedefddZdS )H2OVLChatModelr]   quant_configNis_monoprefixc                C   sJ   |s|j }|dk r|jj| d }n|d }t|j|||dS d}t|)Nr   r   )r   num_hidden_layers_overrider   z(Monolith mode is not applicable to H2OVL)select_layervision_confignum_hidden_layersr   NotImplementedError)rc   r]   r   r   r   vision_feature_layerr   msgr$   r$   r&   _init_vision_model  s   z!H2OVLChatModel._init_vision_model)r~   r   r   r   r   rb   r   r   r$   r$   r$   r&   r     s    r   )8collections.abcr   r   rN   PILr   transformersr   'vllm.model_executor.layers.quantizationr   vllm.multimodalr   vllm.multimodal.inputsr   r	   vllm.multimodal.parser
   r   r   $vllm.multimodal.processing.processorr   r   r   r   vllm.tokenizersr   
intern_vitr   internvlr   r   r   r   r   r   r   r   r   r   r   r   rb   r   r'   r   r0   r;   rI   r   rR   r[   r\   r   r   register_processorr   r$   r$   r$   r&   <module>   s   
4



!
.

/ .S