o
    
۾i͜                     @   s  U d dl Z d dlmZ d dlmZmZmZ d dlmZ d dl	m
Z
 d dlmZmZ d dlZd dlZd dlmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZmZmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z)m*Z*m+Z+ d dl,m-Z-m.Z. d dl/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5 d dl6m7Z7 d dl8m9Z9m:Z: ddl;m<Z< ddl=m>Z>m?Z?m@Z@ ddlAmBZB ddlCmDZDmEZEmFZFmGZG ddlHmIZI dZJdZKeLeMd< dZNeLeMd< 	 dcd!eOd"ePeOeOf d#eOfd$d%ZQG d&d' d'e9ZReRZSG d(d) d)e9ZTeTZUG d*d+ d+e2ZVG d,d- d-e0eV ZWG d.d/ d/e1eV ZXd0e3d#eVfd1d2ZYdd3d4eVd5e0eV d6e'dB d#e1fd7d8ZZddd9d:d;e#dB d<eOdB d=e[dB d>eLd#e<eBB f
d?d@Z\G dAdB dBej]Z^G dCdD dDej]Z_e%j`eZeYeWdEG dFdG dGej]e?e@ZadHejbdIePeOeOf d#ejbfdJdKZcdIePdLedd#ePfdMdNZedOePeOeOf dPeLedePeOeOf  B dQeOd#ePeOeOf fdRdSZfdTejbdUeOdVeOdOePeOeOf dLedePeOeOf  dWeOdXe[dYejbd#ejbfdZd[Zg	\	]ddd^edejb d_ededeO  dLedePeOeOf  dQeOdWeOdYejbd`eOdXe[d#edejb fdadbZhdS )e    N)defaultdict)IterableMappingSequence)partial)
accumulate)	AnnotatedLiteral)	rearrange)	LayerNormLayerNorm2d)RegStage)BatchFeatureCLIPVisionConfigSiglipVisionConfig)
VllmConfig)BaseDummyOptions)QuantizationConfig)MULTIMODAL_REGISTRY)BaseMultiModalProcessorCache)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoInputProcessingContextPromptReplacementPromptUpdate)IntermediateTensors)TensorSchemaTensorShape   )CLIPVisionModel)MultiModalEmbeddingsSupportsMultiModal
SupportsPP)SiglipVisionModel)AutoWeightsLoader
flatten_bninit_vllm_registered_modelmaybe_prefix)get_vision_encoder_infoz<|endofturn|>z
<|dummy3|>IMAGE_TOKENz<|_unuse_missing_100270|>VIDEO_TOKEN   r2   
num_framesmax_grid_shapereturnc                 C   s,   |d |d  }| | }| | }||dk S )Nr   r$    )r3   r4   max_num_gridsnum_canvasesleftover_framesr6   r6   a/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/hyperclovax_vision.pyget_num_combined_frames;   s   r;   c                
   @   sb   e Zd ZU dZdZed ed< eee	j
 eddddddhd	f ed
< ee	j
eddf ed< dS )HCXVisionImagePixelInputsz
    Dimensions:
        - n: Number of images
        - g: Number of grids
        - c: Number of channels (3)
        - h: Height
        - w: Width
    pixel_valuestypengr2   hwdynamic_dimspixel_values_images   image_sizes_imagesN__name__
__module____qualname____doc__r>   r	   __annotations__r   listtorchTensorr#   r6   r6   r6   r:   r<   H   s   
 	r<   c                   @   sR   e Zd ZU dZdZed ed< eeee	j
  edddddd	ddhd
f ed< dS )HCXVisionVideoPixelInputsz
    Dimensions:
        - n: Number of videos
        - f: Number of frames
        - g: Number of grids
        - c: Number of channels (3)
        - h: Height
        - w: Width
    pixel_values_videosr>   r?   fr@   r2   rA   rB   rC   NrH   r6   r6   r6   r:   rQ   \   s   
 
rQ   c                   @   s~   e Zd Zdd ZdeeedB f fddZdeee B defdd	Z	deee B defd
dZ
defddZdefddZdS )HCXVisionProcessingInfoc                 C   s   t |  S N)r.   get_hf_configselfr6   r6   r:   r.   r   s   z/HCXVisionProcessingInfo.get_vision_encoder_infor5   Nc                 C   s
   d d dS )Nimagevideor6   rW   r6   r6   r:   get_supported_mm_limitsu   s   
z/HCXVisionProcessingInfo.get_supported_mm_limitsvision_query_lengthc                C      t |tr|S t|S rU   
isinstanceintsumrX   r]   r6   r6   r:   get_num_image_tokensx      
z,HCXVisionProcessingInfo.get_num_image_tokensc                C   r^   rU   r_   rc   r6   r6   r:   get_num_video_tokens   re   z,HCXVisionProcessingInfo.get_num_video_tokensc                 C   s    |   }|  }}t||dS )N)widthheight)r.   get_image_sizer   )rX   vision_encoder_inforg   rh   r6   r6   r:   !get_image_size_with_most_features   s   z9HCXVisionProcessingInfo.get_image_size_with_most_featuresc                 C   s   |   \}}| j||dS )N)image_widthimage_height)rk   rd   )rX   target_widthtarget_heightr6   r6   r:   get_max_image_tokens   s
   z,HCXVisionProcessingInfo.get_max_image_tokens)rI   rJ   rK   r.   r   strra   r\   rN   rd   rf   r   rk   rp   r6   r6   r6   r:   rT   q   s    





rT   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )HCXVisionDummyInputsBuilder	mm_countsr5   c                 C   s$   t |dd t|dd  }|S )NrZ   r   r[   )r/   getr0   )rX   rs   
dummy_textr6   r6   r:   get_dummy_text   s   z*HCXVisionDummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc                 C   s~   | dd}| dd}| j \}}d}|r| dnd }	|r%| dnd }
| j||||	d| j|d |d |||
ddS )	NrZ   r   r[       )rg   rh   
num_images	overridesr$   )rg   rh   r3   
num_videosr{   rY   )rt   infork   _get_dummy_images_get_dummy_videos)rX   rw   rs   rx   rz   r|   rn   ro   target_num_framesimage_overridesvideo_overridesr6   r6   r:   get_dummy_mm_data   s(   z-HCXVisionDummyInputsBuilder.get_dummy_mm_datarU   )
rI   rJ   rK   r   rq   ra   rv   r   r   r   r6   r6   r6   r:   rr      s     


rr   c                
   @   s   e Zd Zdedeeef deeef deeef def
ddZded	ed
eeef deeef de	f
ddZ
d	ed
eeef dedee fddZded
eeef deeef fddZdS )HCXVisionMultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsr5   c                    s  t |dg D ]\}}|jtjkr|tj|d |< q| jjj| jj	di |t
|d d dd}t|dkr|d}|d}	| jjj| jj	di |t
d |d u rVd n|g|	d u r^d n|	gdd D ]\}
}t|trt|dkrt|dksJ |d |
< qi|rtd d< td d< |	rdgtd	d
 |	D   fddtt|	D d<  fddtt|	D d< | |S )Nvideos)textimagesr   )hf_processordatar   r   r$   rG   vision_query_lengths_imagesc                 s   s    | ]	}t t|V  qd S rU   )r;   len.0r[   r6   r6   r:   	<genexpr>   s    
zBHCXVisionMultiModalProcessor._call_hf_processor.<locals>.<genexpr>c                    s(   g | ]}d   |  |d   qS )rR   r$   r6   r   i_idx_per_video_processed_outputsr6   r:   
<listcomp>   s    zCHCXVisionMultiModalProcessor._call_hf_processor.<locals>.<listcomp>rR   c              	      s.   g | ]}t d   |  |d   qS )vision_query_lengths_videosr$   )rO   tensorr   r   r6   r:   r     s    r   r6   )	enumeratert   dtypenpuint8astyper}   ctxcall_hf_processorget_hf_processordictr   itemsr`   rN   rO   r   r   rangeupdate)rX   r   r   r   r   	video_idx	video_arrprocessed_outputsr   r   kvr6   r   r:   _call_hf_processor   sd   	

	




	z/HCXVisionMultiModalProcessor._call_hf_processorprompt_textmm_itemshf_processor_mm_kwargstokenization_kwargsc                 C   s   dS )NFr6   )rX   r   r   r   r   r6   r6   r:   _hf_processor_applies_updates  s   z:HCXVisionMultiModalProcessor._hf_processor_applies_updatesout_mm_kwargsc                    sJ   j  }|j|jddtdtdtffdd  fdddD S )	NrY   item_idxmodalityr   c                    sn   || |  }|dkr|d j  }jj|d}n|dkr,|d j  }jj|d}nt| | g| S )NrZ   r   )r]   r[   r   )r   tolistr}   rd   rf   NotImplementedError)r   r   r   out_itemlens
num_tokens)placeholderrX   r6   r:   get_replacement_hyperclovax#  s   zUHCXVisionMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_hyperclovaxc              
      s*   g | ]}t || gt |d dqS ))r   r   )r   targetreplacement)r   r   )r   r   )r   r   r   r6   r:   r   5  s    zDHCXVisionMultiModalProcessor._get_prompt_updates.<locals>.<listcomp>)r}   rV   image_token_idvideo_token_idra   rq   r   )rX   r   r   r   	hf_configr6   )r   r   r   rX   r:   _get_prompt_updates  s   
z0HCXVisionMultiModalProcessor._get_prompt_updates	hf_inputsc                 C   s0   t tdtdtdtdtddS )NrZ   r[   )rE   rG   r   rR   r   )r   r   batched)rX   r   r   r6   r6   r:   _get_mm_fields_configD  s   z2HCXVisionMultiModalProcessor._get_mm_fields_configN)rI   rJ   rK   rq   r   objectr   r   r   boolr   r   r   r    r   r   r   r6   r6   r6   r:   r      sN    



I


	

-

r   r   c                 C   s   t | S rU   )rT   )r   r6   r6   r:   _build_hcxvision_hf_infoR  s   r   cacher}   dummy_inputsr   c                C   s$   t | trt| ||dS tt| )Nr   )r`   rT   r   r   r>   )r}   r   r   r6   r6   r:   _build_hcxvision_hf_processorX  s   
r    )use_nth_layerrequire_post_normprefixquant_configr   r   r   c                C   s~   | j }t|ts	n|dkr|d }n|| d }t| tr&t| ||||dS t| tr4t| ||||dS dt|  }t|)Nr   r$   )r   num_hidden_layers_overrider   r   zUnsupported vision config: )	num_hidden_layersr`   ra   r   r%   r   r)   r>   r   )vision_configr   r   r   r   r   msgr6   r6   r:   init_vision_tower_for_hcxvisionh  s0   



r   c                       s.   e Zd Zddejf fdd	Zdd Z  ZS )HCXVisionMlpNc                    s   t    |p|}|p|}|| _| jdkr)t||| _| | _t||| _d S | jdkrFt|d| | _| | _td| || _d S td	| j)Nmlpinverted_mlprF   z{} is not implemented)
super__init__mm_projector_typennLinearfc1actfc2r   format)rX   r   in_featureshidden_featuresout_features	act_layer	__class__r6   r:   r     s   



zHCXVisionMlp.__init__c                 C   s"   |  |}| |}| |}|S rU   )r   r   r   )rX   xr6   r6   r:   forward  s   


zHCXVisionMlp.forward)rI   rJ   rK   r   GELUr   r   __classcell__r6   r6   r   r:   r     s    r   c                       s.  e Zd ZdZ		d!dededededed	ed
ef fddZ		d"dejde	e	e  dB de	e dB dejfddZ
		d"dejde	e	e  dB de	e dB dejfddZ		d"dejde	e	e  dB de	e dB de	ej fddZ		d#dedededededefddZdededefdd Z  ZS )$HCXVisionCAbstractorz
    This module is based on C-Abstractor, whose license is under apache-2.0.
    You can check the original code at
    https://github.com/khanrc/honeybee/blob/main/honeybee/projectors/projectors.py
    and we made necessary modifications.
    TFnum_queriesnum_input_tokensencoder_hidden_sizehidden_sizeoutput_hidden_sizepos_embprenormc                    s   t    || _|| _|r#tjtd||| _| jj	j
ddd nd | _|r.t|| _nd | _| |||| t|  j| _d S )Nr$   g        g{Gz?)meanstd)r   r   r   r   rO   r   	Parameterzerosr   r   normal_r   r   	build_netnext
parametersr   )rX   r   r   r   r   r   r   r   r   r6   r:   r     s    

zHCXVisionCAbstractor.__init__Nr   num_queries_vis_abstractors	num_gridsr5   c                 C   s<   | j d ur
|  |}| jd ur|| j }| j|||d}|S )N)r   r   )r   r   _forward)rX   r   r   r   r6   r6   r:   r     s   



zHCXVisionCAbstractor.forwardc                 C   sl   |j \}}}t|d }t|d||d}|d ur%|d usJ | |||S | |}t|d}| |}|S )N      ?zb (h w) d -> b d h w)rA   rB   b d h w -> b (h w) d)shapera   r
   _forward_adaptive_num_querynetreadout)rX   r   r   r   BLdimhwr6   r6   r:   r     s   


zHCXVisionCAbstractor._forwardc           
      C   s   t | jdks	J | jd |}g }t|D ]9\}}t|d }t||f}|||| ||d  d d f }	| jd |	}	t|	d}	| |	}	||	 q|S )Nr2   r   r   r$   rF   r   )	r   r  r   ra   r   AdaptiveAvgPool2dr
   r  append)
rX   r   r   r   new_xr   r   r  sampleroutr6   r6   r:   r    s   $

z0HCXVisionCAbstractor._forward_adaptive_num_queryr2   rF   	n_queriesdepth	mlp_depthc                 C   s   |d   sJ d| t|d }ttddtjtd}||||}	t||f}
||||}t|	|
|| _	| 
|||| _d S )Nr   z,n_queries must be square number. n_queries: r$   )stridedilationr   
norm_layer)
is_integerra   r   r   r   SiLUr   r	  
Sequentialr  	build_mlpr  )rX   r  r   r   r   r  r  r  RegBlocks1r  s2r6   r6   r:   r     s0   	zHCXVisionCAbstractor.build_netc                 C   sH   t ||g}td|D ]}|t   |t || qt j| S )Nr$   )r   r   r   r
  r  r  )rX   r  r   r   layers_r6   r6   r:   r  ?  s
   
zHCXVisionCAbstractor.build_mlp)TFNN)r2   rF   )rI   rJ   rK   rL   ra   r   r   rO   rP   rN   r   r   r  r   r  r   r6   r6   r   r:   r     s    %






&r   )r}   r   c                       s  e Zd Zg dddgdZdddeded	d
f fddZededed	ed
B fddZ	de
d	ed
B fddZde
d	ed
B fddZded	eejdf fddZded	eejdf fddZde
d	efddZde
d	efdd Z	
	
d:d!ejd
B d"ejd#ed
B d$ejd
B de
d	ejeB fd%d&Zd'eej d(ejd	eejdf fd)d*Zd+eeej  d	eejdf fd,d-Zde
fd.d/Zd0ejd	ejd
B fd1d2Zd3eeeejf  d	e e fd4d5Z!d6d7 Z"d8d9 Z#  Z$S );HCXVisionForCausalLM)q_projk_projv_proj	gate_projup_proj)qkv_projgate_up_projr   )r   vllm_configr   r5   Nc             	      sJ  t    |jj}|j}|j}|jdv rd|_|jdkrd|_|j	}i |_
|j|_|j|_|jj| _| |||_| |ddh1 t||t|ddd	t|d
d| _| |||| _|jrkttj|j| jd| _W d    n1 suw   Y  | | t||t|dd| _W d    n1 sw   Y  || _ || _	|| _d S )N)gpt2hyperclovaxllamasdpar(  g      ?rZ   r[   r   Fvision_model)r   r   r   r   r   language_model)r&  r   r   )!r   r   model_configr   r   text_config
model_type_attn_implementationlogits_scalingr   auto_mapanyresr7   r   _init_possible_resolutionspossible_resolutions_mark_tower_modelr   getattrr-   r,  _init_mm_projectormm_projectorr   r   rO   emptyr   image_newline_mark_language_modelr,   r.  config)rX   r&  r   r?  r   r0  r   r   r6   r:   r   W  sV   






zHCXVisionForCausalLM.__init__r   r   c                 C   s$   | drtS | drtS td)NrZ   r[   z)Only image or video modality is supported)
startswithr/   r0   
ValueError)clsr   r   r6   r6   r:   get_placeholder_str  s
   

z(HCXVisionForCausalLM.get_placeholder_strkwargsc                 K   s.   | dd }|d u rd S | d}t||dS NrE   rG   )rE   rG   )popr<   )rX   rD  rE   rG   r6   r6   r:   _parse_and_validate_image_input  s   
z4HCXVisionForCausalLM._parse_and_validate_image_inputc                 K   s"   | dd }|d u rd S t|dS NrR   )rR   )rF  rQ   )rX   rD  rR   r6   r6   r:   _parse_and_validate_video_input  s   z4HCXVisionForCausalLM._parse_and_validate_video_inputimage_input.c                 C   s   | j |d |d dS rE  )forward_images)rX   rJ  r6   r6   r:   _process_image_input  s   z)HCXVisionForCausalLM._process_image_inputvideo_inputc                 C   s   | j |d dS rH  )forward_videos)rX   rM  r6   r6   r:   _process_video_input  s   z)HCXVisionForCausalLM._process_video_inputc                 K   sZ   i }|D ]&}|dkrd|vr| j di ||d< |dkr*d|vr*| jdi ||d< q|S )NrE   r   rR   r   r6   )rG  rI  )rX   rD  
modalities	input_keyr6   r6   r:   %_parse_and_validate_multimodal_inputs  s   z:HCXVisionForCausalLM._parse_and_validate_multimodal_inputsc           	      K   sv   | j di |}|sg S d}|D ](}|dkr%|d }| |}|t|7 }|dkr8|d }| |}|t|7 }q|S )Nr6   r   r   )rR  rL  tuplerO  )	rX   rD  rP  multimodal_embeddingsr   rJ  image_embeddingsrM  video_embeddingsr6   r6   r:   embed_multimodal  s   

z%HCXVisionForCausalLM.embed_multimodal	input_ids	positionsintermediate_tensorsinputs_embedsc                 K   s$   |d urd }| j j||||d}|S )N)r[  )r.  model)rX   rX  rY  rZ  r[  rD  hidden_statesr6   r6   r:   r     s   zHCXVisionForCausalLM.forwardrE   rG   c              
   C   s   t |dd}d| jjv rdnd}| |d d |d f }|j| jjd}| |}dd |D }tj||dd	}t	||
 | jj| jj| jj| jj| j| jjd
}t|S )NTconcatsiglipr   r$   r-  c                 S      g | ]}t |qS r6   r   )r   itemr6   r6   r:   r         z7HCXVisionForCausalLM.forward_images.<locals>.<listcomp>r  )image_forward_outsimage_sizesnum_queries_vis_abstractorunpad
patch_size	grid_sizer=  r7  )r+   r   r1  r,  tor;  r   rO   splitanyres_postprocessingr   r?   num_queries_vis_abstractor_imageri  rj  
image_sizer=  r7  rS  )rX   rE   rG   pixel_values_image_flatvisual_token_idxrf  split_sizesimage_featuresr6   r6   r:   rK    s(   
z#HCXVisionForCausalLM.forward_imagesrR   c                    s  t dd |D dd}d| jjv rdnd}| |d d |d f }|j| jjd}d}|g}g }|jd }| jj	r|dks@J |d	krU|
| jj ||7 }|
| ng|
| jj |d7 }|
| |
| jj ||d	 7 }|
| |
| jj |d7 }|
| n4|D ]1}	|	D ],}
t|
dkr|
| jj |d7 }|
| |
| jj |t|
 d }|
| qq| |||}g g }d}d}d
d |D }|D ]<}|t|7 }|
|dd || }||kr
tj|dd g }|d7 }d}q||k rtd|d|qt|dksJ d| t|tks*J dd |D }dgt| t fddtt|D S )Nc                 S   s   g | ]	}|D ]}|qqS r6   r6   r   framesframer6   r6   r:   r   !  s    z7HCXVisionForCausalLM.forward_videos.<locals>.<listcomp>Tr^  r`  r   r$   r-  rF   c                 S   s   g | ]}|D ]}t |qqS r6   rb  ru  r6   r6   r:   r   h  s
    re  zvideo_group_size=z < target_group_size=ztarget_features is not empty!! c                 S   ra  r6   rb  r   r6   r6   r:   r     rd  c                 3   s.    | ]}t  |  |d    V  qdS )r$   N)rO   catr   idxs_per_videovideo_featuresr6   r:   r     s
    
z6HCXVisionForCausalLM.forward_videos.<locals>.<genexpr>)r+   r   r1  r,  rl  r;  r   r  r?  first_last_frames_slowr
  %num_queries_vis_abstractor_video_slow%num_queries_vis_abstractor_video_fastr   flattenrO   rx  RuntimeErrorr   rS  r   )rX   rR   pixel_values_videos_flatrr  video_forward_outsgrid_idxr   r   len_total_framespixel_values_framespixel_values_frametarget_featurestarget_group_sizegroup_countervideo_groupsforward_outvideo_group_sizefeats_per_videor6   ry  r:   rN    s   







z#HCXVisionForCausalLM.forward_videosc           	      K   s  t t}| D ]\}}t|dk st|d dk rq|d}}|ds+|ds+n|dd d |dd }}d|}|dk}t|D ]q\}}|d	vrxt|| |d k rc|| t  |	 
   }|| |  |7  < qIt|tjrt|| |d k r|| t  |d
 t  ttj|dd}|| |  |7  < |d
 |  |gt| 7  < qIqt|S )Nr$   r   F_images_videosr  r+  r   )r=   	is_videosre  )r   rN   r   r   endswithrm  joinr   r
  detachcpunumpyr   r`   rO   rP   unbindr   )	rX   rD  outputr   r   new_kis_video_sample_idx_vr6   r6   r:   _prepare_multimodal_kwargs  s<   
"
z/HCXVisionForCausalLM._prepare_multimodal_kwargsr]  c                 C   s   | j |S rU   )r.  compute_logits)rX   r]  r6   r6   r:   r    s   z#HCXVisionForCausalLM.compute_logitsweightsc                 C   s   t | }||S rU   )r*   load_weights)rX   r  loaderr6   r6   r:   r    s   
z!HCXVisionForCausalLM.load_weightsc                    s   t |dg sMg }|jrK|jdksJ td|jd D ]'}td|jd D ]}|dkr2|dkr2|js2q$|| |jkr@|||g q$q fdd|D }|S |jS )Nr7  r   r$   c                    s$   g | ]\}}| j  | j  gqS r6   )rp  )r   ysxsr   r6   r:   r     s    zCHCXVisionForCausalLM._init_possible_resolutions.<locals>.<listcomp>)r9  r5  r7   r   use_1x1_gridr
  r7  )rX   r?  r   r7  r   jr6   r  r:   r6    s"   
z/HCXVisionForCausalLM._init_possible_resolutionsc              	   C   s   |j }|jdkrt||j }t| j|_|S |jdkr5t|j|j	|j
 d |||j |j|jd}|S t|j||| jj d}|S )NlinearcabstractorrF   )r   r   r   r   r   r   r   )r   r   )r   r   r   r   r   r   r   r   ro  rp  rj  proj_pos_embproj_prenormr   r0  )rX   r?  r0  r   input_hidden_sizer;  r6   r6   r:   r:    s2   


z'HCXVisionForCausalLM._init_mm_projectorr  )%rI   rJ   rK   packed_modules_mappingr   rq   r   classmethodra   rC  r   HCXVisionImageInputsrG  HCXVisionVideoInputsrI  rS  rO   rP   rL  rO  r   rR  r&   rW  r!   r   rN   rK  rN  r  r  r   setr  r6  r:  r   r6   r6   r   r:   r  L  s     7


	



 
j

r  r   original_sizec                 C   s   |\}}| j dd  \}}|| }|| }||kr:|| }t|| }	||	 d }
| d d |
||
 d d f }|S || }t|| }|| d }
| d d d d |
||
 f }|S )Nr$   rF   )r  ra   )r   r  original_widthoriginal_heightcurrent_heightcurrent_widthoriginal_aspect_ratiocurrent_aspect_ratioscale_factor
new_heightpaddingunpadded_tensor	new_widthr6   r6   r:   unpad_image  s   r  r7  c                 C   s   | \}}d }d}t d}|D ]=\}}t|| || }	t||	 t||	 }
}t|
| || }|| | }||ksC||krK||k rK|}|}||f}q|S )Nr   inf)floatminra   )r  r7  r  r  best_fitmax_effective_resolutionmin_wasted_resolutionrh   rg   scaledownscaled_widthdownscaled_heighteffective_resolutionwasted_resolutionr6   r6   r:   select_best_resolution  s*   

r  rp  grid_pinpointsrj  c                 C   sB   t |tr|nt|}| \}}t||f|\}}|| || fS rU   )r`   rN   astliteral_evalr  )rp  r  rj  r7  r  r  rh   rg   r6   r6   r:   get_anyres_image_grid_shape  s   r  image_featurerh   rg   rk  ri  r=  c                 C   s0  | d }| dd  } || |j d ks#J d|d|d|j d t|||\}	}
| |
|	||d} |r|| ddddd	 } | dddd	} t| |} tj| |d d d d f j	g | j d d dR  
| jfdd
} | dddd} n| dddd	d } | dd	} tj|| fdd
} | S )Nr   r$   zheight=z	 * width=z  != base_image_feature.shape[0]=r+     rF   r2   re  )r  r  viewpermute
contiguousr  r  rO   rx  expandrl  device	transpose)r  rh   rg   rp  r7  rk  ri  r=  base_image_featurenum_patch_widthnum_patch_heightr6   r6   r:    reshape_and_unpad_image_features,  sB   


	r  r+  Frf  rg  rh  c                 C   s   ||  }}	|dkr|d   sJ dt|d  }}	g }
t| D ]2\}}|jd dkr<t|||	|| ||||d}n|d }tj||d  |jfdd}|
	| q"|
S )Nr   r   zn_queries must be square numberr$   )r  rh   rg   rp  r7  rk  ri  r=  re  )
r  ra   r   r  r  rO   rx  rl  r  r
  )rf  rg  r7  rj  rk  r=  rh  ri  rh   rg   new_image_features	image_idxr  r6   r6   r:   rn  Z  s2   
rn  )r1   )r+  F)ir  collectionsr   collections.abcr   r   r   	functoolsr   	itertoolsr   typingr   r	   r  r   rO   torch.nnr   einopsr
   timm.layersr   r   timm.models.regnetr   transformersr   r   r   vllm.configr   vllm.config.multimodalr   'vllm.model_executor.layers.quantizationr   vllm.multimodalr   vllm.multimodal.cacher   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   vllm.multimodal.processingr   r   r   r   r   r    vllm.sequencer!   vllm.utils.tensor_schemar"   r#   clipr%   
interfacesr&   r'   r(   r`  r)   utilsr*   r+   r,   r-   visionr.   EOTr/   rq   rM   r0   ra   rS  r;   r<   r  rQ   r  rT   rr   r   r   r   r   r   Moduler   r   register_processorr  rP   r  rN   r  r  r  rn  r6   r6   r6   r:   <module>   s  
 

)* 



%!    "



	
5
	