o
    i#                     @   s  U d dl mZmZ d dlmZmZmZ d dlmZm	Z	m
Z
mZmZ d dlmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZmZmZ d dlmZ d dlmZ d d	lm Z  d d
l!m"Z" d dl#m$Z$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z-m.Z.m/Z/ d dl0m1Z1m2Z2m3Z3m4Z4 d dl5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z; d dl<m=Z= d dl>m?Z? d dl@mAZAmBZB ddlCmDZDmEZEmFZFmGZG ddlHmIZImJZJmKZK dZLdZMdZNdZOdZPG dd deAZQG dd  d eAZReQeRB ZSeeTd!< G d"d# d#eAZUG d$d% d%eAZVeUeVB ZWeeTd&< d'eXfd(d)ZYd*eZd+e[e\eXeXf  d,eXd-eXd.eXd/e\eXeXf fd0d1Z]d2eXd3eXd4e^d5e^d/e\eXeXf f
d6d7Z_d8eXd9eXd/e[e\eXeXf  fd:d;Z`d<eXd=eXd+e[e\eXeXf  d.eXd5e^d/e\eXeXeXf fd>d?Zad@ejd+e[e\eXeXf  d.eXd5e^d/e[ej f
dAdBZbd@ejd'eXd8eXd9eXd5e^d/ejcfdCdDZddEejed'eXd8eXd9eXd5e^d/ejcfdFdGZfG dHdI dIeZgG dJdK dKegZhG dLdM dMe8ZiedNeidOZjG dPdQ dQe6ej ZkG dRdS dSe7ej ZlG dTdU dUeiZmG dVdW dWekem ZnG dXdY dYelem Zoe)jpeoemendZG d[d\ d\ejqeFeGeEZrdS )]    )ABCabstractmethod)IterableMappingSequence)	AnnotatedAnyLiteral	TypeAliasTypeVarN)Image)BatchFeaturePretrainedConfig
TensorType)
VllmConfig)BaseDummyOptions)QuantizationConfig)	AWQConfig)InternVisionModelInternVisionPatchModel)MultiModelKeys)MULTIMODAL_REGISTRYconvert_image_mode)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)ImageEmbeddingItemsImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)TokenizerLike)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPP)AutoWeightsLoaderinit_vllm_registered_modelmaybe_prefixz<img>z</img>z<IMG_CONTEXT>)g
ףp=
?gv/?gCl?)gZd;O?gy&1?g?c                   @   P   e Zd ZU dZed ed< eeje	ddddf ed< eeje	d	f ed
< dS )InternVLImagePixelInputsz
    Dimensions:
        - bn: Batch size * number of images
        - bnp: Batch size * number of images * (1 + num_patches)
        - c: Number of channels (3)
        - h: Height of each image patch
        - w: Width of each image patch
    pixel_valuestypebnp   hwpixel_values_flatbnnum_patchesN
__name__
__module____qualname____doc__r	   __annotations__r   torchTensorr*    rF   rF   Y/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/model_executor/models/internvl.pyr4   G   
   
 	r4   c                   @   B   e Zd ZU dZed ed< eeje	ej B e
dddf ed< dS )	InternVLImageEmbeddingInputsz
    Dimensions:
        - n: Number of images
        - f: Total image feature size
        - h: Hidden size (must match the hidden size of language model backbone)
    image_embedsr6   nfr9   dataNr?   r@   rA   rB   r	   rC   r   rD   rE   listr*   rF   rF   rF   rG   rJ   V      
 (rJ   InternVLImageInputsc                   @   r3   )InternVLVideoPixelInputsz
    Dimensions:
        - bvf: Batch size * number of videos * num_frames
        - bn: Batch size * number of images
        - c: Number of channels (3)
        - h: Height of each video frame
        - w: Width of each video frame
    pixel_values_videosr6   bvfr8   r9   r:   r;   r<   r=   Nr>   rF   rF   rF   rG   rS   e   rH   rS   c                   @   rI   )	InternVLVideoEmbeddingInputsz
    Dimensions:
        - n: Number of videos
        - f: Total video feature size
        - h: Hidden size (must match the hidden size of language model backbone)
    video_embedsr6   rL   rM   r9   rN   NrO   rF   rF   rF   rG   rV   t   rQ   rV   InternVLVideoInputs
input_sizec              	   C   sJ   t t}}ttdd tj| | ftjjdt tj	||dg}|S )Nc                 S   s
   t | dS )NRGBr   )imgrF   rF   rG   <lambda>   s   
 z!build_transform.<locals>.<lambda>)interpolation)meanstd)
IMAGENET_MEANIMAGENET_STDTComposeLambdaResizeInterpolationModeBICUBICToTensor	Normalize)rY   MEANSTD	transformrF   rF   rG   build_transform   s   

rm   aspect_ratiotarget_ratioswidthheight
image_sizereturnc                C   s|   t d}d}|| }|D ]/}|d |d  }	t| |	 }
|
|k r%|
}|}q|
|kr;|d| | |d  |d  kr;|}q|S )Ninf)r+   r+   r   r+         ?)floatabs)rn   ro   rp   rq   rr   best_ratio_diff
best_ratioarearatiotarget_aspect_ratio
ratio_diffrF   rF   rG   find_closest_aspect_ratio   s    r~   min_dynamic_patchmax_dynamic_patchdynamic_image_sizeuse_thumbnailc                 C   s4   |r| nd} |r
|nd}|r|dkr|d7 }| |fS )Nr+   rF   r   r   r   r   rF   rF   rG   resolve_internvl_min_max_num   s
   r   min_nummax_numc                    s.    fddt  d D }t|dd dS )Nc                    sX   h | ](}t d |d  D ]}t d |d  D ]}||   kr" krn n||fqqqS )r+   )range).0rL   ijr   r   rF   rG   	<setcomp>   s    
z-get_internvl_target_ratios.<locals>.<setcomp>r+   c                 S   s   | d | d  S Nr   r+   rF   )xrF   rF   rG   r\      s    z,get_internvl_target_ratios.<locals>.<lambda>)key)r   sorted)r   r   ro   rF   r   rG   get_internvl_target_ratios   s   r   
orig_widthorig_heightc           
      C   s`   | | }t ||| ||d}||d  }||d  }|d |d  }	|r+|	dkr+|	d7 }	|	||fS )N)rp   rq   rr   r   r+   )r~   )
r   r   ro   rr   r   rn   r|   target_widthtarget_heightblocksrF   rF   rG   calculate_internvl_targets   s   	
r   imagec                C   s   | j \}}t||||dd\}}}| ||f}	g }
t|D ].}|||  | |||  | |||  d | |||  d | f}|	|}|
| qt|
|ksUJ |rit|
dkri| ||f}|
| |
S )NF)r   r   ro   rr   r   r+   )sizer   resizer   cropappendlen)r   ro   rr   r   r   r   r   r   r   resized_imgprocessed_imagesr   box	split_imgthumbnail_imgrF   rF   rG   dynamic_preprocess_internvl   s.   
	

r   c                   s@   t ||}t|d t| |||d}t fdd|D }|S )NrY   ro   rr   r   c                       g | ]} |qS rF   rF   r   r   rl   rF   rG   
<listcomp>%      z2image_to_pixel_values_internvl.<locals>.<listcomp>)r   rm   r   rD   stack)r   rY   r   r   r   ro   imagesr5   rF   r   rG   image_to_pixel_values_internvl  s   

r   videoc          
         sz   t ||}t|d ttj  }| D ]}ttj|dd|||d}t|dks)J || qt	 fdd|D }	|	S )Nr   rZ   )moder   r+   c                    r   rF   rF   r   r   rF   rG   r   @  r   z2video_to_pixel_values_internvl.<locals>.<listcomp>)
r   rm   rP   r   r   	fromarrayr   extendrD   r   )
r   rY   r   r   r   ro   frames_listframe	pil_framer5   rF   r   rG   video_to_pixel_values_internvl*  s   

r   c                       s:  e Zd ZdZdddddedededB dedB dedB d	df fd
dZe	e
d	efddZe
dededB d	ee fddZddddddedB dedB dedB dedB d	eeef f
ddZddddddedB dedB dedB dedB d	eeeef  f
ddZdeded	efddZ			d(deej dedB dedB dedB d	eej f
ddZ			d(dee deej dedB dedB dedB d	eee eeejf f fd d!Zd)d"eee B dB fd#d$Z						d*deee B dB dejeej B dB dedB dedB dedB d%eeB dB d	efd&d'Z  ZS )+BaseInternVLProcessorz
    This model doesn't define its own HF processor,
    so we implement our own one here.

    The code to insert image tokens is based on:
    https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252
    Nr   r   r   config	tokenizerr   r   r   rs   c                   s   t    || _|| _|jj}|jj}|d u r|j}t|t	s!J |d u r(|j
}t|t	s/J |d u r6|j}t|ts=J t	|| d |jd  | _|| _|| _|| _
|| _|j| _d S )N   )super__init__r   r   vision_configrr   
patch_sizer   
isinstanceintr   r   booldownsample_rationum_image_tokenr   )selfr   r   r   r   r   rr   r   	__class__rF   rG   r   M  s,   
	zBaseInternVLProcessor.__init__c                 C      t NNotImplementedErrorr   rF   rF   rG   image_token_ids  s   z$BaseInternVLProcessor.image_token_idfeature_sizer=   c                 C   r   r   r   )r   r   r=   rF   rF   rG   get_image_replx  s   z$BaseInternVLProcessor.get_image_replr   r   c                C   sX   |d u r| j n|}|d u r| jn|}|d u r| jn|}|d u r"| jn|}t||||dS Nr   )r   r   r   r   r   )r   r   r   r   r   rF   rF   rG   resolve_min_max_num  s   	z)BaseInternVLProcessor.resolve_min_max_numc                C   s    | j ||||d\}}t||S r   )r   r   )r   r   r   r   r   r   r   rF   rF   rG   resolve_target_ratios  s   

z+BaseInternVLProcessor.resolve_target_ratiosimage_widthimage_heightc                C   s2   | j dd}t||| j|| jd\}}}|| j S )NF)r   )r   r   rr   ro   r   )r   r   rr   r   r   )r   r   r   ro   r=   _rF   rF   rG   get_num_image_tokens  s   
z*BaseInternVLProcessor.get_num_image_tokensr   c                    s,   j |||dd\  fdd|D S )NFr   c              	      s"   g | ]}t |j jd qS )rY   r   r   r   )r   rr   r   r   r   r   r   rF   rG   r     s    zEBaseInternVLProcessor._images_to_pixel_values_lst.<locals>.<listcomp>r   )r   r   r   r   r   rF   r   rG   _images_to_pixel_values_lst  s   
z1BaseInternVLProcessor._images_to_pixel_values_lsttextc                    s   t |dkri }||fS | j||||d}t|tdd |D d}|D ]}|jd }	|	| j }
| |
|	  fdd|D }q'||fS )Nr   r   c                 S      g | ]}t |qS rF   r   r   itemrF   rF   rG   r     r   z;BaseInternVLProcessor._preprocess_image.<locals>.<listcomp>)r;   image_num_patchesc                       g | ]
}| d  jdqS )<image>r+   replacefullr   t
image_replrF   rG   r         )r   r   rD   cattensorshaper   r   )r   r   r   r   r   r   image_inputspixel_values_lstr5   r=   r   rF   r   rG   _preprocess_image  s(   

z'BaseInternVLProcessor._preprocess_image
input_itemc                 C   s    |d u rg }t |ts|g}|S r   )r   rP   )r   r   rF   rF   rG   _make_batch_input  s
   
z'BaseInternVLProcessor._make_batch_inputreturn_tensorsc           
         sT    fdd||fD \}} j |||||d\}} |}i ||}	t|	|dS )Nc                       g | ]}  |qS rF   r   r   r   r   rF   rG   r   
  s    z2BaseInternVLProcessor.__call__.<locals>.<listcomp>r   r   r   r   r   tensor_type)r   r   r   )
r   r   r   r   r   r   r   r   text_inputscombined_outputsrF   r   rG   __call__  s   	

zBaseInternVLProcessor.__call__)NNNr   )NNNNNN) r?   r@   rA   rB   r   r(   r   r   r   propertyr   r   r&   strr   tupler   rP   r   r   r   rD   rE   r   dictr   r   r   r   r   r   __classcell__rF   rF   r   rG   r   D  s    &






 	
r   c                       s  e Zd ZdZddddddedededB dedB dedB d	edB d
df fddZ	e
d
efddZe
d
edB fddZe
d
efddZ	d$deej dedB d
eej fddZ	d$dee deej dedB fddZ							d%deee B dB dejeej B dB dejeej B dB dedB dedB dedB deeB dB d
efddZdededB d
ee fdd ZdefdededB d!ed
ee fd"d#Z  ZS )&InternVLProcessorz
    HF Processor for InternVLChatModel with extended video processing logic.

    Code for video processing is adapted from video example:
    https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers
    N)r   r   r   video_tokenr   r   r   r   r   r  rs   c                   s    t  j|||||d || _d S )N)r   r   r   r   r   )r   r   r  )r   r   r   r   r   r   r  r   rF   rG   r   #  s   

zInternVLProcessor.__init__c                 C   s   | j  t S r   )r   	get_vocabIMG_CONTEXTr   rF   rF   rG   r   7  s   z InternVLProcessor.image_token_idc                 C   s"   | j d u rd S | j | j d S r   )r  r   r  getr   rF   rF   rG   video_token_id;  s   
z InternVLProcessor.video_token_idc                 C   s
   | j d uS r   )r  r   rF   rF   rG   supports_videoA     
z InternVLProcessor.supports_videovideosc                    s,   j dd|dd\  fdd|D S )Nr+   Fr   c              	      s    g | ]}t |j d dqS )Fr   )r   rr   )r   r   r   rF   rG   r   Q  s    zAInternVLProcessor._videos_to_pixel_values_lst.<locals>.<listcomp>r   )r   r	  r   rF   r   rG   _videos_to_pixel_values_lstE  s   
z-InternVLProcessor._videos_to_pixel_values_lstr   c                    s   t |dks	| jsi }||fS | j||d}t|tdd |D d}|D ]}|jd }| | j|| j	  fdd|D }q(||fS )Nr   )r   c                 S   r   rF   r   r   rF   rF   rG   r   l  r   z7InternVLProcessor._preprocess_video.<locals>.<listcomp>)pixel_values_flat_videovideo_num_patchesc                    r   )<video>r+   r   r   
video_replrF   rG   r   v  r   )
r   r  r
  rD   r   r   r   get_video_replr   r  )r   r   r	  r   video_inputspixel_values_lst_videor5   r=   rF   r  rG   _preprocess_video\  s&   

z#InternVLProcessor._preprocess_videor   r   c                    sp    fdd|||fD \}}} j |||||d\}} j|||d\}}	 |}
i |
||	}t||dS )Nc                    r   rF   r   r   r   rF   rG   r     s    
z.InternVLProcessor.__call__.<locals>.<listcomp>r   )r   r	  r   r   )r   r  r   r   )r   r   r   r	  r   r   r   r   r   r  r   r   rF   r   rG   r   y  s$   




zInternVLProcessor.__call__r   r=   c                 C   s    t | }t| t }t|t S r   )r  	IMG_STARTIMG_ENDr&   select_text)r   r   r=   repl_features	repl_fullrF   rF   rG   r     s   z InternVLProcessor.get_image_replvideo_context_tokenc                    s>   || j  }t| t  d fddt|D }t||S )N c                    s    g | ]}d |d  d  qS )Framer+   z: rF   )r   r   repl_features_with_seprF   rG   r     s     z4InternVLProcessor.get_video_repl.<locals>.<listcomp>)r   r  r  joinr   r&   r  )r   r   r=   r  r  r  rF   r  rG   r    s   
z InternVLProcessor.get_video_replr   )NNNNNNN)r?   r@   rA   rB   r   r(   r   r   r   r   r   r   r  r  rP   nptNDArrayrD   rE   r
  r  r   r   r   r   r&   r   r  r  r   rF   rF   r   rG   r    s    	


	
"
r  c                   @   sz   e Zd ZdZededefddZdee	e
dB f fddZd	e
d
e
dedB de
fddZdefddZde
fddZdS )BaseInternVLProcessingInfoz:Basic image-only ProcessingInfo for InternVL-style models.kwargsrs   c                 K   r   r   r   r   r"  rF   rF   rG   get_hf_processor  s   z+BaseInternVLProcessingInfo.get_hf_processorNc                 C   s   dd iS )Nr   rF   r   rF   rF   rG   get_supported_mm_limits  s   z2BaseInternVLProcessingInfo.get_supported_mm_limitsr   r   	processorc                C   s   |d u r|   }|j||dS )N)r   r   )r$  r   )r   r   r   r&  rF   rF   rG   r     s   z/BaseInternVLProcessingInfo.get_num_image_tokensc                 C   s   |   }|j}| }d\}}|D ]!\}}|| || }}	| j||	|d}
|
|kr2|
}t||	d}q|dks;|d u r?td|S )N)r   Nr   r   r&  )rp   rq   r   z(Cannot have a largest feature size of 0!)r$  rr   r   r   r   
ValueError)r   r&  	base_sizero   largest_feature_sizelargest_feature_pinpointwrhrrp   rq   	feat_sizerF   rF   rG   !get_image_size_with_most_features  s$   z<BaseInternVLProcessingInfo.get_image_size_with_most_featuresc                 C   s$   |   }|  \}}| j|||dS )Nr'  )r$  r/  r   )r   r&  r   r   rF   rF   rG   get_max_image_tokens  s   z/BaseInternVLProcessingInfo.get_max_image_tokens)r?   r@   rA   rB   r   objectr   r$  r   r   r   r%  r   r   r/  r0  rF   rF   rF   rG   r!    s     
r!  _I)boundc                	   @   s\   e Zd ZdZdeeef defddZ	ddedeeef deeef dB de	fd	d
Z
dS )BaseInternVLDummyInputsBuilderz>Basic image-only DummyInputsBuilder for InternVL-style models.	mm_countsrs   c                 C   s   | dd}d| S )Nr   r   r   )r  )r   r5  
num_imagesrF   rF   rG   get_dummy_text  s   z-BaseInternVLDummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc                 C   sB   | j  \}}|dd}|r|dnd }d| j||||diS )Nr   r   )rp   rq   r6  	overrides)infor/  r  _get_dummy_images)r   r8  r5  r9  r   r   r6  image_overridesrF   rF   rG   get_dummy_mm_data  s   z0BaseInternVLDummyInputsBuilder.get_dummy_mm_datar   )r?   r@   rA   rB   r   r   r   r7  r   r   r>  rF   rF   rF   rG   r4    s    	
r4  c                
       s   e Zd ZdZdedeeef deeef deeef def
 fddZd	ed
eeef deee	f fddZ
ded
eeef dedee fddZ  ZS )BaseInternVLMultiModalProcessorz?Basic image-only MultiModalProcessor for InternVL-style models.promptmm_data	mm_kwargs
tok_kwargsrs   c                    s>   t  j||||d}| jjdi |}|j}t||d< |S )N)r@  rA  rB  rC  r   rF   )r   _call_hf_processorr;  r$  r   rD   r   )r   r@  rA  rB  rC  processed_outputshf_processorr   r   rF   rG   rD    s   z2BaseInternVLMultiModalProcessor._call_hf_processor	hf_inputshf_processor_mm_kwargsc                 C   sF   | dtd}t|}ttd|tdtdtd|dS )Nr   r   r   )r;   r   rK   r   )	r  rD   emptyr   r   r   flat_from_sizesbatchedshared)r   rG  rH  r   r6  rF   rF   rG   _get_mm_fields_config,  s   
z5BaseInternVLMultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc                    s   j jd	i | | }d|v r"|d ttjsJ  nd|v r0d gt|d  ng dtf fdd}t	dd|dgS )
Nr   rK   item_idxc                    sp    dttf}t|tr|| }n|| }jj|j|j	 d}|  }|d ur2t|t
s2J  ||S )Nr   r'  )	get_itemsr   r   r   get_feature_sizeget_image_sizer;  r   rp   rq   r   r   )rP  r   r   rr   r=   rF  r   rN  r   rF   rG   get_replacement_internvlQ  s   

zUBaseInternVLMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_internvlr   r   modalitytargetreplacementrF   )
r;  r$  get_datar   rD   rE   tolistr   r   r$   )r   rN  rH  rO  out_mm_datarU  rF   rT  rG   _get_prompt_updates=  s    
z3BaseInternVLMultiModalProcessor._get_prompt_updatesr?   r@   rA   rB   r   r   r1  r   rD  r   rM  r    r   r   r%   r]  r   rF   rF   r   rG   r?    s:    






r?  c                       sn   e Zd ZdZedd Z fddZdedB fdd	Zd
e	de
ee	f de	fddZdedefddZ  ZS )InternVLProcessingInfoz5InternVL ProcessingInfo extended for video processingc                 C   s
   |   jS r   )r$  r  r   rF   rF   rG   r  r  r  z%InternVLProcessingInfo.supports_videoc                    s$   | j rdd ini }i t  |S )Nr   )r  r   r%  )r   video_limitr   rF   rG   r%  v  s   z.InternVLProcessingInfo.get_supported_mm_limitsrs   Nc                 C   s&   |    j}ddddd}||S )Nz<|video_pad|>z<|reserved_200000|>)qwen2qwen3	qwen3_moegpt_oss)get_hf_configget_text_config
model_typer  )r   text_model_typevideo_token_maprF   rF   rG   get_video_tokenz  s   
z&InternVLProcessingInfo.get_video_tokenr8  r5  c           	      C   sR   | dd}| dd}|  }|  | }|| |j }|t|d }t|dS )Nr   r   r   r+   )r  r$  r0  r   max)	r   r8  r5  
max_images
max_videosr&  max_image_tokensmax_total_framesmax_frames_per_videorF   rF   rG   !get_num_frames_with_most_features  s   
z8InternVLProcessingInfo.get_num_frames_with_most_featuresr"  c                 K   s(   | j jtf|  |  |  d|S )N)r   r   r  )ctxinit_processorr  re  get_tokenizerrj  r#  rF   rF   rG   r$    s   z'InternVLProcessingInfo.get_hf_processor)r?   r@   rA   rB   r   r  r%  r   rj  r   r   rq  r1  r  r$  r   rF   rF   r   rG   r_  o  s    



r_  c                	       sh   e Zd ZdZdeeef def fddZ	ddedeeef deeef dB de	f fd	d
Z
  ZS )InternVLDummyInputsBuilderz6InternVL DummyInputsBuilder extended for video supportr5  rs   c                    s    | dd}t |d|  S )Nr   r   r  )r  r   r7  )r   r5  
num_videosr   rF   rG   r7    s   z)InternVLDummyInputsBuilder.get_dummy_textNr8  r9  c                    s   t  j|||d}| jjr9| j }|jj}| j||}|dd}|r*|dnd }	d| j	|||||	di}
ni }
i ||
S )N)r8  r5  r9  r   r   )rp   rq   
num_framesrv  r:  )
r   r>  r;  r  re  r   rr   rq  r  _get_dummy_videos)r   r8  r5  r9  dummy_imager   rr   target_num_framesrv  video_overridesdummy_videor   rF   rG   r>    s*   

z,InternVLDummyInputsBuilder.get_dummy_mm_datar   )r?   r@   rA   rB   r   r   r   r7  r   r   r>  r   rF   rF   r   rG   ru    s    	
ru  c                
       s   e Zd ZdZdedeeef deeef deeef def
 fddZd	ed
eeef deee	f f fddZ
ded
eeef dedee f fddZ  ZS )InternVLMultiModalProcessorz7InternVL MultiModalProcessor extended for video supportr@  rA  rB  rC  rs   c                    sL   t  ||||}| jjdi |}| jjr$|j }d ur$t||d< |S )Nr  rF   )r   rD  r;  r$  r  r  rD   r   )r   r@  rA  rB  rC  rE  rF  r  r   rF   rG   rD    s   z.InternVLMultiModalProcessor._call_hf_processorrG  rH  c                    sh   t  ||}| jjr.|dtd}t|}tt	
d|t	dt	d|d}||B S i }||B S )Nr  r   r   )r  r  r  )r   rM  r;  r  r  rD   rI  r   r   r   rJ  rK  rL  )r   rG  rH  image_fieldsr  rv  video_fieldsr   rF   rG   rM    s   

z1InternVLMultiModalProcessor._get_mm_fields_configrN  rO  c                    s   t  j|||d}| jjd	i | | }d|v r+|d ttjs&J  ng dt	f fdd}| jj
rFg |tdd|d}|S )
N)rN  rH  rO  r  rP  c                    s6    j }|  }|d urt|tsJ  j|| jdS )N)r  )r   r   r   r  r  )rP  r   r=   rF  r  rF   rG   get_video_replacement_internvl
  s   zWInternVLMultiModalProcessor._get_prompt_updates.<locals>.get_video_replacement_internvlr   r  rV  rF   )r   r]  r;  r$  rZ  r   rD   rE   r[  r   r  r$   )r   rN  rH  rO  prompt_replr\  r  r   r  rG   r]    s0   

	z/InternVLMultiModalProcessor._get_prompt_updatesr^  rF   rF   r   rG   r}    s:    





r}  )r;  dummy_inputsc                       s   e Zd ZdZededededB fddZdd	d
ededdf fddZ	de
defddZde
dedB dedefddZde
dejfddZdGddZdejdejfddZdededB fddZdededB fd d!Zd"eeB deejd#f fd$d%Zdedefd&d'Zd(ejddfd)d*Z dede!fd+d,Z"	dHdd-d.d(ejd/e!dB d0ejdB d1edejf
 fd2d3Z#		dId(ejdB d4ejd5e$dB d6ejdB dede$fd7d8Z%d9ejdejdB fd:d;Z&d<e'eeejf  de(e fd=d>Z)de*fd?d@Z+dAedefdBdCZ,dDedefdEdFZ-  Z.S )JInternVLChatModelTrW  r   rs   Nc                 C   s$   | drdS | drdS td)Nr   r   r   r  z)Only image or video modality is supported)
startswithr(  )clsrW  r   rF   rF   rG   get_placeholder_str)  s
   

z%InternVLChatModel.get_placeholder_strr  )prefixvllm_configr  c          	         sd  t    |jj}|j}|jj}|| _|| _|jdk| _| 	|| |j
p(|jj}|jj}|| _|| d | _t| j|jd  | _|j| _|j| _|jjd }|dk| _| |ddh | j||| jt|dd| _| || _W d    n1 s|w   Y  | | t||jt|d	d
| _W d    n1 sw   Y  d | _d | _ d | _!| jj"| _"d S )NrN   r   r   InternLM2VEForCausalLMr   r   vision_model)quant_configis_monor  language_model)r  	hf_configr  )#r   r   model_configr  r  multimodal_configr   mm_encoder_tp_modeuse_data_parallel_patch_quant_configforce_image_sizer   rr   r   patch_tokensr   r   r   
ps_versiontext_configarchitecturesr  _mark_tower_model_init_vision_modelr2   r  
_init_mlp1mlp1_mark_language_modelr1   r  img_context_token_idvideo_context_token_idvisual_token_maskmake_empty_intermediate_tensors)	r   r  r  r   r  r  rr   r   llm_arch_namer   rF   rG   r   2  sL   

	
zInternVLChatModel.__init__r   r  c                 C   sF   t |tr|j}t|dd }|js|d ur!|jd d S d S d S d S )Nquantization_configr  )r   r   r  getattrmodules_to_not_convertr   )r   r   r  r  llm_quant_configrF   rF   rG   r  a  s   
z%InternVLChatModel._patch_quant_configr  c                C   sH   |s|j }|dk r|jj| d }n|d }t|j|||dS t|jS )Nr   r+   )r  num_hidden_layers_overrider  )select_layerr   num_hidden_layersr   r   )r   r   r  r  r  vision_feature_layerr  rF   rF   rG   r  n  s   
z$InternVLChatModel._init_vision_modelc              	   C   s^   |j j}|jj}tt|td| j d  t|td| j d  |t	 t||S )Nr+   r   )
r   hidden_sizer  nn
Sequential	LayerNormr   r   LinearGELU)r   r   vit_hidden_sizellm_hidden_sizerF   rF   rG   r    s   
zInternVLChatModel._init_mlp1ru   c              	   C   s   |  \}}}}|||t|| t|| }|dddd }||t|| t|| t|||  }| jdkr@	 |S |dddd }|S )Nr   r   r+   r8   v1)r   viewr   permute
contiguousr  )r   r   scale_factorrL   r:   r9   crF   rF   rG   pixel_shuffle  s    


zInternVLChatModel.pixel_shuffler5   c                 C   s   | j |d}|d d dd d d f }t|jd d  }}||jd ||d}| j|| jd}||jd d|jd }| |}|S )N)r5   r+   ru   r   )r  )r  r   r   reshaper  r   r  )r   r5   
vit_embedsr9   r:   rF   rF   rG   extract_feature  s   
z!InternVLChatModel.extract_featurer"  c           	      K   s   | dd }| dd }| dd }|d u r|d u rd S |d ur&td|dS |d }t|tjr8|   }t|ts?J || _	|d urZ| j
jj }}||d}td|||dS td	)
Nr;   r   rK   r6   rN   r   r9   r:   r5   r6   r;   r=   resolve_bindings This line should be unreachable.)poprJ   r   rD   rE   flattenuniquer   r   r  r   r   rr   r4   AssertionError)	r   r"  r;   r   rK   r   
expected_h
expected_wr  rF   rF   rG   _parse_and_validate_image_input  2   
z1InternVLChatModel._parse_and_validate_image_inputc           	      K   s   | dd }| dd }| dd }|d u r|d u rd S |d ur&td|dS |d }t|tjr8|   }t|ts?J || _	|d urZ| j
jj }}||d}td|||d	S td
)Nr  r  rK   rW   r  r  r  rT   r  r  )r  rV   r   rD   rE   r  r  r   r   r  r   r   rr   rS   r  )	r   r"  r  r  rW   r  r  r  r  rF   rF   rG   _parse_and_validate_video_input  r  z1InternVLChatModel._parse_and_validate_video_inputimage_input.c                    s   |d dks|d dkr|d S |  |d }|d }t|dkr+|d| jjjfS |jd  |d| jjj} fd	d
|D }||S )Nr6   rK   rW   rN   r;   r=   r+   r  c                    s   g | ]}|  qS rF   rF   )r   r=   r   rF   rG   r     s    z;InternVLChatModel._process_vision_input.<locals>.<listcomp>)r  r   r  r   r  r  r   split)r   r  rK   r=   image_feature_sizesrF   r  rG   _process_vision_input  s   


z'InternVLChatModel._process_vision_inputc                 K   sZ   i }|D ]&}|dv rd|vr| j di ||d< |dv r*d|vr*| jdi ||d< q|S )N)r;   rK   r   )r  r	  rF   )r  r  )r   r"  
modalities	input_keyrF   rF   rG   %_parse_and_validate_multimodal_inputs  s   z7InternVLChatModel._parse_and_validate_multimodal_inputs	input_idsc                 C   s6   | j r| jd us
J || jkdd| _d S d | _d S )Nr  r+   )r  r  r  r  )r   r  rF   rF   rG   _set_visual_token_mask%  s   


z(InternVLChatModel._set_visual_token_maskc           	      K   sv   | j di |}|sg S d}|D ](}|dkr%|d }| |}|t|7 }|dkr8|d }| |}|t|7 }q|S )NrF   r   r	  )r  r  r   )	r   r"  r  multimodal_embeddingsrW  r  image_embeddingsvideo_inputvideo_embeddingsrF   rF   rG   embed_multimodal.  s   

z"InternVLChatModel.embed_multimodalF)is_multimodalhandle_oov_mm_tokenr  r  r  c                   sN   |d urt |dkr| | |d u s|d u rt |S t j||||dS )Nr   )r  r  r  )r   r  r   embed_input_ids)r   r  r  r  r  r   rF   rG   r  E  s   
z!InternVLChatModel.embed_input_ids	positionsintermediate_tensorsinputs_embedsc                 K   sP   |d urd }||||d}| j d ur|d| j i d | _ | jjdi |}|S )N)r  r  r  r  r  rF   )r  updater  model)r   r  r  r  r  r"  forward_kwargshidden_statesrF   rF   rG   forward[  s   
zInternVLChatModel.forwardr  c                 C   s   | j |S r   )r  compute_logits)r   r  rF   rF   rG   r  u  s   z InternVLChatModel.compute_logitsweightsc                 C   s   g d}t | |d}||S )N)action_embedtemporal_embedtrack_embedtrack_embed_decoder	box_tokencg_criterioncg_modelloc_encoderloc_decodersamtemporal_tokentrack_token)skip_prefixes)r0   load_weights)r   r  r  loaderrF   rF   rG   r  {  s   
zInternVLChatModel.load_weightsc                 C   s   t jddddS )z<
        Get the module prefix in multimodal models
        r  r  r  )r  	connectortower_model)r   from_string_fieldr   rF   rF   rG   get_mm_mapping  s
   z InternVLChatModel.get_mm_mappingnum_image_tokensc                 C   s.   |dks	| j dkrdS || j  }|| jd  S r   r   r  )r   r  r=   rF   rF   rG   get_num_mm_encoder_tokens  s   
z+InternVLChatModel.get_num_mm_encoder_tokensnum_vision_tokensc                 C   s.   |dks	| j dkrdS || jd  }|| j  S r   r  )r   r  r=   rF   rF   rG   get_num_mm_connector_tokens  s   
z-InternVLChatModel.get_num_mm_connector_tokens)ru   r   )NN)/r?   r@   rA   supports_encoder_tp_dataclassmethodr   r   r  r   r   r   r   r  r   r  r  Moduler  r  rD   rE   r  r1  rR   r  rS   r  rX   r   r  r   r  r  r,   r  r  r'   r  r  r   setr  r   r  r  r   r   rF   rF   r   rG   r  !  s     /



$
$
	

$
r  )sabcr   r   collections.abcr   r   r   typingr   r   r	   r
   r   numpy.typingr  rD   torch.nnr  torchvision.transforms
transformsrb   PILr   transformersr   r   r   vllm.configr   vllm.config.multimodalr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.quantization.awqr   %vllm.model_executor.models.intern_vitr   r   )vllm.model_executor.models.module_mappingr   vllm.multimodalr   vllm.multimodal.imager   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   r   r    vllm.multimodal.processingr!   r"   r#   r$   r%   r&   vllm.sequencer'   vllm.tokenizersr(   vllm.utils.tensor_schemar)   r*   
interfacesr,   r-   r.   r/   utilsr0   r1   r2   r  r  r  r`   ra   r4   rJ   rR   rC   rS   rV   rX   r   rm   rv   rP   r   r~   r   r   r   r   r   rE   r   r   r   r   r  r!  r2  r4  r?  r_  ru  r}  register_processorr  r  rF   rF   rF   rG   <module>   s  	 





 
*

 X <^
/
)Z