o
    
۾i0w                     @   s  U d dl mZmZmZ d dlmZmZmZ d dlZ	d dl
Z
d dlmZ d dlmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlm Z  d dl!m"Z" d dl#m$Z$m%Z%m&Z& d dl'm(Z(m)Z)m*Z*m+Z+ d dl,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2 d dl3m4Z4 d dl5m6Z6 d dl7m8Z8m9Z9 ddl:m;Z;m<Z<m=Z=m>Z> ddl?m@Z@mAZAmBZBmCZC G dd dejDZEG dd de8ZFG dd de8ZGeFeGB ZHeeId< G dd de8ZJG d d! d!e8ZKeJeKB ZLeeId"< d#eMd$eMd%eNd&eNd'eOeMeMf f
d(d)ZPd*eMd+eMd'eQeOeMeMf  fd,d-ZRG d.d/ d/e/ZSG d0d1 d1e-eS ZTG d2d3 d3e.eS ZUe"jVeUeSeTd4G d5d6 d6ejDe=e>e<ZWdS )7    )IterableMappingSequence)	AnnotatedLiteral	TypeAliasN)BatchFeatureInternVLProcessorPretrainedConfig)ACT2FN)GotOcr2ImageProcessorFast)InternVLVideoProcessor)
VllmConfig)BaseDummyOptions)QuantizationConfig)InternS1VisionModel)MultiModelKeys)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)ImageEmbeddingItemsImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)"cached_video_processor_from_config)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPP)AutoWeightsLoaderWeightsMapperinit_vllm_registered_modelmaybe_prefixc                       s$   e Zd Z fddZdd Z  ZS )InternS1MultiModalProjectorc                    sz   t    t|jjtd|j d  | _t	|jjtd|j d  |j
j| _t|j | _t	|j
j|j
j| _d S )Nr%      )super__init__nn	LayerNormvision_confighidden_sizeintdownsample_ratio
layer_normLineartext_configlinear_1r   projector_hidden_actactlinear_2selfconfig	__class__ W/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/interns1.pyr1   D   s   

z$InternS1MultiModalProjector.__init__c                 C   s,   |  |}| |}| |}| |}|S N)r8   r;   r=   r>   )r@   image_featureshidden_statesrD   rD   rE   forwardR   s
   



z#InternS1MultiModalProjector.forward)__name__
__module____qualname__r1   rI   __classcell__rD   rD   rB   rE   r.   C   s    r.   c                   @   sT   e Zd ZU dZdZed ed< eej	e
ddddf ed< eej	e
df ed	< d
S )InternS1ImagePixelInputsz
    Dimensions:
        - bnp: Batch size * number of images * (1 + num_patches)
        - c: Number of channels (3)
        - h: Height
        - w: Width
        - bn: Batch size * number of images
    pixel_valuestypebnp   hwbnnum_patchesNrJ   rK   rL   __doc__rP   r   __annotations__r   torchTensorr$   rD   rD   rD   rE   rN   Z   
   
 	rN   c                   @   F   e Zd ZU dZdZed ed< eej	e
ej	 B edddf ed< dS )	InternS1ImageEmbeddingInputsz
    Dimensions:
        - ni: Number of images
        - tifs: Total image feature size
        - hs: Hidden size (must match language model backbone)
    image_embedsrP   nitifshsdataNrJ   rK   rL   rX   rP   r   rY   r   rZ   r[   listr$   rD   rD   rD   rE   r^   i      
 (r^   InternS1ImageInputsc                   @   sT   e Zd ZU dZdZed ed< eej	e
ddddf ed< eej	e
d	f ed
< dS )InternS1VideoPixelInputsz
    Dimensions:
        - bnv: Batch size * number of videos * number of frames
        - bn: Batch size * number of images
        - c: Number of channels (3)
        - h: Height
        - w: Width
    pixel_values_videosrP   bnvrR   rS   rT   rO   rU   rV   NrW   rD   rD   rD   rE   rh   x   r\   rh   c                   @   r]   )	InternS1VideoEmbeddingInputsz
    Dimensions:
        - nv: Number of videos
        - tvfs: Total video feature size
        - hs: Hidden size (must match language model backbone)
    video_embedsrP   nvtvfsrb   rc   Nrd   rD   rD   rD   rE   rk      rf   rk   InternS1VideoInputsmin_dynamic_patchmax_dynamic_patchdynamic_image_sizeuse_thumbnailreturnc                 C   s4   |r| nd} |r
|nd}|r|dkr|d7 }| |fS )Nr%   rD   )rp   rq   rr   rs   rD   rD   rE   resolve_interns1_min_max_num   s
   ru   min_nummax_numc                    s.    fddt  d D }t|dd dS )Nc                    sX   h | ](}t d |d  D ]}t d |d  D ]}||   kr" krn n||fqqqS )r%   )range).0nijrw   rv   rD   rE   	<setcomp>   s    
z-get_interns1_target_ratios.<locals>.<setcomp>r%   c                 S   s   | d | d  S )Nr   r%   rD   )xrD   rD   rE   <lambda>   s    z,get_interns1_target_ratios.<locals>.<lambda>)key)rx   sorted)rv   rw   target_ratiosrD   r}   rE   get_interns1_target_ratios   s   r   c                	   @   s   e Zd ZdZdedefddZdeee	dB f fddZ
dd	d
e	de	dedB de	fddZddedB fddZdefddZde	fddZde	deee	f de	fddZdS )InternS1ProcessingInfoz)ProcessingInfo for InternS1-style models.kwargsrt   c                 K   s8   | j jtfi |}t| j jft|jjd||_|S )N)processor_clssize)	ctxget_hf_processorr	   r"   model_configr   image_processorr   video_processor)r@   r   hf_processorrD   rD   rE   r      s   z'InternS1ProcessingInfo.get_hf_processorNc                 C   s
   d d dS )NimagevideorD   r@   rD   rD   rE   get_supported_mm_limits      
z.InternS1ProcessingInfo.get_supported_mm_limits)	processorimage_widthimage_heightr   c                C   sR   |d u r	|   j}t|tstdt| |j||t d}|   j| }|S )Nz.GotOcr2ImageProcessorFast is expected but got )images_kwargs)	r   r   
isinstancer   
ValueErrorrP   get_number_of_image_patchesdictimage_seq_length)r@   r   r   r   num_image_patchesnum_image_tokensrD   rD   rE   get_num_image_tokens   s   

z+InternS1ProcessingInfo.get_num_image_tokensrs   c                 C   s>   |   j}|j}|j}|j}d}t||||d\}}t||S )NT)rs   )r   r   min_patchesmax_patchescrop_to_patchesru   r   )r@   rs   r   rp   rq   rr   rv   rw   rD   rD   rE   resolve_target_ratios   s   


z,InternS1ProcessingInfo.resolve_target_ratiosc                 C   s   |   }| j }|jj\}}|  }d\}}|D ]"\}}	|| ||	 }
}| j|
||jd}||kr;|}t|
|d}q|dksD|d u rHJ d|S )N)r   Nr   r   r   )widthheightr   z(Cannot have a largest feature size of 0!)	r   r   get_hf_configr4   
image_sizer   r   r   r   )r@   r   	hf_configbase_height
base_widthr   largest_feature_sizelargest_feature_pinpointwrhrr   r   	feat_sizerD   rD   rE   !get_image_size_with_most_features   s(   
z8InternS1ProcessingInfo.get_image_size_with_most_featuresc                 C   s&   |   }|  \}}| j|||jdS )Nr   )r   r   r   r   )r@   r   target_widthtarget_heightrD   rD   rE   get_max_image_tokens  s   z+InternS1ProcessingInfo.get_max_image_tokensseq_len	mm_countsc           	      C   sR   | dd}| dd}|  }|  | }|| |j }|t|d }t|dS )Nr   r   r   r%   )getr   r   r   max)	r@   r   r   
max_images
max_videosr   max_image_tokensmax_total_framesmax_frames_per_videorD   rD   rE   !get_num_frames_with_most_features  s   
z8InternS1ProcessingInfo.get_num_frames_with_most_featuresrF   )rJ   rK   rL   rX   objectr	   r   r   strr6   r   r   r   boolr   r   r   r   r   rD   rD   rD   rE   r      s0    



r   c                	   @   s\   e Zd ZdZdeeef defddZ	ddedeeef deeef dB de	fd	d
Z
dS )InternS1DummyInputsBuilderz-DummyInputsBuilder for InternS1-style models.r   rt   c                 C   s@   | dd}| dd}| j j}| j j}|| ||  S )Nr   r   r   )r   infor   image_tokenvideo_token)r@   r   
num_images
num_videosr   r   rD   rD   rE   get_dummy_text   s
   z)InternS1DummyInputsBuilder.get_dummy_textNr   
mm_optionsc                 C   s   | j  \}}| j ||}|dd}|dd}| j  }	|	jj\}
}|r,|dnd }|r5|dnd }| j||||d| j||
|||ddS )Nr   r   r   )r   r   r   	overrides)r   r   
num_framesr   r   r   )	r   r   r   r   r   r4   r   _get_dummy_images_get_dummy_videos)r@   r   r   r   r   r   target_num_framesr   r   rA   image_size_himage_size_wimage_overridesvideo_overridesrD   rD   rE   get_dummy_mm_data(  s0   
z,InternS1DummyInputsBuilder.get_dummy_mm_datarF   )rJ   rK   rL   rX   r   r   r6   r   r   r   r   rD   rD   rD   rE   r     s    
r   c                
       s   e Zd ZdZdedeeef deeef deeef def
 fddZd	ed
eeef deee	f fddZ
ded
eeef dedee fddZ  ZS )InternS1MultiModalProcessorz?Basic image-only MultiModalProcessor for InternS1-style models.promptmm_data	mm_kwargs
tok_kwargsrt   c                    s  t |}|dg }|dg }t|tsJ t|tsJ | jjdi |}|j}|j|jdd}	t	|	dks:J |	d }	t
|jd|}t
|jd|}i }
|rg }|D ]*}t j|jd|i||d	}||d
 |d}||d }|d|d}qVdd |D }t|t|t|jd}
i }|rg }|D ]1}t j|jd|i||d	}||d
 |d}|	|||jk< ||d }|d|d}qdd |D }t|t|t|	d}t
d|j|}t
d|j|}||fi |ddi}ti ||
|S )NvideosimagesF)add_special_tokensr%   r   z<image_placeholder>z<video_placeholder>)r   r   r   r   rO   	input_idsc                 S      g | ]}t |qS rD   lenry   itemrD   rD   rE   
<listcomp>w      zBInternS1MultiModalProcessor._call_hf_processor.<locals>.<listcomp>)rO   image_num_patchesimage_token_idc                 S   r   rD   r   r   rD   rD   rE   r     r   )ri   video_num_patchesvideo_token_idreturn_tensorsptrD   )r   popr   re   r   r   	tokenizerencoder   r   resubr   r0   _call_hf_processorappendbatch_decodereplacerZ   concattensorr   r   )r@   r   r   r   r   r   r   r   r   r   image_outputsimage_pixel_valuesr   processed_outputsr   image_placeholderrV   video_outputsvideo_pixel_valuesr   video_placeholderr   text_outputsrB   rD   rE   r   O  sr   


z.InternS1MultiModalProcessor._call_hf_processor	hf_inputshf_processor_mm_kwargsc                 C   s|   | dtd}| dtd}t|}t|}ttd|tdtdtd|td|tdtd|dS )Nr   r   r   r   r   )rO   r   r_   r   ri   r   r   )	r   rZ   emptyr   r   r   flat_from_sizesbatchedshared)r@   r   r   r   r   r   r   rD   rD   rE   _get_mm_fields_config  s"   

z1InternS1MultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc                    s   | j jdi |jjj j| }d|v r.|d ttj	s)J 
 ng d|v rE|d ttj	s@J 
 ng dtf fdd}dtf fdd}td|d	td
|d	gS )Nr   r   item_idxc                    sX    dttf}t|tr|| }n	|  }|j }| }|   }t|S )Nr   )	get_itemsr   r   r   get_feature_sizer   r    select_text)r  r   feature_sizerV   repl_features	repl_full)end_image_tokenr   r   img_context_tokenr  start_image_tokenrD   rE   get_replacement_interns1_image  s   

zWInternS1MultiModalProcessor._get_prompt_updates.<locals>.get_replacement_interns1_imagec                    sF   |  }j  }|   d fddt|D }t|S )N
c                    s    g | ]}d |d  d  qS )Framer%   z: rD   )ry   r{   repl_features_with_seprD   rE   r     s     zkInternS1MultiModalProcessor._get_prompt_updates.<locals>.get_replacement_interns1_video.<locals>.<listcomp>)r   joinrx   r    r
  )r  rV   r  r  )r  r   r  r   r   r  rE   get_replacement_interns1_video  s   
zWInternS1MultiModalProcessor._get_prompt_updates.<locals>.get_replacement_interns1_videor   )modalitytargetreplacementr   rD   )r   r   r   r  r  r   get_datar   rZ   r[   tolistr6   r   )r@   r  r   r  out_mm_datar  r  rD   )r  r   r   r  r  r  r   r   rE   _get_prompt_updates  s:   

z/InternS1MultiModalProcessor._get_prompt_updates)rJ   rK   rL   rX   r   r   r   r   r   r   r  r   r   r   r   r  rM   rD   rD   rB   rE   r   L  s:    


N



r   )r   dummy_inputsc                       s  e Zd ZeddddddZededed	ed
B fddZddde	ded	d
f fddZ
deded
B defddZded	ejfddZdCddZdejd	ejfddZded	ed
B fd d!Zded	ed
B fd"d#Zd$eeB d	eejd%f fd&d'Zded	efd(d)Zd*ejd	d
fd+d,Zded	efd-d.Z 	
dDd
d/d0d*ejd1ed
B d2ejd
B d3e!d	ejf
 fd4d5Z"	
	
dEd*ejd
B d6ejd7e#d
B d8ejd
B ded	e#fd9d:Z$d;ejd	ejd
B fd<d=Z%d>e&eeejf  d	e'e fd?d@Z(d	e)fdAdBZ*  Z+S )F InternS1ForConditionalGenerationzlanguage_model.lm_head.zlanguage_model.model.zvision_tower.zmulti_modal_projector.)zlm_head.zmodel.language_model.zmodel.vision_tower.zmodel.multi_modal_projector.)orig_to_new_prefixr  r{   rt   Nc                 C   s$   | drdS | drdS td)Nr   z<IMG_CONTEXT>r   z<video>z)Only image or video modality is supported)
startswithr   )clsr  r{   rD   rD   rE   get_placeholder_str
  s
   

z4InternS1ForConditionalGeneration.get_placeholder_str )prefixvllm_configr&  c                   s$  t    |jj}|j}|jj}|| _|| _|jjd }|jj	d }|| _	t
|| d |jd  | _|j| _| |ddh | j||t|dd| _| || _W d    n1 s\w   Y  | | t||jt|dd| _W d    n1 s}w   Y  d | _d | _d | _| jj| _d S )	Nr   r/   r   r   vision_tower)quant_configr&  language_model)r'  r   r&  )r0   r1   r   r   r)  multimodal_configrA   r4   r   
patch_sizer6   r7   num_image_token_mark_tower_model_init_vision_modelr-   r(  
_init_mlp1multi_modal_projector_mark_language_modelr,   r:   r*  img_context_token_idvideo_context_token_idvisual_token_maskmake_empty_intermediate_tensors)r@   r'  r&  rA   r)  r+  r   r,  rB   rD   rE   r1     sB   

z)InternS1ForConditionalGeneration.__init__rA   r)  c                C   s   |j j}t|j |||dS )N)r)  num_hidden_layers_overrider&  )r4   num_hidden_layersr   )r@   rA   r)  r&  r8  rD   rD   rE   r/  =  s   z3InternS1ForConditionalGeneration._init_vision_modelc                 C   s   t |S rF   )r.   r?   rD   rD   rE   r0  L  s   z+InternS1ForConditionalGeneration._init_mlp1      ?c              	   C   s   |  \}}}}|||t|| t|| }|dddd }||t|| t|| t|||  }|dddd }|S )Nr   r/   r%   rR   )r   viewr6   permute
contiguous)r@   r   scale_factorrz   rT   rS   crD   rD   rE   pixel_shuffleO  s    

z.InternS1ForConditionalGeneration.pixel_shufflerO   c                 C   s   | j |d}|d d dd d d f }t|jd d  }}||jd ||d}| j|| jd}||jd d|jd }| |}|S )N)rO   r%   r9  r   )r=  )r(  r6   shapereshaper?  r7   r1  )r@   rO   
vit_embedsrS   rT   rD   rD   rE   extract_feature^  s   
z0InternS1ForConditionalGeneration.extract_featurer   c                 K   s   | dd }| dd }| dd }|d u r|d u rd S |d ur&td|dS |d }t|tjr8|   }t|ts?J || _	|d urX| j
jj\}}td||||ddS td)	NrO   r   r_   rP   rc   r   rS   rT   )rP   rO   rV   resolve_bindings This line should be unreachable.)r   r^   r   rZ   r[   flattenuniquer   r6   r3  rA   r4   r   rN   AssertionError)r@   r   rO   r   r_   r   rS   rT   rD   rD   rE   _parse_and_validate_image_inputj  4   
z@InternS1ForConditionalGeneration._parse_and_validate_image_inputc                 K   s   | dd }| dd }| dd }|d u r|d u rd S |d ur&td|dS |d }t|tjr8|   }t|ts?J || _	|d urX| j
jj\}}td||||ddS td)	Nri   r   rl   rE  r   rF  )rP   rV   rO   rG  rH  )r   rk   r   rZ   r[   rI  rJ  r   r6   r4  rA   r4   r   rh   rK  )r@   r   pixel_values_flat_videor   rl   r   rS   rT   rD   rD   rE   _parse_and_validate_video_input  rM  z@InternS1ForConditionalGeneration._parse_and_validate_video_inputimage_input.c                    s   |d dks|d dkr|d S |  |d }|d }t|dkr+|d| jjjfS |jd  |d| jjj} fd	d
|D }||S )NrP   r_   rl   rc   rO   rV   r%   r@  c                    s   g | ]}|  qS rD   rD   )ry   rV   r  rD   rE   r     s    zJInternS1ForConditionalGeneration._process_vision_input.<locals>.<listcomp>)rD  r   r:  rA   r:   r5   rA  split)r@   rP  r_   rV   image_feature_sizesrD   rQ  rE   _process_vision_input  s   


z6InternS1ForConditionalGeneration._process_vision_inputc                 K   sZ   i }|D ]&}|dv rd|vr| j di ||d< |dv r*d|vr*| jdi ||d< q|S )N)rO   r_   r   )ri   r   rD   )rL  rO  )r@   r   
modalities	input_keyrD   rD   rE   %_parse_and_validate_multimodal_inputs  s   zFInternS1ForConditionalGeneration._parse_and_validate_multimodal_inputsr   c                 C   s
   d | _ d S rF   )r5  )r@   r   rD   rD   rE   _set_visual_token_mask  r   z7InternS1ForConditionalGeneration._set_visual_token_maskc           	      K   sv   | j di |}|sg S d}|D ](}|dkr%|d }| |}|t|7 }|dkr8|d }| |}|t|7 }q|S )NrD   r   r   )rW  rT  tuple)	r@   r   rU  multimodal_embeddingsr  rP  image_embeddingsvideo_inputvideo_embeddingsrD   rD   rE   embed_multimodal  s   

z1InternS1ForConditionalGeneration.embed_multimodalF)is_multimodalhandle_oov_mm_tokenrZ  r_  r`  c                   sN   |d urt |dkr| | |d u s|d u rt |S t j||||dS )Nr   )rZ  r_  r`  )r   rX  r0   embed_input_ids)r@   r   rZ  r_  r`  rB   rD   rE   ra    s   
z0InternS1ForConditionalGeneration.embed_input_ids	positionsintermediate_tensorsinputs_embedsc                 K   s0   |d urd }||||d}| j jdi |}|S )N)r   rb  rc  rd  rD   )r*  model)r@   r   rb  rc  rd  r   forward_kwargsrH   rD   rD   rE   rI     s   z(InternS1ForConditionalGeneration.forwardrH   c                 C   s   | j |S rF   )r*  compute_logits)r@   rH   rD   rD   rE   rg  $  s   z/InternS1ForConditionalGeneration.compute_logitsweightsc                 C   s   t | }|j|| jdS )N)mapper)r*   load_weightshf_to_vllm_mapper)r@   rh  loaderrD   rD   rE   rj  *  s   z-InternS1ForConditionalGeneration.load_weightsc                 C   s   t jddddS )z<
        Get the module prefix in multimodal models
        r*  r1  r(  )r*  	connectortower_model)r   from_string_fieldr   rD   rD   rE   get_mm_mapping.  s
   z/InternS1ForConditionalGeneration.get_mm_mapping)r9  rF   )NN),rJ   rK   rL   r+   rk  classmethodr   r6   r$  r   r1   r
   r   r/  r2   Moduler0  r?  rZ   r[   rD  r   rg   rL  ro   rO  rY  rT  r   rW  rX  r&   r^  r   ra  r!   rI   rg  r   setrj  r   rp  rM   rD   rD   rB   rE   r     s    		 
(


%
%


$r   )Xcollections.abcr   r   r   typingr   r   r   regexr   rZ   torch.nnr2   transformersr   r	   r
   transformers.activationsr   ;transformers.models.got_ocr2.image_processing_got_ocr2_fastr   6transformers.models.internvl.video_processing_internvlr   vllm.configr   vllm.config.multimodalr   'vllm.model_executor.layers.quantizationr   'vllm.model_executor.models.interns1_vitr   )vllm.model_executor.models.module_mappingr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   r   r   vllm.multimodal.processingr   r   r   r   r   r    vllm.sequencer!   !vllm.transformers_utils.processorr"   vllm.utils.tensor_schemar#   r$   
interfacesr&   r'   r(   r)   utilsr*   r+   r,   r-   rr  r.   rN   r^   rg   rY   rh   rk   ro   r6   r   rY  ru   re   r   r   r   r   register_processorr   rD   rD   rD   rE   <module>   sx    


j/ -


