o
    
۾i                     @   s$  d dl mZmZ d dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZmZ d dlmZmZmZ d d	lmZmZmZ d
dlmZ d
dlmZmZmZmZm Z  dZ!G dd deZ"G dd deZ#G dd dee# Z$G dd dee# Z%ej&e%e#e$dG dd de Z'dS )    )MappingSequenceN)PretrainedConfig)BaseDummyOptions)QuantizationConfig)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalKwargsItems)ImageEmbeddingItemsImageProcessorItemsMultiModalDataItems)PromptReplacementPromptUpdatePromptUpdateDetails   )InternVisionModel)BaseInternVLDummyInputsBuilderBaseInternVLMultiModalProcessorBaseInternVLProcessingInfoBaseInternVLProcessorInternVLChatModelz<|vision_pad|>c                   @   s<   e Zd ZedefddZdededB dee fddZdS )	NVLMProcessorreturnc                 C   s   | j  t S N)	tokenizer	get_vocabIMG_PAD)self r   U/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/nvlm_d.pyimage_token_id,   s   zNVLMProcessor.image_token_idfeature_sizenum_patchesNc                    sl   |d u rt ddd td|D }| jr|dg7 }||  d fdd|D }d	| d
 }t|tS )Nz"Embedding inputs are not supportedc                 S   s   g | ]}d | dqS )z<tile_>r   ).0ir   r   r   
<listcomp>8   s    z0NVLMProcessor.get_image_repl.<locals>.<listcomp>r   z<tile_global_thumbnail> c                 3   s    | ]	}|t    V  qd S r   )r   )r$   
identifiercontext_sizer   r   	<genexpr>=   s    
z/NVLMProcessor.get_image_repl.<locals>.<genexpr>z<Image>z</Image>)NotImplementedErrorrangeuse_thumbnailjoinr   select_textr   )r   r!   r"   tile_pos_identifiersfeaturesreplr   r)   r   get_image_repl0   s   
zNVLMProcessor.get_image_repl)	__name__
__module____qualname__propertyintr    r   strr4   r   r   r   r   r   +   s    r   c                   @   s   e Zd ZdedefddZdS )NVLMProcessingInfokwargsr   c                 K   s"   | j jtf|  |  d|S )N)configr   )ctxinit_processorr   get_hf_configget_tokenizer)r   r<   r   r   r   get_hf_processorJ   s   z#NVLMProcessingInfo.get_hf_processorN)r5   r6   r7   objectr   rB   r   r   r   r   r;   I   s    r;   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )NVLMDummyInputsBuilder	mm_countsr   c                 C   s   | dd}d| S )Nimager   <image>
)get)r   rE   
num_imagesr   r   r   get_dummy_textT   s   z%NVLMDummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc                 C   sB   | j  \}}|dd}|r|dnd }d| j||||diS )NrF   r   )widthheightrI   	overrides)info!get_image_size_with_most_featuresrH   _get_dummy_images)r   rK   rE   rL   target_widthtarget_heightrI   image_overridesr   r   r   get_dummy_mm_data[   s   z(NVLMDummyInputsBuilder.get_dummy_mm_datar   )
r5   r6   r7   r   r:   r9   rJ   r   r   rV   r   r   r   r   rD   S   s    
rD   c                	   @   s2   e Zd Zdedeeef dedee	 fddZ
dS )NVLMMultiModalProcessormm_itemshf_processor_mm_kwargsout_mm_kwargsr   c                    s   j jd	i | | }d|v r"|d ttjsJ  nd|v r0d gt|d  ng dtf fdd}t	dd|dgS )
Nimage_num_patchesimage_embedsitem_idxc                    s    dttf}t|tr|| }n|| }jj|j|j	 d}|  }|d ur2t|t
s2J  ||}t|jd tS )NrF   )image_widthimage_height	processor
)	get_itemsr
   r   
isinstanceget_feature_sizeget_image_sizerP   get_num_image_tokensrM   rN   r9   r4   r   r0   fullr   )r]   imagesr!   
image_sizer"   r3   hf_processorr[   rX   r   r   r   get_replacement_nvlm   s    

zINVLMMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_nvlmrF   rG   )modalitytargetreplacementr   )
rP   rB   get_datarc   torchTensortolistlenr9   r   )r   rX   rY   rZ   out_mm_datarl   r   rj   r   _get_prompt_updatesq   s    
z+NVLMMultiModalProcessor._get_prompt_updatesN)r5   r6   r7   r   r   r:   rC   r	   r   r   rv   r   r   r   r   rW   p   s    
rW   )rP   dummy_inputsc                   @   s>   e Zd ZdedejfddZdededB dede	fd	d
Z
dS )NVLM_D_Modelr=   r   c              
   C   sn   |j j}|jj}|jj}tt|td| j d  tj	|td| j d  |ddt
 tj	||ddS )Nr      F)bias)vision_confighidden_sizetext_configintermediate_sizenn
Sequential	LayerNormr9   downsample_ratioLinearGELU)r   r=   vit_hidden_sizellm_intermediate_sizellm_hidden_sizer   r   r   
_init_mlp1   s   zNVLM_D_Model._init_mlp1quant_configNis_monoprefixc                C   sL   |s |j }|dk r|jj| d }n|d }t|j||d|dS d}t|)Nr   r      )r   num_hidden_layers_overridenum_dummy_headsr   z)Monolith mode is not applicable to NVLM_D)select_layerr{   num_hidden_layersr   r,   )r   r=   r   r   r   vision_feature_layerr   msgr   r   r   _init_vision_model   s   zNVLM_D_Model._init_vision_model)r5   r6   r7   r   r   Moduler   r   boolr:   r   r   r   r   r   rx      s    rx   )(collections.abcr   r   rq   torch.nnr   transformersr   vllm.config.multimodalr   'vllm.model_executor.layers.quantizationr   vllm.multimodalr   vllm.multimodal.inputsr   r	   vllm.multimodal.parser
   r   r   vllm.multimodal.processingr   r   r   
intern_vitr   internvlr   r   r   r   r   r   r   r;   rD   rW   register_processorrx   r   r   r   r   <module>   s.   	
6