o
    
۾iW                     @   s~  d dl mZ d dlmZ d dlZd dlmZ d dlmZ	 d dl
mZ d dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZmZmZmZmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z, ddl-m.Z.m/Z/m0Z0m1Z1 ddl2m3Z3m4Z4m5Z5 dZ6dZ7dZ8de9fddZ:de;de<e=e9e9f  de9de9de9d e=e9e9f fd!d"Z>d#e9d$e9de<e=e9e9f  de9d%e?d e=e9e9e9f fd&d'Z@d(ejde<e=e9e9f  de9d%e?d e<ej f
d)d*ZAd+e9d,e9d e<e=e9e9f  fd-d.ZBd(ejde9d+e9d,e9d%e?d ejCfd/d0ZDG d1d2 d2eZEG d3d4 d4eZFe"jGeeF eFeeF d5G d6d7 d7ejHe0e1e/ZIdS )8    )ABC)IterableN)Image)	AutoModelPretrainedConfig)BaseImageProcessorFast)
VllmConfig)QuantizationConfig)	AWQConfig)BaseInternVLDummyInputsBuilderBaseInternVLMultiModalProcessorBaseInternVLProcessingInfoInternVLImageEmbeddingInputsInternVLImageInputsInternVLImagePixelInputsInternVLProcessor)MultiModelKeys)MULTIMODAL_REGISTRYconvert_image_mode)PromptUpdateDetails)IntermediateTensors)TokenizerLike)"cached_image_processor_from_config   )MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPP)AutoWeightsLoaderinit_vllm_registered_modelmaybe_prefixz<img>z</img><image>
input_sizec                 C   s0   t t dd t j| | ft jjdt  gS )Nc                 S   s
   t | dS )NRGBr   )img r&   Z/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/nemotron_vl.py<lambda>8   s   
 z!build_transform.<locals>.<lambda>)interpolation)TComposeLambdaResizeInterpolationModeBICUBICToTensorr#   r&   r&   r'   build_transform5   s   r2   aspect_ratiotarget_ratioswidthheight
image_sizereturnc                C   sv   t d}d}|| }|D ],\}}	||	 }
t||	 | | | d}t|
|  | |
 }|| }||kr8|}||	f}q|S )Nz-inf)r   r   g333333?)floatmin)r3   r4   r5   r6   r7   best_factor
best_ratioarearwrhtarget_aspect_ratiosize_factorratio_closenessfactorr&   r&   r'   find_closest_aspect_ratioB   s   rD   
orig_widthorig_heightuse_thumbnailc           
      C   s`   | | }t ||| ||d}||d  }||d  }|d |d  }	|r+|	dkr+|	d7 }	|	||fS )N)r5   r6   r7   r   r   )rD   )
rE   rF   r4   r7   rG   r3   r@   target_widthtarget_heightblocksr&   r&   r'   calculate_nemotron_vl_targets]   s   	
rK   imagec                C   s   | j \}}t||||dd\}}}| ||f}	g }
t|D ].}|||  | |||  | |||  d | |||  d | f}|	|}|
| qt|
|ksUJ |rit|
dkri| ||f}|
| |
S )NF)rE   rF   r4   r7   rG   r   )sizerK   resizerangecropappendlen)rL   r4   r7   rG   rE   rF   rJ   rH   rI   resized_imgprocessed_imagesibox	split_imgthumbnail_imgr&   r&   r'   dynamic_preprocess_nemotron_vl|   s.   
	

rY   min_nummax_numc                    s.    fddt  d D }t|dd dS )Nc                    sX   h | ](}t d |d  D ]}t d |d  D ]}||   kr" krn n||fqqqS )r   )rO   ).0nrU   jr[   rZ   r&   r'   	<setcomp>   s    
z0get_nemotron_vl_target_ratios.<locals>.<setcomp>r   c                 S   s   | d | d  S )Nr   r   r&   xr&   r&   r'   r(      s    z/get_nemotron_vl_target_ratios.<locals>.<lambda>)key)rO   sorted)rZ   r[   r4   r&   r_   r'   get_nemotron_vl_target_ratios   s   re   c                   s@   t ||}t|d t| |||d}t fdd|D }|S )Nr1   )r4   r7   rG   c                    s   g | ]} |qS r&   r&   r\   rL   	transformr&   r'   
<listcomp>       z5image_to_pixel_values_nemotron_vl.<locals>.<listcomp>)re   r2   rY   torchstack)rL   r#   rZ   r[   rG   r4   imagespixel_valuesr&   rg   r'   !image_to_pixel_values_nemotron_vl   s   

ro   c                   @   s"  e Zd ZdddddededededB dedB dedB d	dfd
dZe	d	efddZ
deded	efddZ			ddeej dedB dedB dedB d	eej f
ddZ			ddee deej dedB dedB dedB d	eee eeejf f fddZdededB d	ee fddZdS )NemotronVLProcessorNmin_dynamic_patchmax_dynamic_patchdynamic_image_sizeconfig	tokenizerimage_processorrr   rs   rt   r8   c          	      C   s   t |  || _|| _|| _|j}|j}|d u rd}t|ts!J |d u r)| jj	}t|ts0J |d u r6d}t|t
s=J t|| d |jd  | _|| _|| _|| _|| _| jj| _d S )Nr   T   )r   __init__ru   rv   rw   force_image_size
patch_size
isinstanceintmax_num_tilesbooldownsample_rationum_image_tokenr7   rr   rs   rt   rG   )	selfru   rv   rw   rr   rs   rt   r7   r{   r&   r&   r'   ry      s.   

zNemotronVLProcessor.__init__c                 C   s   | j  t S N)rv   	get_vocabIMG_CONTEXTr   r&   r&   r'   image_token_id   s   z"NemotronVLProcessor.image_token_idimage_widthimage_heightc                C   s2   | j dd}t||| j|| jd\}}}|| j S )NF)rG   )rE   rF   r7   r4   rG   )resolve_target_ratiosrK   r7   rG   r   )r   r   r   r4   num_patches_r&   r&   r'   get_num_image_tokens   s   
z(NemotronVLProcessor.get_num_image_tokensrm   c                    s,   j |||dd\  fdd|D S )NF)rr   rs   rt   rG   c              	      s"   g | ]}t |j jd qS ))r#   rZ   r[   rG   )ro   r7   rG   rf   r[   rZ   r   r&   r'   ri     s    zCNemotronVLProcessor._images_to_pixel_values_lst.<locals>.<listcomp>)resolve_min_max_num)r   rm   rr   rs   rt   r&   r   r'   _images_to_pixel_values_lst	  s   
z/NemotronVLProcessor._images_to_pixel_values_lsttextc                    s   t |dkri }||fS | j||||d}t|tdd |D d}|D ]"}|jd }	|	| j }
| |
|	}|j	dd  fdd|D }q'd	d |D }||fS )
Nr   rq   c                 S   s   g | ]}t |qS r&   )rR   )r\   itemr&   r&   r'   ri   6  rj   z9NemotronVLProcessor._preprocess_image.<locals>.<listcomp>)pixel_values_flatimage_num_patchesr"   <NVL_IMG_CONTEXT>c                    s   g | ]	}| d  dqS )r"   r   )replacer\   tNVL_IMAGE_CONTEXTr&   r'   ri   A  s    c                 S   s   g | ]}| d tqS )r   )r   r   r   r&   r&   r'   ri   B  s    )
rR   r   rk   cattensorshaper   get_image_replfullr   )r   r   rm   rr   rs   rt   image_inputspixel_values_lstrn   r   feature_size
image_replr&   r   r'   _preprocess_image"  s0   

z%NemotronVLProcessor._preprocess_imager   r   c                 C   s    t | }t| t }t|t S r   )r   	IMG_STARTIMG_ENDr   select_text)r   r   r   repl_features	repl_fullr&   r&   r'   r   E  s   z"NemotronVLProcessor.get_image_repl)NNN)__name__
__module____qualname__r   r   r   r}   r   ry   propertyr   r   listr   rk   Tensorr   strtupledictr   r   r   r&   r&   r&   r'   rp      s    	
&


#rp   c                   @   s0   e Zd ZdZdedefddZdefddZdS )	NemotronVLProcessingInfoz'Processing info for Nemotron VL models.kwargsr8   c                 K   s(   | j jtf|  |  |  d|S )N)ru   rv   rw   )ctxinit_processorrp   get_hf_configget_tokenizerget_image_processorr   r   r&   r&   r'   get_hf_processorS  s   z)NemotronVLProcessingInfo.get_hf_processorc                 K   s   t | jjfi |S r   )r   r   model_configr   r&   r&   r'   r   \  s
   z,NemotronVLProcessingInfo.get_image_processorN)r   r   r   __doc__objectrp   r   r   r&   r&   r&   r'   r   P  s    	r   )infodummy_inputsc                       s  e Zd ZededededB fddZddd	ed
eddf fddZde	de
fddZde	de
dB d
efddZde	dejfddZd=ddZdejdejfddZdededB fddZdedeejdf fd d!Zdedefd"d#Zd$ejddfd%d&Zdedefd'd(Z	d>dd)d*d$ejd+edB d,ejdB d-edejf
 fd.d/Z		d?d$ejdB d0ejd1e dB d2ejdB dede fd3d4Z!d5ejdejdB fd6d7Z"d8e#eeejf  de$e fd9d:Z%de&fd;d<Z'  Z(S )@LlamaNemotronVLChatModelmodalityrU   r8   Nc                 C   s   | drdS td)NrL   r"   z Only image modality is supported)
startswith
ValueError)clsr   rU   r&   r&   r'   get_placeholder_stri  s   
z,LlamaNemotronVLChatModel.get_placeholder_str )prefixvllm_configr   c                   s,  t    |jj}|j}|jj}|| _|| _| || |jp"|j	j
}|j	j}|| _t|| d |jd  | _|j| _|j| _| |d | j||t|dd| _| || _W d    n1 scw   Y  | | t||jt|dd| _W d    n1 sw   Y  d | _d | _| jj| _d S )Nrx   rL   vision_model)quant_configr   language_model)r   	hf_configr   )superry   r   r   r   multimodal_configru   _patch_quant_configrz   vision_configr7   r{   r}   r   r   
ps_version_mark_tower_model_init_vision_modelr!   r   
_init_mlp1mlp1_mark_language_modelr    text_configr   img_context_token_idvisual_token_maskmake_empty_intermediate_tensors)r   r   r   ru   r   r   r7   r{   	__class__r&   r'   ry   p  sD   

z!LlamaNemotronVLChatModel.__init__ru   r   c                 C   sF   t |tr|j}t|dd }|js|d ur!|jd d S d S d S d S )Nquantization_configr   )r|   r
   r   getattrmodules_to_not_convertrQ   )r   ru   r   r   llm_quant_configr&   r&   r'   r     s   
z,LlamaNemotronVLChatModel._patch_quant_configc                C   s   t j|jddS )NT)trust_remote_code)r   from_configr   )r   ru   r   r   r&   r&   r'   r     s   z+LlamaNemotronVLChatModel._init_vision_modelc              	   C   sj   |j }|j}|jj}ttj|td| j d  ddtj	|td| j d  |ddt
 t	||S )Nr   rx   T)bias)vit_hidden_sizeprojector_hidden_sizer   hidden_sizenn
Sequential	LayerNormr}   r   LinearGELU)r   ru   r   vision_projection_hidden_sizellm_hidden_sizer&   r&   r'   r     s   
z#LlamaNemotronVLChatModel._init_mlp1      ?c              	   C   s   |  \}}}}|||t|| t|| }|dddd }||t|| t|| t|||  }| jdkr@	 |S |dddd }|S )Nr   rx   r      v1)rM   viewr}   permute
contiguousr   )r   rb   scale_factorr]   whcr&   r&   r'   pixel_shuffle  s    


z&LlamaNemotronVLChatModel.pixel_shufflern   c                 C   s   | j |dj}|jtjd}t|jd d  }}||jd ||d}| j|| j	d}||jd d|jd }| 
|}|S )Nra   )dtyper   r   r   )r   )r   featurestork   bfloat16r}   r   reshaper   r   r   )r   rn   
vit_embedsr   r   r&   r&   r'   extract_feature  s   
z(LlamaNemotronVLChatModel.extract_featurer   c                 K   s   | dd }| dd }| dd }|d u r|d u rd S |d ur&td|dS |d }t|tjr8|   }t|ts?J || _	|d urUt
d||| jj| jjddS td	)
Nr   r   image_embeds)typedatar   rn   )r   r   )r  r   r   resolve_bindingsz This line should be unreachable.)popr   r|   rk   r   flattenuniquer   r}   r   r   ru   rz   AssertionError)r   r   r   r   r  r   r&   r&   r'   _parse_and_validate_image_input  s2   
z8LlamaNemotronVLChatModel._parse_and_validate_image_inputimage_input.c                    s   |d dkr
|d S |  |d }|d }t|dkr%|d| jjjfS |jd  |d| jjj} fdd	|D }||S )
Nr  r  r  r   r   r   r   c                    s   g | ]}|  qS r&   r&   )r\   r   r   r&   r'   ri     s    zALlamaNemotronVLChatModel._process_image_input.<locals>.<listcomp>)r  rR   r   ru   r   r   r   split)r   r  r  r   image_feature_sizesr&   r  r'   _process_image_input  s   


z-LlamaNemotronVLChatModel._process_image_inputc                 K   s6   i }|D ]}|dv rd|vr| j di ||d< q|S )N)r   r  rm   r&   )r
  )r   r   
modalities	input_keyr&   r&   r'   %_parse_and_validate_multimodal_inputs  s   z>LlamaNemotronVLChatModel._parse_and_validate_multimodal_inputs	input_idsc                 C   s
   d | _ d S r   )r   )r   r  r&   r&   r'   _set_visual_token_mask*  s   
z/LlamaNemotronVLChatModel._set_visual_token_maskc                 K   sP   | j di |}|sg S d}|D ]}|dkr%|d }| |}|t|7 }q|S )Nr&   rm   )r  r  r   )r   r   r  multimodal_embeddingsr   r  image_embeddingsr&   r&   r'   embed_multimodal-  s   
z)LlamaNemotronVLChatModel.embed_multimodalF)is_multimodalhandle_oov_mm_tokenr  r  r  c                   sN   |d urt |dkr| | |d u s|d u rt |S t j||||dS )Nr   )r  r  r  )rR   r  r   embed_input_ids)r   r  r  r  r  r   r&   r'   r  @  s   
z(LlamaNemotronVLChatModel.embed_input_ids	positionsintermediate_tensorsinputs_embedsc                 K   sP   |d urd }||||d}| j d ur|d| j i d | _ | jjdi |}|S )N)r  r  r  r  r   r&   )r   updater   model)r   r  r  r  r  r   forward_kwargshidden_statesr&   r&   r'   forwardV  s   
z LlamaNemotronVLChatModel.forwardr!  c                 C   s   | j |S r   )r   compute_logits)r   r!  r&   r&   r'   r#  p  s   z'LlamaNemotronVLChatModel.compute_logitsweightsc                 C   s   ddg}t | |d}||S )N	norm_meannorm_std)skip_substrs)r   load_weights)r   r$  r'  loaderr&   r&   r'   r(  v  s   
z%LlamaNemotronVLChatModel.load_weightsc                 C   s   t jddddS )z<
        Get the module prefix in multimodal models
        r   r   r   )r   	connectortower_model)r   from_string_fieldr   r&   r&   r'   get_mm_mapping}  s
   z'LlamaNemotronVLChatModel.get_mm_mapping)r   r   )NN))r   r   r   classmethodr   r}   r   r   ry   r   r	   r   r   r   Moduler   r   rk   r   r  r   r   r
  r   r  r   r  r  r   r  r   r  r   r"  r#  r   setr(  r   r-  __classcell__r&   r&   r   r'   r   c  s     *

	

$


$r   )Jabcr   collections.abcr   rk   torch.nnr   torchvision.transforms
transformsr*   PILr   transformersr   r   (transformers.image_processing_utils_fastr   vllm.configr   'vllm.model_executor.layers.quantizationr	   +vllm.model_executor.layers.quantization.awqr
   #vllm.model_executor.models.internvlr   r   r   r   r   r   r   )vllm.model_executor.models.module_mappingr   vllm.multimodalr   vllm.multimodal.imager   vllm.multimodal.processingr   vllm.sequencer   vllm.tokenizersr   !vllm.transformers_utils.processorr   
interfacesr   r   r   r   utilsr   r    r!   r   r   r   r}   r2   r9   r   r   rD   r   rK   rY   re   r   ro   rp   r   register_processorr/  r   r&   r&   r&   r'   <module>   s   	$	



)

 