o
    
۾iZ                     @   s@  U d Z ddlZddlmZmZmZ ddlmZmZm	Z	 ddl
Z
ddlmZ ddlm  mZ ddlmZmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z% ddl&m'Z'm(Z(m)Z)m*Z* ddl+m,Z, ddl-m.Z.m/Z/m0Z0m1Z1m2Z2 ddl3m4Z4 ddl5m6Z6 ddl7m8Z8m9Z9m:Z: ddl;m<Z< ddl=m>Z>m?Z? ddl@mAZA ddlBmCZCmDZDmEZE ddlFmGZGmHZHmIZImJZJ dZKG dd de>ZLG dd de>ZMeLeMB ZNe	eOd< G d d! d!ejPZQG d"d# d#e/ZRG d$d% d%e,eR ZSG d&d' d'e.eR ZTe jUeTeReSd(G d)d* d*ejPeDeEZVdS )+zFInference-only Deepseek-VL2 model compatible with HuggingFace weights.    N)IterableMappingSequence)	AnnotatedLiteral	TypeAlias)	rearrangerepeat)BatchFeature)
VllmConfig)BaseDummyOptions)$get_tensor_model_parallel_world_size)QuantizationConfig)replace_linear_class)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItemsMultiModalUUIDDict)ImageEmbeddingItemsImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilder)BaseMultiModalProcessorBaseProcessingInfoMultiModalProcessingInfoPromptReplacementPromptUpdate)IntermediateTensors)cached_tokenizer_from_config)DeepseekVLV2ConfigMlpProjectorConfigVisionEncoderConfig)DeepseekVLV2Processor)TensorSchemaTensorShape)set_default_torch_dtype   )MultiModalEmbeddingsSupportsMultiModal
SupportsPP)AutoWeightsLoaderWeightsMapperinit_vllm_registered_modelmaybe_prefix<image>c                	   @   sX   e Zd ZU dZed ed< eeje	dddddhdf ed	< eeje	d
df ed< dS )DeepseekVL2ImagePixelInputsz
    Dimensions:
        - bnp: Batch size * number of images * number of patches
        - p: Number of patches
        - c: Number of channels (3)
        - h: Height of each image
        - w: Width of each image
    pixel_valuestypebnp   hw)dynamic_dimsdatabn   images_spatial_cropN)
__name__
__module____qualname____doc__r   __annotations__r   torchTensorr&    rD   rD   [/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/deepseek_vl2.pyr1   B   s
   
 	"r1   c                   @   sB   e Zd ZU dZed ed< eeje	ej B e
dddf ed< dS )	 DeepseekVL2VImageEmbeddingInputsz
    Dimensions:
        - bn: Batch size * number of images
        - f: Image feature size
        - h: Hidden size (must match language model backbone)
    image_embedsr3   r:   fr6   r9   N)r=   r>   r?   r@   r   rA   r   rB   rC   listr&   rD   rD   rD   rE   rF   Q   s   
 (rF   DeepseekVL2ImageInputsc                       s*   e Zd Zdef fddZdd Z  ZS )MlpProjectorcfgc                    s  t    || _|j| _|jrJ d| jdkri|j}|j}t|j	|j
 |j
 |j| g}td|d D ]}|t  |t|j| |j|  q6|t  |t|j| |j tj| }n| jdkrwt|j	|j}ntd|j || _d S )Nz)Token pooling is not supported currently.downsample_mlp_gelur(   linearzUnsupported projector type: )super__init__rL   projector_typetoken_poolingdepth	mlp_rationnLinear	input_dimdownsample_ration_embedrangeappendGELU
SequentialNotImplementedErrorlayers)selfrL   	mlp_depthrT   modules_	__class__rD   rE   rP   c   s4   




zMlpProjector.__init__c              	   C   s   |j \}}}| jdkr^t|d  }}	 || jj r%| jj|| jj  }nd}|||||}|dkrAt|ddd|d|fdd}	 |dddd}tj	|| jj| jjdd}|ddd}| 
|S )	NrM         ?r   constantr5   r(   r;   )kernel_sizestridepadding)shaperQ   intrL   rX   reshapeFpadpermuteunfoldr_   )r`   xbshwrW   r6   r7   ro   rD   rD   rE   forward   s(   

zMlpProjector.forward)r=   r>   r?   r"   rP   ru   __classcell__rD   rD   rd   rE   rK   b   s    !rK   c                	   @   sj   e Zd Zdd ZdefddZdeeedB f fdd	Z	d
ddedede
defddZdefddZdS )DeepseekVL2ProcessingInfoc                 C   s   | j tS N)ctxget_hf_configr!   r`   rD   rD   rE   rz      s   z'DeepseekVL2ProcessingInfo.get_hf_configkwargsc                 K   s   | j jtfi |S rx   )ry   get_hf_processorr$   )r`   r|   rD   rD   rE   r}      s   z*DeepseekVL2ProcessingInfo.get_hf_processorreturnNc                 C   s   dd iS )NimagerD   r{   rD   rD   rE   get_supported_mm_limits   s   z1DeepseekVL2ProcessingInfo.get_supported_mm_limitsT)croppingimage_widthimage_heightr   c                C   s   |   }|j}|j}|j}|r"|||f\}}	|| |	| }
}nd }
}t|| |  }}||d  }|| |
| d  }|| d S )Nr(   )r}   
image_size
patch_sizerX   select_best_resolutionmathceil)r`   r   r   r   hf_processorr   r   rX   
best_widthbest_heightnum_width_tilesnum_height_tilesr6   r7   global_views_tokenslocal_views_tokensrD   rD   rE   get_num_image_tokens   s    z.DeepseekVL2ProcessingInfo.get_num_image_tokensc                    s2      }|j}t| fddd\}}t||dS )Nc                    s    j | d | d dS )Nr(   r   )r   r   )r   )rr   r{   rD   rE   <lambda>   s    zMDeepseekVL2ProcessingInfo.get_image_size_with_most_features.<locals>.<lambda>)key)widthheight)rz   candidate_resolutionsmaxr   )r`   	hf_configr   r   r   rD   r{   rE   !get_image_size_with_most_features   s   

z;DeepseekVL2ProcessingInfo.get_image_size_with_most_features)r=   r>   r?   rz   objectr}   r   strrl   r   boolr   r   r   rD   rD   rD   rE   rw      s    
rw   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )DeepseekVL2DummyInputsBuilder	mm_countsr~   c                 C   s$   | dd}| j }|j}|| S )Nr   r   )getinfor}   image_token)r`   r   
num_images	processorr   rD   rD   rE   get_dummy_text   s   
z,DeepseekVL2DummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc                 C   sB   | dd}| j }|r| dnd }d| j|j|j||diS )Nr   r   )r   r   r   	overrides)r   r   r   _get_dummy_imagesr   r   )r`   r   r   r   r   max_image_sizeimage_overridesrD   rD   rE   get_dummy_mm_data   s   
z/DeepseekVL2DummyInputsBuilder.get_dummy_mm_datarx   )
r=   r>   r?   r   r   rl   r   r   r   r   rD   rD   rD   rE   r      s    
r   c                       s   e Zd Zdedeeef deeef deeef def
 fddZded	eeef deeef fd
dZ	de
d	eeef dedee fddZ	ddeee B de
d	eeef deeef dedB deee eef f fddZ  ZS )DeepseekVL2MultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsr~   c                    sJ   |s| j  }||dddS t j||||d}|d dd |d< |S )	NTpt)add_special_tokensreturn_tensors)r   r   r   r   r<   r(   num_patches)r   get_tokenizerrO   _call_hf_processorprod)r`   r   r   r   r   	tokenizerprocessed_outputsrd   rD   rE   r      s   
z1DeepseekVL2MultiModalProcessor._call_hf_processor	hf_inputshf_processor_mm_kwargsc                 C   s4   | dtd}ttd|tdtddS )Nr   r   r   )r2   r<   rG   )r   rB   emptydictr   flat_from_sizesbatched)r`   r   r   r   rD   rD   rE   _get_mm_fields_config  s   
z4DeepseekVL2MultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc                    sN   j jdi |}|j t tsJ dtf fdd}td g|dgS )Nitem_idxc                    sX    dttf}t|tr|| }n|| }jj|j|j	t
|dkd} g| S )Nr   r;   )r   r   r   )	get_itemsr   r   
isinstanceget_feature_sizeget_image_sizer   r   r   r   len)r   imagesnum_image_tokensr   image_token_idr   r`   rD   rE   get_replacement_deepseek_vl2  s   



zXDeepseekVL2MultiModalProcessor._get_prompt_updates.<locals>.get_replacement_deepseek_vl2r   )modalitytargetreplacementrD   )r   r}   r   r   rl   r   )r`   r   r   r   r   r   rD   r   rE   _get_prompt_updates  s   z2DeepseekVL2MultiModalProcessor._get_prompt_updatesNmm_data_itemstokenization_kwargsmm_uuidsc                    s<   |j ddddkr| j|||||dS t j|||||dS )Nr   F)strictr;   )r   r   r   r   r   )	get_count_apply_hf_processorrO   _cached_apply_hf_processor)r`   r   r   r   r   r   rd   rD   rE   r   7  s   z9DeepseekVL2MultiModalProcessor._cached_apply_hf_processorrx   )r=   r>   r?   r   r   r   r
   r   r   r   r   r   r   r   r   rI   rl   r   tupler   r   r   rv   rD   rD   rd   rE   r      sT    







*


r   )r   dummy_inputsc                       s  e Zd ZeddidZededededB fdd	Zd
dde	def fddZ
dejjdefddZdejjdefddZ	
d3dededB dedejfddZdededB fddZdejdejdeej fd d!Zd"edejeej B fd#d$Zdedefd%d&Z		d4d'ejdB d(ejd)edB d*ejdB def
d+d,Zd-ejdejdB fd.d/Zd0ee eejf  de!e fd1d2Z"  Z#S )5DeepseekVLV2ForCausalLMz	language.zlanguage_model.)orig_to_new_prefixr   ir~   Nc                 C   s   | drdS td)Nr   r0   z Only image modality is supported)
startswith
ValueError)clsr   r   rD   rD   rE   get_placeholder_stra  s   
z+DeepseekVLV2ForCausalLM.get_placeholder_str prefixvllm_configr   c          	         s|  t    |jj}|j}|jj}|| _|| _|j| _|j| _|j	| _	|j}t
|}|jt | _| |dY | | j|t|d| _t| j| _|j| _|j| _dttj| jjtjd }| jdkrtt| jj| | _tt| jj| | _nt d| j W d    n1 sw   Y  | !| t"|| j	t|dd| _#W d    n1 sw   Y  | j#j$| _$d S )	Nr   visionr(   dtype2Dz.Only 2D tile_tag is supported currently, got: language)r   r   r   )%rO   rP   model_configr   quant_configmultimodal_configconfigvision_configprojector_configtext_configr    vocab_IMAGE_TOKENr   _mark_tower_model_init_vision_moduler/   r   rK   	projectortile_tagglobal_view_posrB   sqrttensorrY   float32rU   	Parameterrandnimage_newlineview_seperatorr   _mark_language_modelr.   language_modelmake_empty_intermediate_tensors)	r`   r   r   r   r   r   r   r   	embed_stdrd   rD   rE   rP   h  sV   



z DeepseekVLV2ForCausalLM.__init__rootdotted_namec                 C   s6   | d}|}|dd D ]}t||}q||d fS )zAReturn (parent_module, final_attr_name) for a dotted module path..Nr   )splitgetattr)r`   r  r  namesparentnrD   rD   rE   _get_parent_and_attr  s
   
z,DeepseekVLV2ForCausalLM._get_parent_and_attrvitr   c           
   
   C   s   zdd l }W n ty } ztd|d }~ww | D ]E\}}t|tjra| ||\}}t||jjrH|dkrHt	|d||d}	t
|||	 qt||jjra|dkrat	|d||d}	t
|||	 q|S )Nr   Please install timmfc1colwiser   fc2rowwise)timmImportErrornamed_modulesr   rU   rV   r  r_   Mlpr   setattr)
r`   r  r   r  enamemoduler
  	attr_name
new_linearrD   rD   rE   patch_vit_for_tp  s*   
z(DeepseekVLV2ForCausalLM.patch_vit_for_tpr   c              
   C   s   zdd l }W n ty } ztd|d }~ww ttj |jdddddd}W d    n1 s2w   Y  t dkrB| ||}|jt	 d}|S )	Nr   r  z#vit_so400m_patch14_siglip_384.webliFT)
pretrainednum_classesdynamic_img_sizedynamic_img_padr(   r   )
r  r  r'   rB   float16create_modelr   r  toget_default_dtype)r`   r   r   r   r  r  modelrD   rD   rE   r     s&   

	z+DeepseekVLV2ForCausalLM._init_vision_moduler|   c                 K   s~   | dd }| dd }| dd }|d u r|d u rd S |d ur1| jj }}td||||ddS |d ur;td|dS td)Nr2   r<   rG   )r6   r7   )r3   r9   r<   resolve_bindings)r3   r9   z This line should be unreachable.)popr   r   r1   rF   AssertionError)r`   r|   r2   r<   rG   
expected_h
expected_wrD   rD   rE   _parse_and_validate_image_input  s*   
z7DeepseekVLV2ForCausalLM._parse_and_validate_image_inputr2   r<   c              	   C   sz  | j |}| |}|j\}}}t|d  }}	d}
g }t|dD ]}|| \}}|dks4|dkr7 |S || }||
 }||
d |
d |  }|
|d 7 }
|||	|}t| j	d|d}t
j||gdd}|d|}t|d||||	d	}t| j	d
||d}t
j||gdd}|d|}| jdkrt
|| jd d d f |g}nt
|| jd d d f |g}|| q$|S )Nrf   r   r(   z
d -> h 1 d)r6   )dimr   z"(th tw) (h w) d -> (th h) (tw w) d)thtwr6   r7   zd -> (th h) 1 d)r.  r6   head)r   forward_featuresr   rk   rl   rZ   sizeviewr	   r   rB   catr   r   r   r[   )r`   r2   r<   images_featureimages_embedsrc   rt   n_dimr6   r7   
tile_indexvision_embeddingsjdxr   r   num_tiles_in_imageglobal_featureslocal_featuresnew_lines_in_globalnew_lines_in_localglobal_local_featuresrD   rD   rE   _pixel_values_to_embedding  sb   
E


z2DeepseekVLV2ForCausalLM._pixel_values_to_embeddingimage_inputc                 C   s2   |d dkr
|d S |d }|d }| j ||dS )Nr3   rG   r9   r<   )r2   r<   )rA  )r`   rB  r2   r<   rD   rD   rE   _process_image_inputS  s   z,DeepseekVLV2ForCausalLM._process_image_inputc                 K   s*   | j di |}|d u rg S | |}|S )NrD   )r,  rC  )r`   r|   rB  r9  rD   rD   rE   embed_multimodal`  s
   
z(DeepseekVLV2ForCausalLM.embed_multimodal	input_ids	positionsintermediate_tensorsinputs_embedsc                 K   s"   |d urd }| j ||||d}|S )N)rH  )r  )r`   rE  rF  rG  rH  r|   hidden_statesrD   rD   rE   ru   g  s   zDeepseekVLV2ForCausalLM.forwardrI  c                 C   s   | j |S rx   )r  compute_logits)r`   rI  rD   rD   rE   rJ  x  s   z&DeepseekVLV2ForCausalLM.compute_logitsweightsc                 C   s   t | }|j|| jd}|S )N)mapper)r,   load_weightshf_to_vllm_mapper)r`   rK  loaderautoloaded_weightsrD   rD   rE   rM  ~  s   z$DeepseekVLV2ForCausalLM.load_weights)r   )NN)$r=   r>   r?   r-   rN  classmethodr   rl   r   r   rP   rB   rU   Moduler  r   r  r#   r   r   rJ   r,  rC   rI   rA  rC  r)   rD  r   ru   rJ  r   r   setrM  rv   rD   rD   rd   rE   r   U  sr    7	


\


,r   )Wr@   r   collections.abcr   r   r   typingr   r   r   rB   torch.nnrU   torch.nn.functional
functionalrn   einopsr   r	   transformersr
   vllm.configr   vllm.config.multimodalr   vllm.distributedr   'vllm.model_executor.layers.quantizationr   -vllm.model_executor.models.transformers.utilsr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   r   vllm.multimodal.parser   r   r   r   vllm.multimodal.processingr   $vllm.multimodal.processing.processorr   r   r   r   r   vllm.sequencer   vllm.tokenizersr    ,vllm.transformers_utils.configs.deepseek_vl2r!   r"   r#   /vllm.transformers_utils.processors.deepseek_vl2r$   vllm.utils.tensor_schemar%   r&   vllm.utils.torch_utilsr'   
interfacesr)   r*   r+   utilsr,   r-   r.   r/   r   r1   rF   rJ   rA   rR  rK   rw   r   r   register_processorr   rD   rD   rD   rE   <module>   sX   
;/
j