o
    
۾i<                     @   s  d Z ddlZddlmZmZmZ ddlmZ ddlZddl	m
Z
 ddlmZ ddlmZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZmZmZmZ ddlmZ ddlm Z m!Z!m"Z"m#Z# ddl$m%Z%m&Z&m'Z'm(Z( ddl)m*Z*m+Z+m,Z,m-Z-m.Z. ddl/m0Z0 ddl1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7m8Z8 ddl9m:Z: ddl;m<Z< ddl=m>Z> ddl?m@Z@ ddlAmBZB dZCdZDG dd de,ZEG dd de*eE ZFG d d! d!e+eE ZGejHeGeEeFd"G d#d$ d$e
jIeeeZJdS )%zFInference-only Deepseek-OCR model compatible with HuggingFace weights.    N)IterableMappingSequence)partial)BatchFeature)
VllmConfig)BaseDummyOptions)MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPP)MultiModelKeys)AutoWeightsLoaderWeightsMapperinit_vllm_registered_modelmaybe_prefix)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItemsNestedTensors)ImageEmbeddingItemsImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdate)IntermediateTensors)cached_tokenizer_from_config)DeepseekVLV2Config)	BASE_SIZE	CROP_MODEDeepseekOCRProcessor   )count_tiles   )ImageEncoderViT)build_qwen2_decoder_as_encoder)DeepseekOCRImagePixelInputs)MlpProjector   <image>c                	   @   sj   e Zd Zdd ZdefddZdeeedB f fdd	Z	d
ddedede
defddZdefddZdS )DeepseekOCR2ProcessingInfoc                 C   s   | j tS N)ctxget_hf_configr"   self r5   \/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/deepseek_ocr2.pyr2   E   s   z(DeepseekOCR2ProcessingInfo.get_hf_configkwargsc                 K   s,   t tttdd}| jjtfi i ||S )Nv2)
image_size	base_size	crop_modestrategy)dict
IMAGE_SIZEr#   r$   r1   get_hf_processorr%   )r4   r7   v2_processor_configr5   r5   r6   r?   H   s   
z+DeepseekOCR2ProcessingInfo.get_hf_processorreturnNc                 C   s   dd iS )Nimager5   r3   r5   r5   r6   get_supported_mm_limitsS   s   z2DeepseekOCR2ProcessingInfo.get_supported_mm_limitsT)croppingimage_widthimage_heightrD   c                C   s   t }t}d}d}tr#|dkr|dkrddg}nt||t d}|\}	}
nd }	}
t|| |  }}t|| |  }}|| }|	dksI|
dkrR|
| |	|  }nd}|| d S )N      r-   r(   )r9   r   )r>   r#   r$   r'   mathceil)r4   rE   rF   rD   r9   r:   
patch_sizedownsample_ratio
crop_rationum_width_tilesnum_height_tileshwh2w2global_views_tokenslocal_views_tokensr5   r5   r6   get_num_image_tokensV   s&   

z/DeepseekOCR2ProcessingInfo.get_num_image_tokensc                 C   s(   t dkrtdkrtdddS tdddS )N   i   i   )widthheighti   )r>   r#   r   r3   r5   r5   r6   !get_image_size_with_most_featuresw   s   z<DeepseekOCR2ProcessingInfo.get_image_size_with_most_features)__name__
__module____qualname__r2   objectr?   r   strintrC   boolrV   r   rZ   r5   r5   r5   r6   r/   D   s    
!r/   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )DeepseekOCR2DummyInputsBuilder	mm_countsrA   c                 C   s$   | dd}| j }|j}|| S )NrB   r   )getinfor?   image_token)r4   rc   
num_images	processorrf   r5   r5   r6   get_dummy_text   s   
z-DeepseekOCR2DummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc                 C   s.   | dd}| j }d| j|j|j|diS )NrB   r   )rX   rY   rg   )rd   re   rZ   _get_dummy_imagesrX   rY   )r4   rj   rc   rk   rg   max_image_sizer5   r5   r6   get_dummy_mm_data   s   
z0DeepseekOCR2DummyInputsBuilder.get_dummy_mm_datar0   )
r[   r\   r]   r   r_   r`   ri   r   r   rn   r5   r5   r5   r6   rb   }   s    
rb   c                
   @   s   e Zd Zdedeeef deeef deeef def
ddZded	eeef deeef fd
dZ	de
d	eeef dedee fddZdS )DeepseekOCR2MultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsrA   c                 C   sR   |r| j j| j jdi |tdd|i||}|S | j  }||ddd}|S )Nrp   Tpt)add_special_tokensreturn_tensorsr5   )re   r1   call_hf_processorr?   r=   get_tokenizer)r4   rp   rq   rr   rs   processed_outputs	tokenizerr5   r5   r6   _call_hf_processor   s   
z2DeepseekOCR2MultiModalProcessor._call_hf_processor	hf_inputshf_processor_mm_kwargsc                 C   sr   | dtd}|d d df dk|d d df dkB }t||jddd}ttdtdtd|dS )	Nimages_spatial_crop)r      r   r(   dimrB   )pixel_valuesr~   images_crop)	rd   torchemptywhereprodr=   r   batchedflat_from_sizes)r4   r|   r}   r~   is_tiledpatches_per_imager5   r5   r6   _get_mm_fields_config   s   (z5DeepseekOCR2MultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc                    sN   j jdi |}|j t tsJ dtf fdd}td g|dgS )Nitem_idxc                    sP    dttf}t|tr|| }n|| }jj|j|j	t
d} g| S )NrB   )rE   rF   rD   )	get_itemsr   r   
isinstanceget_feature_sizeget_image_sizere   rV   rX   rY   r$   )r   imagesnum_image_tokenssizeimage_token_idr   r4   r5   r6   get_replacement_deepseek_vl2   s   


zYDeepseekOCR2MultiModalProcessor._get_prompt_updates.<locals>.get_replacement_deepseek_vl2rB   )modalitytargetreplacementr5   )re   r?   r   r   r`   r   )r4   r   r}   r   hf_processorr   r5   r   r6   _get_prompt_updates   s   z3DeepseekOCR2MultiModalProcessor._get_prompt_updatesN)r[   r\   r]   r_   r   r^   r   r{   r   r   r   r   r   r   r   r5   r5   r5   r6   ro      s8    







ro   )re   dummy_inputsc                       sn  e Zd ZedddddddZeded	ed
edB fddZddde	def fddZ
ded
edB fddZdejd
ejfddZdejd
ejdB fddZdejdejdejd
efdd Zd!ed
ejfd"d#Zded
edB fd$d%Z		d4d&ejd'ejd(edB d)ejdB def
d*d+Zd,ejd
ejdB fd-d.Zd/eeeejf  d
ee fd0d1Zd
efd2d3Z  Z S )5DeepseekOCR2ForCausalLMz"language_model.model.embed_tokens.zlanguage_model.model.layers.zlanguage_model.model.norm.zlanguage_model.lm_head. )zmodel.embed_tokens.zmodel.layers.zmodel.norm.zlm_head.zmodel.)orig_to_new_prefixr   irA   Nc                 C   s   | drdS td)NrB   r.   z Only image modality is supported)
startswith
ValueError)clsr   r   r5   r5   r6   get_placeholder_str   s   
z+DeepseekOCR2ForCausalLM.get_placeholder_str)prefixvllm_configr   c          	         s  t    |jj}|jj}|| _|| _|j| _|j| _|j| _|j}t	|}|j
t | _| |d^ tddddttjjddddd	d	g d
dddd| _t | _t| j| _|j| _|j| _| jj}dttj|tjd }| jdkrtt|| | _ nt!d| j W d    n1 sw   Y  | "| t#|| jt$|dd| _%W d    n1 sw   Y  | j%j&| _&d S )NrB      r-   rW   rH   gư>)epsrG   T)r                  i  )depth	embed_dimimg_size	mlp_ratio
norm_layer	num_headsrK   qkv_biasuse_rel_posglobal_attn_indexeswindow_size	out_chanslast_conv_outputr(   dtype2Dz.Only 2D tile_tag is supported currently, got: language_model)r   	hf_configr   )'super__init__model_configr   multimodal_configconfigvision_configprojector_configtext_configr!   vocab_IMAGE_TOKENr   _mark_tower_modelr)   r   r   nn	LayerNorm	sam_modelr*   qwen2_modelr,   	projectortile_tagglobal_view_posn_embedsqrttensorfloat32	Parameterrandnview_seperatorr   _mark_language_modelr   r   r   make_empty_intermediate_tensors)	r4   r   r   r   r   r   rz   r   	embed_std	__class__r5   r6   r     sb   


!
z DeepseekOCR2ForCausalLM.__init__r7   c                 K   s`   | dd }| dd }| dd }|d u st| dkr!d S | jj}td|||d|idS )Nr   r~   r   r   r:   )typedatar   r~   resolve_bindings)popr   sumitemr   r9   r+   )r4   r7   r   r~   r   r:   r5   r5   r6   _parse_and_validate_image_input>  s   z7DeepseekOCR2ForCausalLM._parse_and_validate_image_inputimage_tensorc                 C   s6   |  |}| |}| |}|j\}}}|d|S )Nr   )r   r   r   shapeview)r4   r   global_features_1global_features_2features_hwr   r5   r5   r6   _encode_global_featuresS  s
   


z/DeepseekOCR2ForCausalLM._encode_global_featurespatchesc                 C   sL   t | dkrd S | |}| |}| |}|j\}}}|d|S )Nr   r   )r   r   r   r   r   r   r   r   )r4   r   local_featuresr   r   r   r5   r5   r6   _encode_local_features]  s   


z.DeepseekOCR2ForCausalLM._encode_local_featuresr   r   r~   c                 C   s   g }|d d df dk|d d df dkB }t ||jddd}|| }t|dD ]@}|| }||g }	| |	}
| |}|d urZt j	||
| j
d d d f gdd}nt j	|
| j
d d d f gdd}|| q/|S )Nr   r(   r   r   )r   r   r   splittolistranger   r   r   catr   append)r4   r   r   r~   images_in_this_batchr   r   jdxr   	image_origlobal_featuresr   combinedr5   r5   r6   _pixel_values_to_embeddingj  s&   (


z2DeepseekOCR2ForCausalLM._pixel_values_to_embeddingimage_inputc                 C   s0   |j }|j}|jjtjd}| j|||d}|S )Nr   )r   r   r~   )r   r   r~   tor   longr   )r4   r   r   r   r~   vision_featuresr5   r5   r6   _process_image_input  s   z,DeepseekOCR2ForCausalLM._process_image_inputc                 K   s*   | j di |}|d u rd S | |}|S )Nr5   )r   r  )r4   r7   r   vision_embeddingsr5   r5   r6   embed_multimodal  s
   
z(DeepseekOCR2ForCausalLM.embed_multimodal	input_ids	positionsintermediate_tensorsinputs_embedsc                 K   s"   |d urd }| j ||||d}|S )N)r	  )r   )r4   r  r  r  r	  r7   hidden_statesr5   r5   r6   forward  s   zDeepseekOCR2ForCausalLM.forwardr
  c                 C   s   | j |S r0   )r   compute_logits)r4   r
  r5   r5   r6   r    s   z&DeepseekOCR2ForCausalLM.compute_logitsweightsc                 C   s   t | }|j|| jd}|S )N)mapper)r   load_weightshf_to_vllm_mapper)r4   r  loaderautoloaded_weightsr5   r5   r6   r    s   z$DeepseekOCR2ForCausalLM.load_weightsc                 C   s   t jddddgdS )z<
        Get the module prefix in multimodal models
        r   r   r   r   )r   	connectortower_model)r   from_string_fieldr3   r5   r5   r6   get_mm_mapping  s
   z&DeepseekOCR2ForCausalLM.get_mm_mapping)NN)!r[   r\   r]   r   r  classmethodr_   r`   r   r   r   r^   r+   r   r   Tensorr   r   r   r   r  r	   r  r    r  r  r   tuplesetr  r   r  __classcell__r5   r5   r   r6   r      sl    <


 


$r   )K__doc__rI   collections.abcr   r   r   	functoolsr   r   torch.nnr   transformersr   vllm.configr   vllm.config.multimodalr   %vllm.model_executor.models.interfacesr	   r
   r   r   )vllm.model_executor.models.module_mappingr    vllm.model_executor.models.utilsr   r   r   r   vllm.multimodalr   vllm.multimodal.inputsr   r   r   r   vllm.multimodal.parser   r   r   r   vllm.multimodal.processingr   r   r   r   r   vllm.sequencer    vllm.tokenizersr!   ,vllm.transformers_utils.configs.deepseek_vl2r"   /vllm.transformers_utils.processors.deepseek_ocrr#   r$   r%   *transformers_utils.processors.deepseek_ocrr'   deepencoderr)   deepencoder2r*   deepseek_ocrr+   deepseek_vl2r,   r>   r   r/   rb   ro   register_processorModuler   r5   r5   r5   r6   <module>   sP   
9
N