o
    
۾iR                     @   s  d Z ddlZddlmZmZmZ ddlmZmZ ddl	Z	ddl
mZ ddlmZmZ ddlmZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZmZmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z% ddl&m'Z'm(Z(m)Z)m*Z* ddl+m,Z,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2 ddl3m4Z4 ddl5m6Z6 ddl7m8Z8 ddl9m:Z:m;Z;m<Z<m=Z= ddl>m?Z?m@Z@ ddlAmBZBmCZC ddlDmEZEmFZF ddlGmHZH dZIdZJG dd de?ZKG dd dZLG dd  d eBZMG d!d" d"e.ZNG d#d$ d$e,eN ZOG d%d& d&e-eN ZPe jQePeNeOd'G d(d) d)ejReeeZSdS )*zFInference-only Deepseek-OCR model compatible with HuggingFace weights.    N)IterableMappingSequence)	AnnotatedLiteral)BatchFeatureCLIPVisionConfig)
VllmConfig)BaseDummyOptions)MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPP)MultiModelKeys)AutoWeightsLoaderWeightsMapperinit_vllm_registered_modelmaybe_prefix)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItemsNestedTensors)ImageEmbeddingItemsImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdate)SamplingParams)IntermediateTensors)cached_tokenizer_from_config)DeepseekVLV2Config)	BASE_SIZE	CROP_MODEDeepseekOCRProcessorcount_tiles)TensorSchemaTensorShape)AdapterLogitsProcessorRequestLogitsProcessor   )DeepCLIPVisionTransformerbuild_sam_vit_b)MlpProjector  <image>c                	   @   sz   e Zd ZU dZed ed< eeje	dddddhdf ed	< eeje	ddd
d
dhdf ed< eeje	ddf ed< dS )DeepseekOCRImagePixelInputsz
    Dimensions:
        - b: Batch size
        - n: Number of images
        - p: Number of patches
        - base_size: Base size of the processor
        - image_size: Image size of the processor
    pixel_valuestypebn   	base_sizebnp)dynamic_dimsdata
image_sizeimages_crop   images_spatial_cropN)
__name__
__module____qualname____doc__r   __annotations__r   torchTensorr+    rH   rH   [/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/deepseek_ocr.pyr4   H   s   
 	r4   c                   @   sL   e Zd Z	ddededee dB fddZdee dejd	ejfd
dZ	dS )NoRepeatNGramLogitsProcessorN
ngram_sizewindow_sizewhitelist_token_idsc                 C   s   || _ || _|p
t | _d S N)rK   rL   setrM   )selfrK   rL   rM   rH   rH   rI   __init___   s   z%NoRepeatNGramLogitsProcessor.__init__
output_idslogitsreturnc           	      C   s   t || jk r	|S t|| jd  d  }tdt || j }t || j d }t }t||D ]}t|||| j  }|d d |krL||d  q0|| j }|r]t	d |t
|< |S )Nr.   r   inf)lenrK   tuplemaxrL   rO   rangeaddrM   floatlist)	rP   rR   rS   current_prefixsearch_start
search_endbanned_tokensingramrH   rH   rI   __call__i   s   
z%NoRepeatNGramLogitsProcessor.__call__rN   )
rA   rB   rC   intrO   rQ   r]   rF   rG   rd   rH   rH   rH   rI   rJ   ^   s     


rJ   c                   @   sF   e Zd ZdZedefddZdefddZdede	dB fd	d
Z
dS )NGramPerReqLogitsProcessorzgExample of overriding the wrapper class `__init__()` in order to utilize
    info about the device typeparamsc                 C   s   |j o|j d}|j o|j dd}|j o|j dd }|d u r#d S t|tr,|dkr4td| dt|tr=|dkrEtd| d|d urVt|tsXtd	| dd S d S )
NrK   rL   d   rM   r   z8`ngram_size` has to be a strictly positive integer, got .z9`window_size` has to be a strictly positive integer, got z<`whitelist_token_ids` has to be a sequence of integers, got )
extra_argsget
isinstancere   
ValueErrorr   )clsrg   rK   rL   rM   rH   rH   rI   validate_params   s4   

z*NGramPerReqLogitsProcessor.validate_paramsrT   c                 C   s   dS )NFrH   rP   rH   rH   rI   is_argmax_invariant   s   z.NGramPerReqLogitsProcessor.is_argmax_invariantNc                 C   sd   |j o|j d}|j o|j dd}|j o|j dd }|d u r#d S |r)t|nd }t|||dS )NrK   rL   rh   rM   )rK   rL   rM   )rj   rk   rO   rJ   )rP   rg   rK   rL   rM   rH   rH   rI   new_req_logits_processor   s   z3NGramPerReqLogitsProcessor.new_req_logits_processor)rA   rB   rC   rD   classmethodr"   ro   boolrq   r-   rr   rH   rH   rH   rI   rf      s    rf   c                	   @   sj   e Zd Zdd ZdefddZdeeedB f fdd	Z	d
ddedede
defddZdefddZdS )DeepseekOCRProcessingInfoc                 C   s   | j tS rN   )ctxget_hf_configr%   rp   rH   rH   rI   rw      s   z'DeepseekOCRProcessingInfo.get_hf_configkwargsc                 K   s,   t tttdd}| jjtfi i ||S )Nv1)r=   r9   	crop_modestrategy)dict
IMAGE_SIZEr&   r'   rv   get_hf_processorr(   )rP   rx   v1_processor_configrH   rH   rI   r~      s   
z*DeepseekOCRProcessingInfo.get_hf_processorrT   Nc                 C   s   dd iS )NimagerH   rp   rH   rH   rI   get_supported_mm_limits   s   z1DeepseekOCRProcessingInfo.get_supported_mm_limitsT)croppingimage_widthimage_heightr   c                C   s   t }t}d}d}tr#|dkr|dkrddg}nt||t d}|\}	}
nd }	}
t|| |  }}t|| |  }}||d  }|	dksK|
dkrV|
| |	| d  }nd}|| d S )N      r2   r.   )r=   r   )r}   r&   r'   r)   mathceil)rP   r   r   r   r=   r9   
patch_sizedownsample_ratio
crop_rationum_width_tilesnum_height_tileshwh2w2global_views_tokenslocal_views_tokensrH   rH   rI   get_num_image_tokens   s&   

z.DeepseekOCRProcessingInfo.get_num_image_tokensc                 C   s(   t dkrtdkrtdddS tdddS )N   i   i   )widthheight)r}   r&   r   rp   rH   rH   rI   !get_image_size_with_most_features   s   z;DeepseekOCRProcessingInfo.get_image_size_with_most_features)rA   rB   rC   rw   objectr~   r   strre   r   rt   r   r   r   rH   rH   rH   rI   ru      s    
!ru   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )DeepseekOCRDummyInputsBuilder	mm_countsrT   c                 C   s$   | dd}| j }|j}|| S )Nr   r   )rk   infor~   image_token)rP   r   
num_images	processorr   rH   rH   rI   get_dummy_text   s   
z,DeepseekOCRDummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc                 C   s.   | dd}| j }d| j|j|j|diS )Nr   r   )r   r   r   )rk   r   r   _get_dummy_imagesr   r   )rP   r   r   r   r   max_image_sizerH   rH   rI   get_dummy_mm_data   s   
z/DeepseekOCRDummyInputsBuilder.get_dummy_mm_datarN   )
rA   rB   rC   r   r   re   r   r
   r   r   rH   rH   rH   rI   r      s    
r   c                
   @   s   e Zd Zdedeeef deeef deeef def
ddZded	eeef deeef fd
dZ	de
d	eeef dedee fddZdS )DeepseekOCRMultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsrT   c                 C   sR   |r| j j| j jdi |tdd|i||}|S | j  }||ddd}|S )Nr   Tpt)add_special_tokensreturn_tensorsrH   )r   rv   call_hf_processorr~   r|   get_tokenizer)rP   r   r   r   r   processed_outputs	tokenizerrH   rH   rI   _call_hf_processor  s   
z1DeepseekOCRMultiModalProcessor._call_hf_processor	hf_inputshf_processor_mm_kwargsc                 C   sr   | dtd}|d d df dk|d d df dkB }t||jddd}ttdtdtd|dS )	Nr@   )r   r?   r   r.   rU   dimr   )r5   r@   r>   )	rk   rF   emptywhereprodr|   r   batchedflat_from_sizes)rP   r   r   r@   is_tiledpatches_per_imagerH   rH   rI   _get_mm_fields_config*  s   (z4DeepseekOCRMultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc                    sN   j jdi |}|j t tsJ dtf fdd}td g|dgS )Nitem_idxc                    sP    dttf}t|tr|| }n|| }jj|j|j	t
d} g| S )Nr   )r   r   r   )	get_itemsr   r   rl   get_feature_sizeget_image_sizer   r   r   r   r'   )r   imagesnum_image_tokenssizeimage_token_idr   rP   rH   rI   get_replacement_deepseek_vl2E  s   


zXDeepseekOCRMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_deepseek_vl2r   )modalitytargetreplacementrH   )r   r~   r   rl   re   r    )rP   r   r   r   hf_processorr   rH   r   rI   _get_prompt_updates:  s   z2DeepseekOCRMultiModalProcessor._get_prompt_updatesN)rA   rB   rC   r   r   r   r   r   r   r   r   r   r   r!   r   rH   rH   rH   rI   r     s8    







r   )r   dummy_inputsc                       sx  e Zd ZedddddddZeded	ed
edB fddZddde	def fddZ
ded
edB fddZdejd
ejfddZdejdejd
ejdB fddZdejdejdejd
efd d!Zd"ed
ejfd#d$Zded
edB fd%d&Z		d5d'ejdB d(ejd)edB d*ejdB def
d+d,Zd-ejd
ejdB fd.d/Zd0eeeejf  d
ee fd1d2Zd
efd3d4Z  Z S )6DeepseekOCRForCausalLMz"language_model.model.embed_tokens.zlanguage_model.model.layers.zlanguage_model.model.norm.zlanguage_model.lm_head. )zmodel.embed_tokens.zmodel.layers.zmodel.norm.zlm_head.zmodel.)orig_to_new_prefixr   rb   rT   Nc                 C   s   | drdS td)Nr   r3   z Only image modality is supported)
startswithrm   )rn   r   rb   rH   rH   rI   get_placeholder_strq  s   
z*DeepseekOCRForCausalLM.get_placeholder_str)prefixvllm_configr   c                   s  t    |jj}|j}|jj}|| _|| _|j| _|j| _|j	| _	|j}t
|}|jt | _| |df t | _tdddddddd	d
}t||t|dd| _t| j| _|j| _|j| _| jj}	dttj|	tjd }
| jdkrtt |	|
 | _!tt |	|
 | _"nt#d| j W d    n1 sw   Y  | $| t%|| j	t|dd| _&W d    n1 sw   Y  | j&j'| _'d S )Nr   r   i   r            i   gh㈵>)hidden_sizeintermediate_sizenum_attention_headsnum_hidden_layersr=   r   projection_dimlayer_norm_epsvision_model)configquant_configr   r.   dtype2Dz.Only 2D tile_tag is supported currently, got: language_model)r   	hf_configr   )(superrQ   model_configr   r   multimodal_configr   vision_configprojector_configtext_configr$   vocab_IMAGE_TOKENr   _mark_tower_modelr0   	sam_modelr   r/   r   r   r1   	projectortile_tagglobal_view_posn_embedrF   sqrttensorfloat32nn	Parameterrandnimage_newlineview_seperatorrm   _mark_language_modelr   r   make_empty_intermediate_tensors)rP   r   r   r   r   r   r   r   clip_vision_configr   	embed_std	__class__rH   rI   rQ   x  sf   



#
zDeepseekOCRForCausalLM.__init__rx   c                 K   s`   | dd }| dd }| dd }|d u st| dkr!d S | jj}td|||d|idS )Nr5   r@   r>   r   r9   )r6   r<   r>   r@   resolve_bindings)poprF   sumitemr   r=   r4   )rP   rx   r5   r@   r>   r9   rH   rH   rI   _parse_and_validate_image_input  s   z6DeepseekOCRForCausalLM._parse_and_validate_image_inputimage_tensorc           
      C   s   |  |}| ||}tj|d d dd f |ddddfdd}| |}|j\}}}t|d }|	|||}| j
d d d d f |d|}	tj||	gdd}|	d|S )Nr.   r?   r   rU   r         ?)r   r   rF   catflattenpermuter   shapere   viewr   expand)
rP   r  global_features_1global_features_2features_hwr   sidenewlinerH   rH   rI   _encode_global_features  s   

z.DeepseekOCRForCausalLM._encode_global_featurespatches
crop_shapec                 C   s  t | dkrd S | |}| ||}t j|d d dd f |ddddfdd}| |}|j	\}}}t
|d }	t
|d  }
t
|d  }|||
|	|	|ddddd||	 |
|	 |}| jd d d d f ||	 d|}t j||gdd}|d|S )	Nr   r.   r?   rU   r   r  r8   r   )rF   r  r	  r   r   r  r  r  r   r  re   r  reshaper   r  )rP   r  r  local_features_1local_features_2r  r  r  r   
patch_sidewidth_tilesheight_tilesr  rH   rH   rI   _encode_local_features  s0   


z-DeepseekOCRForCausalLM._encode_local_featuresr5   r>   r@   c                 C   s   g }|d d df dk|d d df dkB }t ||jddd}|| }t|dD ]E}|| }||g }	|| }
| |	}| ||
}|d ur_t j	||| j
d d d f gdd}nt j	|| j
d d d f gdd}|| q/|S )Nr   r.   rU   r   )rF   r   r   splittolistrZ   r   r  r#  r  r   append)rP   r5   r>   r@   images_in_this_batchr   r   jdxr  	image_orir  global_featureslocal_featurescombinedrH   rH   rI   _pixel_values_to_embedding  s(   (

z1DeepseekOCRForCausalLM._pixel_values_to_embeddingimage_inputc                 C   s0   |j }|j}|jjtjd}| j|||d}|S )Nr   )r5   r>   r@   )r<   r>   r@   torF   longr-  )rP   r.  r5   r>   r@   vision_featuresrH   rH   rI   _process_image_input%  s   z+DeepseekOCRForCausalLM._process_image_inputc                 K   s*   | j di |}|d u rd S | |}|S )NrH   )r
  r2  )rP   rx   r.  vision_embeddingsrH   rH   rI   embed_multimodal4  s
   
z'DeepseekOCRForCausalLM.embed_multimodal	input_ids	positionsintermediate_tensorsinputs_embedsc                 K   s"   |d urd }| j ||||d}|S )N)r8  )r   )rP   r5  r6  r7  r8  rx   hidden_statesrH   rH   rI   forward;  s   zDeepseekOCRForCausalLM.forwardr9  c                 C   s   | j |S rN   )r   compute_logits)rP   r9  rH   rH   rI   r;  L  s   z%DeepseekOCRForCausalLM.compute_logitsweightsc                 C   s   t | }|j|| jd}|S )N)mapper)r   load_weightshf_to_vllm_mapper)rP   r<  loaderautoloaded_weightsrH   rH   rI   r>  R  s   z#DeepseekOCRForCausalLM.load_weightsc                 C   s   t jddddgdS )z<
        Get the module prefix in multimodal models
        r   r   r   r   )r   	connectortower_model)r   from_string_fieldrp   rH   rH   rI   get_mm_mappingW  s
   z%DeepseekOCRForCausalLM.get_mm_mapping)NN)!rA   rB   rC   r   r?  rs   r   re   r   r	   rQ   r   r4   r
  rF   rG   r  r#  r   r-  r2  r   r4  r#   r:  r;  r   rX   rO   r>  r   rE  __classcell__rH   rH   r  rI   r   _  sx    @

#
!


$r   )TrD   r   collections.abcr   r   r   typingr   r   rF   torch.nnr   transformersr   r   vllm.configr	   vllm.config.multimodalr
   %vllm.model_executor.models.interfacesr   r   r   r   )vllm.model_executor.models.module_mappingr    vllm.model_executor.models.utilsr   r   r   r   vllm.multimodalr   vllm.multimodal.inputsr   r   r   r   vllm.multimodal.parser   r   r   r   vllm.multimodal.processingr   r   r   r    r!   vllm.sampling_paramsr"   vllm.sequencer#   vllm.tokenizersr$   ,vllm.transformers_utils.configs.deepseek_vl2r%   /vllm.transformers_utils.processors.deepseek_ocrr&   r'   r(   r)   vllm.utils.tensor_schemar*   r+   vllm.v1.sample.logits_processorr,   r-   deepencoderr/   r0   deepseek_vl2r1   r}   r   r4   rJ   rf   ru   r   r   register_processorModuler   rH   rH   rH   rI   <module>   sR   &89
N