o
    پil                     @   s  d dl Z d dlmZ d dlmZmZmZmZmZ d dl	Z	d dl
mZmZ d dlmZmZmZmZ d dlmZ d dlmZ dZd	Zd
ZdZdZdZdZdZd
ZdZdZ dZ!dZ"e# Z$dee%ef fddZ&dZ'G dd de(Z)eG dd de)Z*G dd de(Z+dd Z,eed	dfd d!Z-G d"d# d#eZ.G d$d% d%eZ/G d&d' d'eZ0G d(d) d)eZ1ee.d*G d+d, d,eZ2e3e2e. dS )-    N)	dataclass)AnyDictListOptionalTuple)ImageImageOps)AutoProcessorLlamaTokenizerFastPretrainedConfigProcessorMixin)register_customized_processor)&DeepseekOCRNoRepeatNGramLogitProcessor     T      d   @   Fzdeepseek-ai/DeepSeek-OCR   Z   )i5 i6 returnc                   C   s   t tttdS )zMReturn default custom params for the DeepSeek-OCR n-gram no repeat processor.)
ngram_sizewindow_sizewhitelist_token_ids)NGRAM_NO_REPEAT_SIZENGRAM_NO_REPEAT_WINDOWlistNGRAM_NO_REPEAT_WHITELIST r    r    S/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/configs/deepseek_ocr.pyget_default_ngram_custom_params(   s   r"   z6<image>
<|grounding|>Convert the document to markdown.c                   @   s4   e Zd Zdd Zdd Zdd Zdd Zd	d
 ZdS )
DictOutputc                 C   
   | j  S N)__dict__itemsselfr    r    r!   r'   6      
zDictOutput.itemsc                 C   r$   r%   )r&   keysr(   r    r    r!   r+   9   r*   zDictOutput.keysc                 C   s
   | j | S r%   r&   )r)   itemr    r    r!   __getitem__<   r*   zDictOutput.__getitem__c                 C   s
   || j v S r%   r,   )r)   keyr    r    r!   __contains__?   r*   zDictOutput.__contains__c                 C   s   || j |< d S r%   r,   )r)   r/   valuer    r    r!   __setitem__B   s   zDictOutput.__setitem__N)__name__
__module____qualname__r'   r+   r.   r0   r2   r    r    r    r!   r#   5   s    r#   c                   @   sR   e Zd ZU ejed< ejed< ejed< ejed< ejed< ejed< dd Zd	S )
VLChatProcessorOutput	input_ids
target_idsimages_croppixel_valuesimages_seq_maskimages_spatial_cropc                 C   s
   t | jS r%   )lenr7   r(   r    r    r!   __len__Q   r*   zVLChatProcessorOutput.__len__N)	r3   r4   r5   torch
LongTensor__annotations__Tensor
BoolTensorr>   r    r    r    r!   r6   F   s   
 




r6   c                	   @   sV   e Zd Z			ddeeeeef  deeeeef  defddZdejfd	d
Z	dS )ImageTransform      ?rF   rF   Tmeanstd	normalizec              
   C   sv   || _ || _|| _zdd lm} W n ty" } ztd|d }~ww | g}|r3|||| |	|| _
d S )Nr   zMPlease install torchvision via `pip install torchvision` to use Deepseek-VL2.)rG   rH   rI   torchvision.transforms
transformsImportErrorToTensorappend	NormalizeCompose	transform)r)   rG   rH   rI   Terrtransform_pipelinesr    r    r!   __init__V   s"   
zImageTransform.__init__pil_imgc                 C   s   |  |}|S r%   )rQ   )r)   rV   xr    r    r!   __call__q   s   
zImageTransform.__call__N)rE   rE   T)
r3   r4   r5   r   r   floatboolrU   r   rX   r    r    r    r!   rD   U   s    
rD   c                 C   s|   t d}d}|| }|D ]/}|d |d  }	t| |	 }
|
|k r%|
}|}q|
|kr;|d| | |d  |d  kr;|}q|S )Ninf   r]   r   r]   rF   )rY   abs)aspect_ratiotarget_ratioswidthheight
image_sizebest_ratio_diff
best_ratioarearatiotarget_aspect_ratio
ratio_diffr    r    r!   find_closest_aspect_ratiov   s    rj   c                    s4  | j \}}|| }t fddt d D }t|dd d}t|||||}	||	d  }
||	d  }|	d |	d  }| |
|f}g }t|D ].}||
|  | ||
|  | ||
|  d | ||
|  d | f}||}|| qKt||ksJ |rt|dkr| ||f}|| ||	fS )Nc                 3   sX    | ]'}t d |d  D ]}t d |d  D ]}||  kr|| kr||fV  qqqdS )r]   N)range).0nijmax_nummin_numr    r!   	<genexpr>   s    
z%dynamic_preprocess.<locals>.<genexpr>r]   c                 S   s   | d | d  S )Nr   r]   r    )rW   r    r    r!   <lambda>   s    z$dynamic_preprocess.<locals>.<lambda>)r/   r   )	sizesetrk   sortedrj   resizecroprN   r=   )imagerr   rq   rc   use_thumbnail
orig_widthorig_heightr_   r`   rh   target_widthtarget_heightblocksresized_imgprocessed_imagesrn   box	split_imgthumbnail_imgr    rp   r!   dynamic_preprocess   s6   



r   c                       s  e Zd ZdZdgZ											d@ded
eeeef  dededeeeef deeeef de	de
de
de	de
de	dede	f fddZdAde
fddZedd Zedd  Zed!d" ZdBd#e
d$e	d%e	fd&d'Zd(ee d)e
fd*d+Z	,	,	,			-		dCd.e
d/eee
e
f  d0eej d1e	d2e	d3e
d4ed5e	fd6d7Zd,d,d,ddd-dd,d8d.e
d/eee
e
f  d0eej d1e	d2e	d3e
d4ed#ee
 fd9d:Zd;d< Z			dDd=e
d0eej d$e	d%e	d5e	f
d>d?Z  ZS )EDeepseekOCRProcessor)LlamaTokenizerr   	tokenizerrE   T<image>   <｜▁pad▁｜>Fdeepseekcandidate_resolutions
patch_sizedownsample_ratio
image_mean	image_stdrI   image_token	pad_tokenadd_special_token
sft_formatmask_prompt	ignore_id	ocr2_modec                    s*  || _ |d d | _|| _|| _|| _|| _|| _t| _t	|||d| _
|| _d| j_|jd u r8| jd|	i | jj|}|d u rP|g}d|i}| j| | jj|| _g d}d|i}| j| ddg}d|i}| j| || _|	| _|
| _|| _|| _|| _|| _t j|fi | d S )	Nr   )rG   rH   rI   leftr   additional_special_tokens)z<|ref|>z<|/ref|>z<|det|>z<|/det|>z<|grounding|>z<|User|>z<|Assistant|>)r   rc   r   r   r   rI   r   	BASE_SIZE	base_sizerD   image_transformr   padding_sider   add_special_tokensvocabgetimage_token_idr   r   r   r   r   r   superrU   )r)   r   r   r   r   r   r   rI   r   r   r   r   r   r   r   kwargsr   special_tokensspecial_tokens_dict	__class__r    r!   rU      sN   

zDeepseekOCRProcessor.__init__messagesc                 C   s   g }g }g }g }g }d}	| | j}
| j|||	|	|
  ddt|dkd\}}}}}}}|
}	||7 }||7 }|}||||||fS )zKplay the role of format_messages_v2 and get_images_info in the last versionr   Tr   )boseoscropping)countr   tokenize_with_imagesr=   )r)   r   
pil_imagesmax_req_input_lentokenized_datamasked_tokenized_dataimages_listr;   r<   image_indeximage_token_cntr7   imagesr9   seq_maskspatial_cropnum_image_tokensimage_shapesr    r    r!   format_messages_v2   sB   	
	z'DeepseekOCRProcessor.format_messages_v2c                 C      | j jS r%   )r   bos_token_idr(   r    r    r!   bos_id&     zDeepseekOCRProcessor.bos_idc                 C   r   r%   )r   eos_token_idr(   r    r    r!   eos_id*  r   zDeepseekOCRProcessor.eos_idc                 C   r   r%   )r   pad_token_idr(   r    r    r!   pad_id.  r   zDeepseekOCRProcessor.pad_idtextr   r   c                 C   s4   | j j|dd}|r| jg| }|r|| jg }|S )NF)r   )r   encoder   r   )r)   r   r   r   tr    r    r!   r   2  s   zDeepseekOCRProcessor.encoder   r   c                 K   s   | j j|fi |S r%   )r   decode)r)   r   r   r    r    r!   r   <  s   zDeepseekOCRProcessor.decodeN promptconversationsr   apply_sft_formatinference_modesystem_promptr   r   c	                 K   s   |p|}|  |||\}
}}}}}t|}t|dk}d}t|dkr-tdd |D }t|dkr?tdd| j| jf}ntj|dd}tj|gdd}t|
|||||d}||_	||_
|S )	a  

        Args:
            prompt (str): the formatted prompt;
            conversations (List[Dict]): conversations with a list of messages;
            images (List[ImageType]): the list of images;
            apply_sft_format (bool): if prompt is not None, then apply the SFT format to prompt;
                if conversations is not None, then it will always apply the SFT format to conversations;
            inference_mode (bool): if True, then remove the last eos token;
            system_prompt (str): the system prompt;
            **kwargs:

        Returns:
            outputs (BaseProcessorOutput): the output of the processor,
                - input_ids (torch.LongTensor): [N + image tokens]
                - target_ids (torch.LongTensor): [N + image tokens]
                - images (torch.FloatTensor): [n_images, 3, H, W]
                - image_id (int): the id of the image token
                - num_image_tokens (List[int]): the number of image tokens
        r   Fc                 s   s(    | ]}|d  dkp|d dkV  qdS )r   r]   Nr    )rl   ry   r    r    r!   rs   o  s    
z3DeepseekOCRProcessor.process_one.<locals>.<genexpr>r]      dim)r7   r8   r9   r:   r;   r<   )r   r?   r@   r=   anyzerosrc   stackr6   
has_imageshas_local_crops)r)   r   r   r   r   r   r   r   r   r   r7   masked_tokenized_strr   r;   r<   r9   r8   r   r   preparer    r    r!   process_one?  sB   !
z DeepseekOCRProcessor.process_one)r   r   r   r   r   r   r   r   c             	   K   sF   |d u st |tsJ |d ur|d }| j|p|||||||d}
|
S )Nr   )r   r   r   r   r   r   r   )
isinstancer   r   )r)   r   r   r   r   r   r   r   r   r   r   r    r    r!   rX     s   
zDeepseekOCRProcessor.__call__c                 C   s,   g }t |D ]\}}||kr|| q|S r%   )	enumeraterN   )r)   r   target_valueindicesindexr-   r    r    r!   find_all_indices  s   
z%DeepseekOCRProcessor.find_all_indicesconversationc           !      C   s  |}| | jt|ksJ || j}g g g g f\}}}	}
g }g }g }t||D ]\}}	 | j|ddd}||7 }|	dgt| 7 }	||j |jd dkr]|jd dkr]ddg}n|rht|t	d\}}nddg}	 | j
dkr}|s}|| j
| j
f}tj|| j| jftdd | jjD d	}|| | |\}}|
||g |dks|dkrtt|D ]}|| ||  q	 t| j
| j | j }t| j| j | j }| jrg }|dks|dkr|| jg|| | |  7 }|| jg||  7 }|| jg7 }n.| jg| | jg | }|| jg7 }|dks"|dkr4|| jg||  | jg ||  7 }||7 }|	d
gt| 7 }	|t| q)	 | j|d ddd}||7 }|	dgt| 7 }		 |rp| jg| }dg|	 }	|r~|| jg }|	dg }	t|t|	ksJ dt| dt|	 g }|D ]}|| jkr|| q|| j qt|t|	  krt|ksn J dt| dt| dt|	 dt|}t|}tj|	tjd}	| j||dk || jkB < | j ||dk < d
}|r|d | jksJ |dd }|dd }|	dd }	t|dkrFt!dd| j| jf}tj!dtj"d}
t!dd| j
| j
f#d} n+tj$|dd}tj|
tj"d}
|rctj$|dd#d} nt!dd| j
| j
f#d} |#d}||| |	|
||fS )z Tokenize text with <image> tags.F)r   r   r   r   r]   )rc   c                 s   s    | ]	}t |d  V  qdS )   N)int)rl   rW   r    r    r!   rs     s    z<DeepseekOCRProcessor.tokenize_with_images.<locals>.<genexpr>)colorTr   z2tokenize_with_images func: tokenized_str's length z) is not equal to imags_seq_mask's length ztokenized_str's length z, input_ids' length z, imags_seq_mask's length z, are not equal)dtypeNr   r\   r   )%r   r   r=   splitzipr   rN   ru   r   
IMAGE_SIZErc   rx   r	   padr   tupler   rG   rk   mathceilr   r   r   r   r   r   r   r?   r@   tensorrZ   r   r   long	unsqueezer   )!r)   r   r   r   r   r   text_splitsr   images_crop_listr;   r<   r   r   tokenized_strtext_seprz   tokenized_sep
crop_ratioimages_crop_rawglobal_viewnum_width_tilesnum_height_tilesrn   num_queriesnum_queries_basetokenized_imager   token_indexr7   r8   r   r:   r9   r    r    r!   r     s
  





(


z)DeepseekOCRProcessor.tokenize_with_images)
rE   rE   Tr   r   Fr   Tr   F)r   )TF)NNNFTr   r   T)TTT)r3   r4   r5   tokenizer_class
attributesr   r   r   rY   rZ   strrU   r   propertyr   r   r   r   r   r   r   r   r   r   rX   r   r   __classcell__r    r    r   r!   r      s    	
F(



	
M	

r   c                       s*  e Zd ZU dZeed< dZeed< dZeed< dZ	eed< d	Z
eed
< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< 													ddededed
ededededededededef fddZ  ZS ) VisionEncoderConfigvision
model_type#vit_so400m_patch14_siglip_384.webli
model_name  rc      r   r   ra      layersheads   	mlp_ratiomapglobal_poolTignore_headFclass_tokenr   num_classesuse_checkpointskipweight_initdeterministicnum_recomputing_layersc                    s^   || _ || _|| _|| _|| _|| _|| _|| _|	| _|
| _	|| _
|| _t jdi | d S Nr    )r  rc   r   ra   r  r  r  r  r  r  r  r  r   rU   )r)   r  rc   r   ra   r  r  r  r  r  r  r  r  r   r   r    r!   rU   j  s   zVisionEncoderConfig.__init__)r  r  r	  r   r
  r	  r  r  TFr   F)r3   r4   r5   r  r   rA   r  rc   r   r   ra   r  r  r  r  r  rZ   r  r  r  r  r  r  rU   r  r    r    r   r!   r  W  sl   
 	
r  c                       s   e Zd ZU dZdZeed< dZeed< dZ	eed< dZ
eed	< d
Zeed< dZeed< dZeed< 					
	ddededed	ededef fddZ  ZS )MlpProjectorConfigmlp_projectordownsample_mlp_geluprojector_type  	input_dim   n_embedr   depthr]   r  r   Ftoken_poolingc                    s:   || _ || _|| _|| _|| _|| _t jdi | d S r  )r  r  r!  r"  r  r   r   rU   )r)   r  r  r!  r"  r  r   r   r   r    r!   rU     s   
zMlpProjectorConfig.__init__)r  r  r   r   r]   r   )r3   r4   r5   r  r  r   rA   r  r   r!  r"  r  r   r#  rZ   rU   r  r    r    r   r!   r    s8   
 r  c                       sz   e Zd ZdZdgZ												
														
															
							d  fdd	Z  ZS )!DeepseekV2Configdeepseek_v2past_key_values      +    r       Nr]         ?      r      greadyr   FsoftmaxMbP?Tsilur   {Gz?ư>順 顆      @        c*           +         s  || _ || _|| _|| _|| _|| _|| _|| _|	| _|
| _	|| _
|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|d u rT|}|| _|| _|| _t|| _|#| _|| _ |%| _!|&| _"|'| _#|(| _$|)| _%t& j'd| |!|"|$d|* d S )N)r   r   r   tie_word_embeddingsr    )(
vocab_sizemax_position_embeddingshidden_sizeintermediate_sizemoe_intermediate_sizenum_hidden_layersnum_attention_headsn_shared_expertsn_routed_expertsep_sizerouted_scaling_factorkv_lora_rankq_lora_rankqk_rope_head_dim
v_head_dimqk_nope_head_dimtopk_methodn_group
topk_groupnum_experts_per_tokmoe_layer_freqfirst_k_dense_replacenorm_topk_probscoring_funcaux_loss_alphaseq_auxnum_key_value_heads
hidden_actinitializer_rangerY   rms_norm_epspretraining_tp	use_cache
rope_thetarope_scalingattention_biasattention_dropoutuse_mlar   rU   )+r)   r;  r=  r>  r?  r@  rA  rU  rB  rC  rD  rE  rF  rG  rH  rI  rJ  rK  rL  rM  rN  rO  rP  rQ  rR  rS  rT  rV  r<  rW  rX  rZ  r   r   r   rY  r:  r[  r\  r]  r^  r_  r   r   r    r!   rU     s^   -

zDeepseekV2Config.__init__))r'  r(  r)  r*  r   r+  r+  NNr]   r,  r-  r.  r   r/  r/  r0  NNNr]   r   Fr1  r2  Tr3  r   r4  r5  TNr6  r7  r]   Fr8  NFr9  T)r3   r4   r5   r  keys_to_ignore_at_inferencerU   r  r    r    r   r!   r$    sZ    r$  )processor_classc                
       s   e Zd ZU dZeed< eed< dZeed< dZ	eed< dZ
eeeef  ed	< eZee ed
< 			ddeded	eeeef  f fddZ  ZS )DeepseekVLV2Configzdeepseek-ocrvision_configprojector_config2Dtile_tagheadglobal_view_pos)r  r  r   customized_processor_typec                    s   t  jdi | |di }tdi || _|di }tdi || _|di }tdi || _|| _	|| _
|| _| jj| _| jj| _d S )Nrc  rd  language_configr    )r   rU   r   r  rc  r  rd  r$  text_configrf  rh  r   r;  r=  )r)   rf  rh  r   r   rc  rd  rk  r   r    r!   rU     s   
zDeepseekVLV2Config.__init__)rf  rg  ri  )r3   r4   r5   r  r  rA   r  rf  r   rh  r   r   r   r   rj  typer   rU   r  r    r    r   r!   rb    s$   
 rb  )4r   dataclassesr   typingr   r   r   r   r   r?   PILr   r	   transformersr
   r   r   r   3sglang.srt.multimodal.customized_mm_processor_utilsr   *sglang.srt.sampling.custom_logit_processorr   r   r   	CROP_MODE	MIN_CROPS	MAX_CROPSMAX_CONCURRENCYNUM_WORKERSPRINT_NUM_VIS_TOKENSSKIP_REPEAT
MODEL_PATHr   r   r   to_strDEFAULT_CUSTOM_LOGIT_PROCESSORr   r"   PROMPTobjectr#   r6   rD   rj   r   r   r  r  r$  rb  registerr    r    r    r!   <module>   sR    
!
.   &3d$