o
    پiy\                     @   s   d dl Z d dlmZ d dlmZmZmZmZ d dlZd dl	m
Z
mZ d dlmZmZmZmZ dd ZG dd	 d	eZeG d
d deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZeee dS )    N)	dataclass)DictListOptionalTuple)ImageImageOps)AutoProcessorLlamaTokenizerFastPretrainedConfigProcessorMixinc                 C   s   | \}}d }d}t d}|D ]=\}}t|| || }	t||	 t||	 }
}t|
| || }|| | }||ksC||krK||k rK|}|}||f}q|S )Nr   inf)floatminint)
image_sizecandidate_resolutionsoriginal_widthoriginal_heightbest_fitmax_effective_resolutionmin_wasted_resolutionwidthheightscaledownscaled_widthdownscaled_heighteffective_resolutionwasted_resolution r   R/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/configs/deepseekvl2.pyselect_best_resolution   s*   r!   c                   @   s4   e Zd Zdd Zdd Zdd Zdd Zd	d
 ZdS )
DictOutputc                 C   
   | j  S N)__dict__itemsselfr   r   r    r&   ,      
zDictOutput.itemsc                 C   r#   r$   )r%   keysr'   r   r   r    r*   /   r)   zDictOutput.keysc                 C   s
   | j | S r$   r%   )r(   itemr   r   r    __getitem__2   r)   zDictOutput.__getitem__c                 C   s
   || j v S r$   r+   )r(   keyr   r   r    __contains__5   r)   zDictOutput.__contains__c                 C   s   || j |< d S r$   r+   )r(   r.   valuer   r   r    __setitem__8   s   zDictOutput.__setitem__N)__name__
__module____qualname__r&   r*   r-   r/   r1   r   r   r   r    r"   +   s    r"   c                   @   sH   e Zd ZU ejed< ejed< ejed< ejed< ejed< dd ZdS )	VLChatProcessorOutput	input_ids
target_idspixel_valuesimages_seq_maskimages_spatial_cropc                 C   s
   t | jS r$   )lenr6   r'   r   r   r    __len__F   r)   zVLChatProcessorOutput.__len__N)	r2   r3   r4   torch
LongTensor__annotations__Tensor
BoolTensorr<   r   r   r   r    r5   <   s   
 



r5   c                	   @   sV   e Zd Z			ddeeeeef  deeeeef  defddZdejfd	d
Z	dS )ImageTransform      ?rD   rD   Tmeanstd	normalizec              
   C   sv   || _ || _|| _zdd lm} W n ty" } ztd|d }~ww | g}|r3|||| |	|| _
d S )Nr   zMPlease install torchvision via `pip install torchvision` to use Deepseek-VL2.)rE   rF   rG   torchvision.transforms
transformsImportErrorToTensorappend	NormalizeCompose	transform)r(   rE   rF   rG   Terrtransform_pipelinesr   r   r    __init__K   s"   
zImageTransform.__init__pil_imgc                 C   s   |  |}|S r$   )rO   )r(   rT   xr   r   r    __call__f   s   
zImageTransform.__call__N)rC   rC   T)
r2   r3   r4   r   r   r   boolrS   r   rV   r   r   r   r    rB   J   s    
rB   c                       s  e Zd ZdZdgZ										d>ded
eeeef  dededeeeef deeeef de	de
de
de	de
de	def fddZd?ddZedd Zedd Zedd  Zd@d!e
d"e	d#e	fd$d%Zd&ee d'e
fd(d)Z	*	*	*			+	dAd,e
d-eee
e
f  d.eej d/e	d0e	d1e
d2efd3d4Zd*d*d*ddd+dd5d,e
d-eee
e
f  d.eej d/e	d0e	d1e
d2efd6d7Zd8d9 Z				dBd:e
d.eej d"e	d#e	d;e	d2efd<d=Z  ZS )CDeepseekVLV2Processor)LlamaTokenizerr
   	tokenizerrC   T<image>   <｜▁pad▁｜>Fdeepseekr   
patch_sizedownsample_ratio
image_mean	image_stdrG   image_token	pad_tokenadd_special_token
sft_formatmask_prompt	ignore_idc                    s  || _ |d d | _|| _|| _|| _|| _|| _t|||d| _|| _	d| j	_
|jd u r5| j	d|	i | j	j|}|d u rM|g}d|i}| j	| | j	j|| _g d}d|i}| j	| ddg}d|i}| j	| || _|	| _|
| _|| _|| _|| _t j|fi | d S )	Nr   )rE   rF   rG   leftrd   additional_special_tokens)z<|ref|>z<|/ref|>z<|det|>z<|/det|>z<|grounding|>z<|User|>z<|Assistant|>)r   r   r_   ra   rb   rG   r`   rB   image_transformrZ   padding_siderd   add_special_tokensvocabgetimage_token_idrc   re   rf   rg   rh   superrS   )r(   rZ   r   r_   r`   ra   rb   rG   rc   rd   re   rf   rg   rh   kwargsrp   special_tokensspecial_tokens_dict	__class__r   r    rS   o   sJ   

zDeepseekVLV2Processor.__init__c                 C   s   g }g }g }g }g }d}	| | j}
| j|||	|	|
  ddt|dk|d\}}}}|
}	||7 }| jr?|| jgt| 7 }n||7 }||7 }||7 }||7 }t|t|kseJ dt| dt| |||||fS )zKplay the role of format_messages_v2 and get_images_info in the last versionr   T   )boseoscroppingmax_req_input_lenz+format_messages_v2: tokenized_str's length ) is not equal to imags_seq_mask's length )countrc   tokenize_with_imagesr;   rg   rh   )r(   messages
pil_imagesr|   tokenized_datamasked_tokenized_dataimages_listr9   r:   image_indeximage_token_cnttokenized_strimagesseq_maskspatial_cropr   r   r    format_messages_v2   sD   
	z(DeepseekVLV2Processor.format_messages_v2c                 C      | j jS r$   )rZ   bos_token_idr'   r   r   r    bos_id      zDeepseekVLV2Processor.bos_idc                 C   r   r$   )rZ   eos_token_idr'   r   r   r    eos_id   r   zDeepseekVLV2Processor.eos_idc                 C   r   r$   )rZ   pad_token_idr'   r   r   r    pad_id   r   zDeepseekVLV2Processor.pad_idtextry   rz   c                 C   s4   | j j|dd}|r| jg| }|r|| jg }|S )NF)rm   )rZ   encoder   r   )r(   r   ry   rz   tr   r   r    r      s   zDeepseekVLV2Processor.encoder   returnc                 K   s   | j j|fi |S r$   )rZ   decode)r(   r   rr   r   r   r    r      s   zDeepseekVLV2Processor.decodeN promptconversationsr   apply_sft_formatinference_modesystem_promptr|   c                 K   sv  |du s|du sJ d|  |||\}	}
}}}t|	t|  kr(t|
ks=n J dt|	 dt|
 dt| dt|	}t|
}tj|tjd}| j||dk || jkB < | j||dk < |r|d	 | j	ksmJ |dd	 }|dd	 }|dd	 }t|dkrt
d
d| j| jf}tj
dtjd}ntj|dd}tj|tjd}tj|gdd}t|||||d}|S )a  

        Args:
            prompt (str): the formatted prompt;
            conversations (List[Dict]): conversations with a list of messages;
            images (List[ImageType]): the list of images;
            apply_sft_format (bool): if prompt is not None, then apply the SFT format to prompt;
                if conversations is not None, then it will always apply the SFT format to conversations;
            inference_mode (bool): if True, then remove the last eos token;
            system_prompt (str): the system prompt;
            **kwargs:

        Returns:
            outputs (BaseProcessorOutput): the output of the processor,
                - input_ids (torch.LongTensor): [N + image tokens]
                - target_ids (torch.LongTensor): [N + image tokens]
                - images (torch.FloatTensor): [n_images, 3, H, W]
                - image_id (int): the id of the image token
                - num_image_tokens (List[int]): the number of image tokens
        Nz9prompt and conversations cannot be used at the same time.ztokenized_str's length z, input_ids' length z, imags_seq_mask's length z, are not equal)dtyper   rw         )r   rx   )dim)r6   r7   r8   r9   r:   )r   r;   r=   r>   tensorrW   rh   rp   r   r   zerosr   longstackr5   )r(   r   r   r   r   r   r   r|   rr   r   masked_tokenized_strr   r9   r:   r6   r7   preparer   r   r    process_one   sT   !	$

z!DeepseekVLV2Processor.process_oner   r   r   r   r   r   r|   c          
   	   K   s   | j |||||||d}	|	S )Nr   )r   )
r(   r   r   r   r   r   r   r|   rr   r   r   r   r    rV   M  s   
zDeepseekVLV2Processor.__call__c                 C   s,   g }t |D ]\}}||kr|| q|S r$   )	enumeraterL   )r(   r   target_valueindicesindexr,   r   r   r    find_all_indicese  s   
z&DeepseekVLV2Processor.find_all_indicesconversationr{   c                 C   s  g g g }}}	| | j}
g }t|
|D ]\}}	 | j|ddd}||7 }|dgt| 7 }	 |r<t|j| j\}}n| j| j}}	 t	j
|| j| jftdd | jjD d}|| | 	 t	j
|||ftdd | jjD d}td|| jD ]!}td|| jD ]}|| ||||| j || j f qq|	 || j || j }}|	||g 	 t| j| j | j  }}| jg| |d  }|| jg7 }|| jg||  || d  7 }||7 }|d	gt| 7 }q	 | j|
d
 ddd}|d
kr!|t|t| d k r!|t| d d }|d| }|d| }||7 }|dgt| 7 }	 |r=| jg| }dg| }|rK|| jg }|dg }t|t|ksbJ dt| dt| ||||	fS )z Tokenize text with <image> tags.F)ry   rz   c                 s       | ]	}t |d  V  qdS    Nr   .0rU   r   r   r    	<genexpr>      z=DeepseekVLV2Processor.tokenize_with_images.<locals>.<genexpr>)colorc                 s   r   r   r   r   r   r   r    r     r   r   r   Trw      Nz2tokenize_with_images func: tokenized_str's length r}   )splitrc   zipr   r;   r!   sizer   r   r   padtuplerk   rE   rL   rangecropmathceilr_   r`   rp   r   r   )r(   r   r   ry   rz   r{   r|   r   r9   r:   text_splitsr   text_sepimagetokenized_sep
best_widthbest_heightglobal_view
local_viewijnum_width_tilesnum_height_tileshwtokenized_imagerestr   r   r    r   l  s   


	




z*DeepseekVLV2Processor.tokenize_with_images)	rC   rC   Tr[   r\   Fr]   Tr^   )rw   )TF)NNNFTr   rw   )TTTrw   )r2   r3   r4   tokenizer_class
attributesr
   r   r   r   rW   strrS   r   propertyr   r   r   r   r   r   r   r   r   rV   r   r   __classcell__r   r   ru   r    rX   k   s    	

D)




[	
rX   c                       s*  e Zd ZU dZeed< dZeed< dZeed< dZ	eed< d	Z
eed
< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< 													ddededed
ededededededededef fddZ  ZS ) DeepseekVL2VisionEncoderConfigvision
model_typesiglip_large_patch16_384
model_name  r      r_   r   r      layersheads   	mlp_ratiomapglobal_poolTignore_headFclass_tokenr   num_classesuse_checkpointskipweight_initdeterministicnum_recomputing_layersc                    s^   || _ || _|| _|| _|| _|| _|| _|| _|	| _|
| _	|| _
|| _t jdi | d S Nr   )r   r   r_   r   r   r   r   r   r   r   r   r   rq   rS   )r(   r   r   r_   r   r   r   r   r   r   r   r   r   rr   ru   r   r    rS     s   z'DeepseekVL2VisionEncoderConfig.__init__)r   r   r   r   r   r   r   r   TFr   F)r2   r3   r4   r   r   r?   r   r   r   r_   r   r   r   r   r   r   rW   r   r   r   r   r   r   rS   r   r   r   ru   r    r     sl   
 	
r   c                       s   e Zd ZU dZdZeed< dZeed< dZ	eed< dZ
eed	< d
Zeed< dZeed< dZeed< 					
	ddededed	ededef fddZ  ZS )DeepseekVL2MlpProjectorConfigmlp_projectordownsample_mlp_geluprojector_type  	input_dim   n_embedrx   depthr   r   r`   Ftoken_poolingc                    s:   || _ || _|| _|| _|| _|| _t jdi | d S r   )r   r   r   r   r   r`   rq   rS   )r(   r   r   r   r   r   r`   rr   ru   r   r    rS     s   
z&DeepseekVL2MlpProjectorConfig.__init__)r   r   r   rx   r   rx   )r2   r3   r4   r   r   r   r?   r   r   r   r   r   r`   r   rW   rS   r   r   r   ru   r    r     s8   
 r   c                       sz   e Zd ZdZdgZ												
														
															
							d  fdd	Z  ZS )!DeepseekV2Configdeepseek_v2past_key_values      +           Nr         ?      @      greadyr   FsoftmaxMbP?Tsilur   {Gz?ư>順 顆      @        c*           +         s  || _ || _|| _|| _|| _|| _|| _|| _|	| _|
| _	|| _
|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|d u rT|}|| _|| _|| _t|| _|#| _|| _ |%| _!|&| _"|'| _#|(| _$|)| _%t& j'd| |!|"|$d|* d S )N)r   r   r   tie_word_embeddingsr   )(
vocab_sizemax_position_embeddingshidden_sizeintermediate_sizemoe_intermediate_sizenum_hidden_layersnum_attention_headsn_shared_expertsn_routed_expertsep_sizerouted_scaling_factorkv_lora_rankq_lora_rankqk_rope_head_dim
v_head_dimqk_nope_head_dimtopk_methodn_group
topk_groupnum_experts_per_tokmoe_layer_freqfirst_k_dense_replacenorm_topk_probscoring_funcaux_loss_alphaseq_auxnum_key_value_heads
hidden_actinitializer_ranger   rms_norm_epspretraining_tp	use_cache
rope_thetarope_scalingattention_biasattention_dropoutuse_mlarq   rS   )+r(   r  r  r  r  r  r  r-  r  r  r  r  r  r  r   r!  r"  r#  r$  r%  r&  r'  r(  r)  r*  r+  r,  r.  r  r/  r0  r2  r   r   r   r1  r  r3  r4  r5  r6  r7  rr   ru   r   r    rS   *  s^   -

zDeepseekV2Config.__init__))r   r   r   r   r  r  r  NNr   r  r  r  r  r  r  r  NNNr   r   Fr	  r
  Tr  r   r  r  TNr  r  r   Fr  NFr  T)r2   r3   r4   r   keys_to_ignore_at_inferencerS   r   r   r   ru   r    r   %  sZ    r   c                
       s   e Zd ZU dZeed< eed< eed< dZe	ed< dZ
e	ed< d	Zeeeef  ed
< 				dde	de	d
eeeef  f fddZ  ZS )DeepseekVL2Configdeepseek_vl_v2vision_configprojector_configlanguage_config2Dtile_tagheadglobal_view_pos)r   r   r   c                    s   t  jdi | |di }tdi || _|di }tdi || _|di }t|tr4|| _	ntdi || _	|| _
|| _|| _dg| _d S )Nr;  r<  r=  DeepseekVL2ForCausalLMr   )rq   rS   ro   r   r;  r   r<  
isinstancer   r=  r?  rA  r   architectures)r(   r?  rA  r   rr   r;  r<  r=  ru   r   r    rS     s   
zDeepseekVL2Config.__init__)r?  r@  rB  )r2   r3   r4   r   r   r?   r   r   r?  r   rA  r   r   r   rS   r   r   r   ru   r    r9    s$   
 r9  )r   dataclassesr   typingr   r   r   r   r=   PILr   r   transformersr	   r
   r   r   r!   objectr"   r5   rB   rX   r   r   r   r9  registerr   r   r   r    <module>   s&    !  k3e%