o
    پiJ                     @   s  d dl mZ d dlmZmZmZmZ d dlZd dl	Z	d dl
Z
d dlmZ d dlmZmZmZmZmZmZ d dlmZ d dlmZmZ d dlmZ G d	d
 d
eZG dd deZG dd deZG dd deZG dd deZ G dd deZ!eG dd dZ"G dd deZ#G dd deZ$G dd de%Z&eG dd de&Z'eG dd  d e&Z(G d!d" d"eZ)G d#d$ d$eZ*ee#e) ee#e$ dS )%    )	dataclass)DictListTupleUnionN)Image)BaseImageProcessorBatchFeatureLlamaConfigLlamaTokenizerFastPretrainedConfigProcessorMixin)to_numpy_array)register_image_processorregister_processor)expand2squarec                       s   e Zd Z fddZ  ZS )DictToObjectc                    sB   t | | | D ]\}}t|trt|}t| || qd S N)super__init__items
isinstancedictr   setattr)self
dictionarykeyvalue	__class__ P/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/configs/janus_pro.pyr      s   
zDictToObject.__init__)__name__
__module____qualname__r   __classcell__r    r    r   r!   r      s    r   c                       2   e Zd ZU dZdZeed< i Z fddZ  Z	S )VisionConfigvision clsc                    H   t  jdi | |dd| _t| jts| jj| _|di | _d S Nr*   r)   paramsr    r   r   getr*   r   strr"   r-   r   kwargsr   r    r!   r   (   
   
zVisionConfig.__init__
r"   r#   r$   
model_typer*   r0   __annotations__r-   r   r%   r    r    r   r!   r'   #   
   
 r'   c                       r&   )GenAlignerConfiggen_alignerr)   r*   c                    r+   r,   r.   r1   r   r    r!   r   7   r3   zGenAlignerConfig.__init__r4   r    r    r   r!   r8   2   r7   r8   c                       r&   )GenHeadConfiggen_headr)   r*   c                    r+   r,   r.   r1   r   r    r!   r   F   r3   zGenHeadConfig.__init__r4   r    r    r   r!   r:   A   r7   r:   c                       r&   )AlignerConfigalignerr)   r*   c                    r+   r,   r.   r1   r   r    r!   r   U   r3   zAlignerConfig.__init__r4   r    r    r   r!   r<   P   r7   r<   c                       r&   )GenVisionConfig
gen_visionr)   r*   c                    r+   r,   r.   r1   r   r    r!   r   d   r3   zGenVisionConfig.__init__r4   r    r    r   r!   r>   _   r7   r>   c                   @   s   e Zd ZU dZeed< dZeeeeeef ef ed< dZ	eed< dZ
eed< d	Zeeeef ef ed
< dZeed< dZeed< dZeed< dZeed< dZeed< dS )SigLIPVisionCfgi  width   layers   heads   
patch_sizeiP  
image_sizemapglobal_poolgZӼ@	mlp_ratioFclass_tokenr   num_classesuse_checkpointN)r"   r#   r$   rA   intr6   rC   r   r   rE   rG   rH   rJ   r0   rK   floatrL   boolrM   rN   r    r    r    r!   r@   n   s   
  r@   c                       sR   e Zd ZU dZeed< eed< eed< eed< e	ed< e
ed<  fdd	Z  ZS )
MultiModalityConfigmulti_modalityvision_configaligner_configgen_vision_configgen_aligner_configgen_head_configlanguage_configc                    s   t  jdi | |di }tdi || _|di }tdi || _|di }tdi || _|di }t	di || _
|di }tdi || _|di }t|tr_|| _d S tdi || _d S )NrT   rU   rV   rW   rX   rY   r    )r   r   r/   r'   rT   r<   rU   r>   rV   r8   rW   r:   rX   r   r
   rY   )r   r2   rT   rU   rV   rW   rX   rY   r   r    r!   r      s   

zMultiModalityConfig.__init__)r"   r#   r$   r5   r'   r6   r<   r>   r8   r:   r
   r   r%   r    r    r   r!   rR   |   s   
 rR   c                       s   e Zd ZdgZ					ddeded	eeeeef ee f d
eeeeef ee f dede	f fddZ
dedejfddZddedefddZedd Z  ZS )VLMImageProcessorpixel_valuesrF   g3<4'?gwgM?gy{ ?gB91?gwt.?g	U?p?TrH   min_size
image_mean	image_stdrescale_factordo_normalizec                    s`   t  jdi | || _|| _|| _|| _|| _|| _|d u r$d| _d S t	dd |D | _d S )N)   rd   rd   c                 S   s   g | ]}t |d  qS )   )rO   ).0xr    r    r!   
<listcomp>   s    z.VLMImageProcessor.__init__.<locals>.<listcomp>r    )
r   r   rH   rb   r`   ra   r_   rc   background_colortupler   rH   r_   r`   ra   rb   rc   r2   r   r    r!   r      s   
zVLMImageProcessor.__init__pil_imgreturnc                 C   s   |j \}}t||}tt|| | j | jtt|| | j | jg}|dks8|dks8|d dks8|d dkr<tdtjjj	dfdd}|||tjjj	dd}t
|| j}t|}t|d}|S )	z

        Args:
            pil_img (PIL.Image): [H, W, 3] in PIL.Image in RGB

        Returns:
            x (np.ndarray): [3, self.image_size, self.image_size]
        r      zInvalid size!Tc                 S   s   t |tr:| j\}}||kr||ks||kr||kr| S ||k r+|}t|| | }n
|}t|| | }||f}n|d |d f}| j|||rLd dS ddS )Nrn   r   g      @)resamplereducing_gap)r   rO   sizeresize)rl   rq   interpolation	antialiaswhowohr    r    r!   rr      s    

 

z(VLMImageProcessor.resize.<locals>.resize)rs   rt   )   r   rn   )rq   maxrO   rH   r_   
ValueErrorPILr   
ResamplingBICUBICr   ri   r   np	transpose)r   rl   rA   heightmax_sizerq   rr   rg   r    r    r!   rr      s    


(

zVLMImageProcessor.resizeptreturn_tensorsc                    sn   t |ts|g} fdd|D }dd |D } fdd|D } jr- fdd|D }d|i}t||dS )Nc                    s   g | ]}  |qS r    )rr   rf   imager   r    r!   rh      s    z0VLMImageProcessor.preprocess.<locals>.<listcomp>c                 S   s   g | ]
}|d ddf qS )N   .r    r   r    r    r!   rh      s    c                    s   g | ]} j | jd dqS )channels_first)r   scaleinput_data_format)rescalerb   r   r   r    r!   rh     s    c                    s"   g | ]} j | j jd dqS )r   )r   meanstdr   )	normalizer`   ra   r   r   r    r!   rh     s    r[   )datatensor_type)r   listrc   r	   )r   imagesr   r2   r   r    r   r!   
preprocess   s   



	zVLMImageProcessor.preprocessc                 C   s   d| j | j gS )Nr   )rH   r   r    r    r!   default_shape  s   zVLMImageProcessor.default_shaperF   r\   r]   r^   T)r   )r"   r#   r$   model_input_namesrO   r   r   rP   r   rQ   r   r   r   ndarrayrr   r0   r	   r   propertyr   r%   r    r    r   r!   rZ      s0    	 7 rZ   c                   @   s4   e Zd Zdd Zdd Zdd Zdd Zd	d
 ZdS )
DictOutputc                 C   
   | j  S r   )__dict__r   r   r    r    r!   r        
zDictOutput.itemsc                 C   r   r   )r   keysr   r    r    r!   r   "  r   zDictOutput.keysc                 C   s
   | j | S r   r   )r   itemr    r    r!   __getitem__%  r   zDictOutput.__getitem__c                 C   s
   || j v S r   r   )r   r   r    r    r!   __contains__(  r   zDictOutput.__contains__c                 C   s   || j |< d S r   r   )r   r   r   r    r    r!   __setitem__+  s   zDictOutput.__setitem__N)r"   r#   r$   r   r   r   r   r   r    r    r    r!   r     s    r   c                   @   s<   e Zd ZU eed< ejed< ejed< ejed< dd ZdS )VLChatProcessorOutput
sft_format	input_idsr[   num_image_tokensc                 C   s
   t | jS r   )lenr   r   r    r    r!   __len__6  r   zVLChatProcessorOutput.__len__N)	r"   r#   r$   r0   r6   torchTensor	IntTensorr   r    r    r    r!   r   /  s   
 


r   c                   @   sL   e Zd ZU ee ed< ejed< ejed< ejed< ejed< ejed< dS )BatchedVLChatProcessorOutputr   r   r[   attention_maskimages_seq_maskimages_emb_maskN)	r"   r#   r$   r   r0   r6   r   r   
BoolTensorr    r    r    r!   r   :  s   
 



r   c                       sH  e Zd ZdZdZddgZ							
			d9dedededededede	de
dede
de	f fddZedd Zede	fddZedd Zed d! Zed"d# Zed$d% Zed&d' Zd(ee	 d)ejfd*d+Z	,	,d:d-ed.ee fd/d0Zd,d,d,dd1d-ed2eeeef  d.ee d3e
fd4d5Zd6ee defd7d8Z  ZS );VLChatProcessorAutoImageProcessor)LlamaTokenizerr   image_processor	tokenizer<image_placeholder><begin_of_image><end_of_image>   <｜▁pad▁｜>@  FdeepseekT	image_tagimage_start_tagimage_end_tagpad_tagr   add_special_tokenr   mask_prompt	ignore_idc                    s   || _ || _| jj|}|d u r|g}d|i}| j| || _|| _|| _|| _|| _	|| _
|	| _|| _t j||fi | d S )Nadditional_special_tokens)r   r   vocabr/   add_special_tokensr   r   r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r2   image_idspecial_tokensspecial_tokens_dictr   r    r!   r   L  s*   
zVLChatProcessor.__init__c                 C      | j S r   )r   r   r    r    r!   image_tokenu     zVLChatProcessor.image_tokenrm   c                 C      | j j| j}|S r   )r   r   r/   r   )r   r   r    r    r!   r   y     zVLChatProcessor.image_idc                 C   r   r   )r   r   r/   r   )r   image_start_idr    r    r!   r   ~  r   zVLChatProcessor.image_start_idc                 C   r   r   )r   r   r/   r   )r   image_end_idr    r    r!   r     r   zVLChatProcessor.image_end_idc                 C   r   r   )r   r   r    r    r!   image_start_token  r   z!VLChatProcessor.image_start_tokenc                 C   r   r   )r   r   r    r    r!   image_end_token  r   zVLChatProcessor.image_end_tokenc                 C   r   r   )r   r   r/   r   )r   pad_idr    r    r!   r     r   zVLChatProcessor.pad_idimage_indicesr   c                 C   s   g }d}|D ]E}| j r|d }n|}||||  || jtjdtjd  || jtj| jftjd  || jtjdtjd  |d }q|||d  tj	|dd}t
| jgt| }||fS )a  

        Args:
            image_indices (List[int]): [index_0, index_1, ..., index_j]
            input_ids (torch.LongTensor): [N]

        Returns:
            input_ids (torch.LongTensor): [N + image tokens]
            num_image_tokens (torch.IntTensor): [n_images]
        r   rn   )dtypeN)dim)r   appendr   r   oneslongr   r   r   catr   r   )r   r   r   input_slicesstartindexendr   r    r    r!   add_image_token  s"   

zVLChatProcessor.add_image_tokenNpromptr   c                 K   sj   |}| j |}t|}|| jktj}| }| j||d\}}| j	|dd}	t
|||	j|d}
|
S )aB  

        Args:
            prompt (str): the formatted prompt;
            images (List[ImageType]): the list of images;
            **kwargs:

        Returns:
            outputs (BaseProcessorOutput): the output of the processor,
                - input_ids (torch.LongTensor): [N + image tokens]
                - target_ids (torch.LongTensor): [N + image tokens]
                - images (torch.FloatTensor): [n_images, 3, H, W]
                - image_id (int): the id of the image token
                - num_image_tokens (List[int]): the number of image tokens
        )r   r   r   )r   )r   r   r[   r   )r   encoder   
LongTensorr   torQ   nonzeror   r   r   r[   )r   r   r   r2   r   r   image_token_maskr   r   images_outputspreparer    r    r!   process_one  s"   

zVLChatProcessor.process_one)r   conversationsr   force_batchifyr   r   c                K   s$   | j |||d}|r| |g}|S )a  

        Args:
            prompt (str): the formatted prompt;
            conversations (List[Dict]): conversations with a list of messages;
            images (List[ImageType]): the list of images;
            force_batchify (bool): force batchify the inputs;
            **kwargs:

        Returns:
            outputs (BaseProcessorOutput): the output of the processor,
                - input_ids (torch.LongTensor): [N + image tokens]
                - images (torch.FloatTensor): [n_images, 3, H, W]
                - image_id (int): the id of the image token
                - num_image_tokens (List[int]): the number of image tokens
        )r   r   r   )r   batchify)r   r   r   r   r   r2   r   r    r    r!   __call__  s   zVLChatProcessor.__call__prepare_listc                 C   s  t |}g }g }g }|D ]}|t |j |t | qt|}tdt|}t||f| j }	t||f }
t||g| j	j
R  }t||f }t||| jf }t|D ]W\}}|j}t |}t |j}d|
|| df< t||	|| df< || jk||| df< |dkr|j||d|f< t|jD ]\}}d|||d|f< q||j qct|	|
||||d}|S )a#  
        Preprocesses the inputs for multimodal inference.

        Args:
            prepare_list (List[VLChatProcessorOutput]): A list of VLChatProcessorOutput.

        Returns:
            BatchedVLChatProcessorOutput: A dictionary of the inputs to use for multimodal inference.
        rn   Nr   T)r   r   r[   r   r   r   )r   r   r   rz   r   fullr   r   zerosr   r   rP   rQ   	enumerater   r   r   r[   r   r   )r   r   
batch_sizer   n_imagesseq_lensr   input_token_max_lenmax_n_imagesbatched_input_idsbatched_attention_maskbatched_pixel_valuesbatched_images_seq_maskbatched_images_emb_maskir   seq_lenn_imagejn_image_tokensbatched_preparesr    r    r!   r     s^   


	zVLChatProcessor.batchify)	r   r   r   r   r   Fr   Tr   )NN) r"   r#   r$   image_processor_classtokenizer_class
attributesrZ   r   r0   rO   rQ   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r%   r    r    r   r!   r   F  s    	
)






/
2
#r   c                       s   e Zd ZU dZeed< eed< eeeeef e	e f ed< eeeeef e	e f ed< eed< e
ed< 				
		ddededeeeeef e	e f deeeeef e	e f dede
f fddZ  ZS )VLMImageProcessorConfigdeepseek_vlmrH   r_   r`   ra   rb   rc   rF   r\   r]   r^   Tc                    s:   || _ || _|| _|| _|| _|| _t jdi | d S )Nr    )rH   r_   r`   ra   rb   rc   r   r   rk   r   r    r!   r   ]  s   z VLMImageProcessorConfig.__init__r   )r"   r#   r$   r5   rO   r6   r   r   rP   r   rQ   r   r%   r    r    r   r!   r  T  s4   
 	r  )+dataclassesr   typingr   r   r   r   numpyr   r|   r   	PIL.Imager   transformersr   r	   r
   r   r   r   transformers.image_utilsr   sglang.srt.configs.utilsr   r   sglang.srt.multimodal.mm_utilsr   r   r   r'   r8   r:   r<   r>   r@   rR   rZ   objectr   r   r   r   r  r    r    r    r!   <module>   s>    
#
  
%