o
    پi:                     @   s4  d dl Z d dlZd dlmZmZ d dlZd dlZd dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZmZ e ZedZdZdZ dZ!dZ"e#e
ej$% dZ&ej$' re&du re(dej$%  d e)e*ej+%ddZ,dZ-dZ.dZ/dZ0dZ1dZ2ee e!fde)de)de)de)de)f
d d!Z3e e!efde)de)d"e)d#e
j
fd$d%Z4d&e)e*B de)d#e)fd'd(Z5d&e)e*B de)d#e)fd)d*Z6d&e)e*B de)d#e)fd+d,Z7e e!efde)de)d"e)fd-d.Z8d/e9d0e)d1e)e*B d#e)fd2d3Z:efd4e)d#ej;fd5d6Z<G d7d8 d8eZ=dS )9    N)ListUnion)Image)InterpolationMode)BaseImageProcessorFast)envs)MRotaryEmbedding)&Ernie4_5_VLMoeForConditionalGeneration)BaseMultimodalProcessor)MultimodalSpecialTokens)get_bool_env_varis_npuloggerSGLANG_USE_CUDA_IPC_TRANSPORT   @        z Invalid RESIZE_RESAMPLE value: 'z'. Ignoring and using default.VIDEO_MAX_PIXELSg    Ai iN    g       @      heightwidthfactor
min_pixels
max_pixelsc           
      C   s0  t | |t| | tkr3| |kr t |t||}t|t |}nt |t| |}t|t |}|} |}t |t| |}t |t||}|| |krat| | | }	t| |	 |}t||	 |}n|| |k r~t|| |  }	t| |	 |}t||	 |}||| ks|| |krtd| d| ||fS )Nzencounter invalid h_bar: z	, w_bar: )	maxmin	MAX_RATIOround_by_factorfloor_by_factormathsqrtceil_by_factor
ValueError)
r   r   r   r   r   	new_width
new_heighth_barw_barbeta r+   _/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/multimodal/processors/ernie45_vl.pysmart_resize3   s*   r-   size_factorreturnc                 C   s>   | j \}}|}|}t|||||d\}}| j||ftd} | S )Nr   r   r   )resample)sizer-   resizeRESIZE_RESAMPLE)imager   r   r.   r   r   resized_heightresized_widthr+   r+   r,   resize_imageV   s   

r8   numberc                 C   s   t | | | S N)roundr9   r   r+   r+   r,   r    j   s   r    c                 C      t | | | S r:   )r"   ceilr<   r+   r+   r,   r$   n      r$   c                 C   r=   r:   )r"   floorr<   r+   r+   r,   r!   r   r?   r!   c                    s   t | |||S r:   )r8   )r5   r   r   r.   r+   r+   r,   resize_image_asyncv   s   rA   eletotal_frames	video_fpsc                 C   s   d| v rd| v rJ dd| v rt | d t}nA| dt}t| dtt}t| dtt|t}|| | }||krIt	
d| d| d ttt||||}t|t}t|kra||ksotd	t d
| d| d|S )aa  calculate the number of frames for video used for model inputs.

    Args:
        ele (dict): a dict contains the configuration of video.
            support either `fps` or `nframes`:
                - nframes: the number of frames to extract for model inputs.
                - fps: the fps to extract frames for model inputs.
                    - min_frames: the minimum number of frames of the video, only used when fps is provided.
                    - max_frames: the maximum number of frames of the video, only used when fps is provided.
        total_frames (int): the original total number of frames of the video.
        video_fps (int | float): the original fps of the video.

    Raises:
        ValueError: nframes should in interval [FRAME_FACTOR, total_frames].

    Returns:
        int: the number of frames for video used for model inputs.
    fpsnframesz%Only accept either `fps` or `nframes`
min_frames
max_frameszsmart_nframes: nframes[z] > total_frames[]znframes should in interval [z, z], but got .)r    FRAME_FACTORgetFPSr$   FPS_MIN_FRAMESr!   r   FPS_MAX_FRAMESr   warningr   r%   )rB   rC   rD   rF   rE   rG   rH   r+   r+   r,   smart_nframes   s,   
rQ   image_factorc                    s
  t | |  }}ti ||d}tjd|d |tjd}t|}| | }t	
| }|dddd}|j\}}}	}
t}t}ttt|| t t|d }t|	|
|||d\}}tjjj|||gtjd	}|dddd}| }||| ||d
d}||fS )N)rC   rD   r      )numdtype   r   g?r0   )interpolationtorchvision)rE   durationtotal_num_framesframes_indicesvideo_backend)lenget_avg_fpsrQ   nplinspaceint64unique	get_batchasnumpytorch
from_numpy
pin_memorypermuteshapeVIDEO_MIN_PIXELSVIDEO_TOTAL_PIXELSr   r   r   rK   intr-   rX   
transforms
functionalr3   r   BILINEAR)vrrR   rC   rD   rF   idxvideo_npvideo_r   r   r   total_pixelsr   r6   r7   video_metadatar+   r+   r,   preprocess_video   sH   


rw   c                       sh   e Zd ZegZ fddZdejdedejfddZ		dde
fd	d
Zdeeeef  fddZ  ZS )Ernie4_5_VLImageProcessorc                    s   t  j|||g|R i | || _|j| _|j| _|j| _|j| _|j| _d| _d| _	d| _
d| _tdd|j|jd|| _| jj| _| jj| _d S )Nr   r   r   r   z1<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>z1<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>)image_tokenvideo_tokenimage_token_idvideo_token_id)super__init__	hf_config
model_typeimage_start_token_idimage_end_token_idvideo_start_token_idvideo_end_token_idIMAGE_FACTOR
MIN_PIXELS
MAX_PIXELSr   r   im_patch_idbuild	mm_tokens
_processor	tokenizerimage_processor)selfr   server_argsr   argskwargs	__class__r+   r,   r~      s*   
z"Ernie4_5_VLImageProcessor.__init__pixel_values	mm_kwargsr/   c           
      C   s   | j }|j}| j}tj|jtjdg d}tj|jtjdg d}tj|j	tjd}|j
d }	|ddg|	d}|ddg|	d}| sQ| }| sY| }||tj | | }||j}|S )N)rU   )rS   rV   rS   rS   r   )r   vision_configr   re   tensor
image_meanfloat32reshape	image_stdrescale_factor
patch_sizesqueezerepeat_interleaveis_contiguous
contiguoustorU   )
r   r   r   r   r   r   image_mean_tensorimage_std_tensorr   patch_size_squaredr+   r+   r,   _pixel_values_norm   s@   


z,Ernie4_5_VLImageProcessor._pixel_values_normNc                 K   s  |r||d< |r||d< | j }t|dr$t|jtr$| jjs$ts$d|d< |jd|gddd|}|d	ur|d }|d	urE| 	|||d< t
| D ]{}	||	 d	u rW||	= qK|	d
kr|d
 }
|d }|
d	d	df dk}|
| |d< |
|  |d< |d jdd }|d	| |d< ||d	 |d< |d= |d
= |d  dkr|d= |d  dkr|d= |d  dkr|d= |d  dkr|d= qK| jjs| jD ]}trq||v rt|| tjr|| d||< q|S )zI
        process multimodal data with transformers AutoProcessor
        imagesvideosr   cudadeviceTpt)textpaddingreturn_tensorsNgrid_thwr   rS   video_grid_thwimage_grid_thw)dimr   pixel_values_videoscpur+   )r   hasattr
isinstancer   r   r   disable_fast_image_processor_is_npu__call__r   listkeysprodsumnumelkeep_mm_feature_on_deviceFEATURE_NAMESSGL_USE_CUDA_IPCre   Tensorr   )r   
input_textr   r   audiosr   	processorresultr   keyr   pixel_values_allmaskimage_patch_numfeature_namer+   r+   r,   process_mm_data  st   



z)Ernie4_5_VLImageProcessor.process_mm_data
image_datac              	      s&  | j |||j|j| jd}g }|jr.t|jd tjr.|jD ]}t|}	||	 q||_|j	rFdd |j	D I d H }
t
tt|
 \|_	}| || j\}}}| }tj|d| jt|dd t|dd d\}}|d}|jd |jd	 ks~J d
| || j| j| jj| jj||d}|S )N)promptr   
video_data
audio_datamultimodal_tokensr   c                    s   g | ]	}t |I d H qS r:   )rw   ).0rs   r+   r+   r,   
<listcomp>  s    zCErnie4_5_VLImageProcessor.process_mm_data_async.<locals>.<listcomp>r   r   )	input_idsr   r   r   rS   r   z9input_ids and mrope_positions should have the same length)r   mm_itemsim_start_id	im_end_idim_token_idr|   mrope_positionsmrope_position_delta)load_mm_datar   r   r   r   r   r   r8   appendr   mapr   zipprocess_and_combine_mm_dataflattenr   get_rope_index_ernie45	unsqueezer   getattrr   ri   tolistr   r   r{   r|   )r   r   r   request_objr   r   base_outputresized_imagesr5   resized_imagevideos_processedrt   r   r   retr   r   	mm_inputsr+   r+   r,   process_mm_data_asynch  sV   	





z/Ernie4_5_VLImageProcessor.process_mm_data_async)NNN)__name__
__module____qualname__r	   modelsr~   re   r   objectr   dictr   r   r   strbytesr   __classcell__r+   r+   r   r,   rx      s"    
&
Jrx   )>r"   ostypingr   r   numpyr_   re   rX   PILr   torchvision.transformsr   transformersr   sglang.srt.environr   "sglang.srt.layers.rotary_embeddingr   sglang.srt.models.ernie45_vlr	   /sglang.srt.multimodal.processors.base_processorr
   SGLangBaseProcessorr   sglang.srt.utilsr   r   r   r   r   r   r   r   r   r   SGLANG_RESIZE_RESAMPLErL   r4   is_setrP   rl   floatenvironrk   rj   r   rK   rM   rN   rO   r-   r8   r    r$   r!   rA   r   rQ   r   rw   rx   r+   r+   r+   r,   <module>   s    
%

	
3
/