o
    پiA                     @   sH  d dl Z d dlZd dlZd dlZd dlmZmZ d dlZd dl	Z	d dl
Z
d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZ d dlmZmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z' d dl%m(Z( d dl)m*Z* dZ+dZ,ej-. Z/dZ0e1eej2. dZ3ej24 re3du re*5dej2.  d e6e7ej8.ddZ9dZ:dZ;dZ<dZ=dZ>dZ?e+e,e/fde6d e6d!e6d"e6d#e6d$e@e6e6f fd%d&ZAd'e6d!e6d$e6fd(d)ZBd'e6d!e6d$e6fd*d+ZCd'e6d!e6d$e6fd,d-ZDd.eEd/e6d0e6e7B d$e6fd1d2ZFe+i fd3e6d4eEd$e	jGfd5d6ZHG d7d8 d8e'ZIdS )9    N)ListUnion)VideoReader)Image)InterpolationMode)envs)MRotaryEmbedding)ModalityMultimodalDataItem)"Qwen2_5_VLForConditionalGeneration)Qwen2VLForConditionalGeneration)Qwen3_5ForConditionalGeneration"Qwen3_5MoeForConditionalGeneration)$Qwen3OmniMoeForConditionalGeneration)Qwen3VLForConditionalGeneration)"Qwen3VLMoeForConditionalGeneration)BaseMultimodalProcessor)MultimodalSpecialTokens)logger   i@     z Invalid RESIZE_RESAMPLE value: 'z'. Ignoring and using default.VIDEO_MAX_PIXELSg    Ai  i 0	    g       @   i   heightwidthfactor
min_pixels
max_pixelsreturnc                 C   s   t | |t| | tkrtdt dt | |t| |  t |t| |}t |t||}|| |krOt| | | }t| | |}t|| |}||fS || |k rlt|| |  }t| | |}t|| |}||fS )a-  
    Rescales the image so that the following conditions are met:

    1. Both dimensions (height and width) are divisible by 'factor'.

    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].

    3. The aspect ratio of the image is maintained as closely as possible.
    z+absolute aspect ratio must be smaller than z, got )	maxmin	MAX_RATIO
ValueErrorround_by_factormathsqrtfloor_by_factorceil_by_factor)r   r   r   r   r   h_barw_barbeta r,   \/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/multimodal/processors/qwen_vl.pysmart_resize8   s    r.   numberc                 C   s   t | | | S )zFReturns the closest integer to 'number' that is divisible by 'factor'.)roundr/   r   r,   r,   r-   r$   Y   s   r$   c                 C      t | | | S )z]Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'.)r%   ceilr1   r,   r,   r-   r(   ^      r(   c                 C   r2   )zYReturns the largest integer less than or equal to 'number' that is divisible by 'factor'.)r%   floorr1   r,   r,   r-   r'   c   r4   r'   eletotal_frames	video_fpsc                 C   s   d| v rd| v rJ dd| v rt | d t}nA| dt}t| dtt}t| dtt|t}|| | }||krIt	
d| d| d ttt||||}t|t}t|kra||ksotd	t d
| d| d|S )aa  calculate the number of frames for video used for model inputs.

    Args:
        ele (dict): a dict contains the configuration of video.
            support either `fps` or `nframes`:
                - nframes: the number of frames to extract for model inputs.
                - fps: the fps to extract frames for model inputs.
                    - min_frames: the minimum number of frames of the video, only used when fps is provided.
                    - max_frames: the maximum number of frames of the video, only used when fps is provided.
        total_frames (int): the original total number of frames of the video.
        video_fps (int | float): the original fps of the video.

    Raises:
        ValueError: nframes should in interval [FRAME_FACTOR, total_frames].

    Returns:
        int: the number of frames for video used for model inputs.
    fpsnframesz%Only accept either `fps` or `nframes`
min_frames
max_frameszsmart_nframes: nframes[z] > total_frames[]znframes should in interval [z, z], but got .)r$   FRAME_FACTORgetFPSr(   FPS_MIN_FRAMESr'   r!   FPS_MAX_FRAMESr   warningr    r#   )r6   r7   r8   r:   r9   r;   r<   r,   r,   r-   smart_nframesh   s,   
rE   image_factorvideo_configc                    s  t | ts| S t }t| |  }}t|||d}tjd|d |tj	d}t
|}| | }t| }	|	dddd}	|	j\}}
}}|dt}|dt}tt|d	t|| t t|d
 }t }|d	|}||krtd| d| d t||}d|v rd|v rt|d |d |d\}}nt|||||d\}}t }tjjj |	||gt!j"d}	|	 }	||| ||dd}t }t#d|| d dd|| d dd|| d dd|| d dd	 |	|fS )N)r7   r8   r      )numdtype   r   r   total_pixelsr   g?zThe given max_pixels[z] exceeds limit[z].resized_heightresized_width)r   )r   r   r   )interpolationtorchvision)r9   durationtotal_num_framesframes_indicesvideo_backendz)[preprocess_video Perf], get_batch_time:   .2fz ms, smart_resize_time: z ms, torchvision_resize_time:  ms, total_time:  ms)$
isinstancer   timeperf_counterlenget_avg_fpsrE   nplinspaceint64unique	get_batchasnumpytorch
from_numpy
pin_memorypermuteshaper@   VIDEO_MIN_PIXELSVIDEO_TOTAL_PIXELSr    r!   r   r?   intr   rD   r.   rP   
transforms
functionalresizer   BILINEARdebug)vrrF   rG   
entry_timer7   r8   r:   idxvideo_npvideo_r   r   r   rL   r   get_batch_timemax_pixels_supposedrM   rN   smart_resize_timevideo_metadatatorchvision_resize_timer,   r,   r-   preprocess_video   s   






r|   c                       sP   e Zd Zeeeeeee	gZ
 fddZdd Zdeeeef  fddZ  ZS )QwenVLImageProcessorc                    s   |j | _ |j dkr|j}t j|||g|R i | |j| _|j| _|j| _	|j| _t
|dd | _t
|dd | _t
|dd | _|jdi | _|jdi | _td|jtd|j| jd	|| _d S )
Nqwen3_omni_moevision_end_token_idaudio_start_token_idaudio_token_idimageru   z+<|vision_start|><|image_pad|><|vision_end|>z6<\|vision_start\|>(?:<\|image_pad\|>)+<\|vision_end\|>)image_tokenimage_token_idimage_token_regexvideo_token_idr   )
model_typethinker_configsuper__init__vision_start_token_idIM_START_TOKEN_IDr   IM_END_TOKEN_IDr   IM_TOKEN_IDgetattrr   r   mm_process_configr@   image_configrG   r   recompiler   build	mm_tokens)self	hf_configserver_args
_processorargskwargs	__class__r,   r-   r      s0   
	zQwenVLImageProcessor.__init__c           	      C   s   |  ||\}}tj| jjj| jj| jj| j	| j
tj|tjdd|t| jjdd d\}}|d}ttj||dg}||| j| j| jj| jj| jj||d	S )N)rJ   r   tokens_per_second)spatial_merge_sizer   r   r   r   	input_idsimage_grid_thwr   rH   )modalityoffsetsprecomputed_embeddings	r   mm_itemsim_start_id	im_end_idim_token_idr   r   mrope_positionsmrope_position_delta)build_input_idsr   get_rope_indexr   vision_configr   r   r   r   r   r   rd   tensorlong	unsqueezer   squeezer
   r	   IMAGEr   r   r   )	r   prompt
embeddingsimg_grid_thwr   r   r   r   r   r,   r,   r-   get_mm_data  s<   


	z QwenVLImageProcessor.get_mm_data
image_datac                    s  t  } j|||j|j jd}t  }t|dd}	d }
|jr8 fdd|jD I d H }tt	t
| \|_}
t  } jjdv rP j| j|
dd\}}}n
 | j\}}}d } jd	krutd
d |D d }|rutj|jdd}t|dd }|d u rt|dd }t  }| }d }t|dr|j}|d u r|rt|d tr|d d}d }t|dr|j}|d u r|jr|jd }t|tr|d}tj jjj jj jj j jt jjdd | dt|dd t|dd |d|t jdd  j!t jdd d\}}|"d}t  }t#$d|	d|| d dd|| d dd|| d dd|| d dd|| d dd |% | j j& jj jj jj'||d 	S )!N)r   r   
video_data
audio_datamultimodal_tokensridanonymous_ridc                    s"   g | ]}t | jd I dH qS ))rG   N)r|   rG   ).0ru   r   r,   r-   
<listcomp>L  s
    z>QwenVLImageProcessor.process_mm_data_async.<locals>.<listcomp>)qwen3_vlqwen3_vl_moeqwen3_5qwen3_5_moeF)rz   do_sample_framesr~   c                 s   s    | ]	}|  r|V  qd S )N)is_audio)r   mmr,   r,   r-   	<genexpr>i  s    z=QwenVLImageProcessor.process_mm_data_async.<locals>.<genexpr>rH   )dimsecond_per_grid_tsvideo_second_per_gridr   r   video_grid_thwr   r   position_id_per_seconds)r   r   r   r   r   r   r   r   r   r   use_audio_in_videoaudio_seqlensr   r   r   z[QwenVLProcessor Perf] rid=z, load_time: rU   rV   z ms, preprocess_time: z ms, process_time: z ms, get_rope_index_time: rW   rX   r   )(rZ   r[   load_mm_datar   r   r   r   videosmaplistzipr   r   process_and_combine_mm_datanextrd   sumfeature_attention_maskflattenhasattrr   rY   dictr@   r   r   r   r   r   r   r   r   r   r   r   r   rp   tolistr   r   )r   r   
input_textrequest_objr   r   rr   base_output	load_timer   rz   videos_processedpreprocess_timer   r   retaudio_feature_lengths
audio_itemr   process_timer   r   first_videor   r   get_rope_index_timer,   r   r-   process_mm_data_async7  s   













z*QwenVLImageProcessor.process_mm_data_async)__name__
__module____qualname__r   r   r   r   r   r   r   modelsr   r   r   r   strbytesr   __classcell__r,   r,   r   r-   r}      s    
$r}   )Jr%   osr   rZ   typingr   r   numpyr^   rd   rP   decordr   PILr   torchvision.transformsr   sglang.srt.environr   "sglang.srt.layers.rotary_embeddingr   "sglang.srt.managers.schedule_batchr	   r
   sglang.srt.models.qwen2_5_vlr   sglang.srt.models.qwen2_vlr   sglang.srt.models.qwen3_5r   r    sglang.srt.models.qwen3_omni_moer   sglang.srt.models.qwen3_vlr   sglang.srt.models.qwen3_vl_moer   /sglang.srt.multimodal.processors.base_processorr   SGLangBaseProcessorr   sglang.utilsr   IMAGE_FACTOR
MIN_PIXELSSGLANG_IMAGE_MAX_PIXELSr@   
MAX_PIXELSr"   r   SGLANG_RESIZE_RESAMPLERESIZE_RESAMPLEis_setrD   rk   floatenvironrj   ri   r   r?   rA   rB   rC   tupler.   r$   r(   r'   r   rE   Tensorr|   r}   r,   r,   r,   r-   <module>   s    


!
3
P