o
    پiI                     @   s   d dl mZmZ d dlmZ d dlmZ d dlmZ d dl	m
Z d dl	mZ zd dlmZ W n ey9   dZY nw G d	d
 d
eZdS )    )ListUnion)MRotaryEmbedding)Glm4vForConditionalGeneration) Glm4vMoeForConditionalGeneration)BaseMultimodalProcessor)MultimodalSpecialTokens)GlmOcrForConditionalGenerationNc                       sJ   e Zd Zdd eeefD Z fddZdee	e
ef  fddZ  ZS )Glm4vImageProcessorc                 C   s   g | ]}|d ur|qS )N ).0mr   r   Z/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/multimodal/processors/glm4v.py
<listcomp>   s
    zGlm4vImageProcessor.<listcomp>c                    s   t  j|||g|R i | d| _d| _d| _d| _d| _d| _|j| _	|j
| _|j| _|j| _|j| _|j| _d| _d| _d	| _t| j| j	| j| j	d
|| _d S )Nz	<|image|>z	<|video|>z<|begin_of_image|>z<|end_of_image|>z<|begin_of_video|>z<|end_of_video|>   i 1  i )image_tokenimage_token_idvideo_tokenvideo_token_id)super__init__IMAGE_TOKENVIDEO_TOKENIMAGE_START_TOKENIMAGE_END_TOKENVIDEO_START_TOKENVIDEO_END_TOKENr   IM_TOKEN_IDr   VIDEO_TOKEN_IDimage_start_token_idIMAGE_START_TOKEN_IDimage_end_token_idIMAGE_END_TOKEN_IDvideo_start_token_idVIDEO_START_TOKEN_IDvideo_end_token_idVIDEO_END_TOKEN_IDIMAGE_FACTOR
MIN_PIXELS
MAX_PIXELSr   build	mm_tokens)self	hf_configserver_args
_processorargskwargs	__class__r   r   r      s0   zGlm4vImageProcessor.__init__
image_datac              	      s   | j |||j| jd}|jr|j|_| || j\}}}	| }tj|d| j	t
|	dd t
|	dd t
|	dd d\}
}|
d}
| || jj| jj|
|d}|S )	N)promptr4   
video_datamultimodal_tokensr   image_grid_thwvideo_grid_thwattention_mask)	input_idsr-   r8   r9   r:      )r;   mm_itemsim_token_idr   mrope_positionsmrope_position_delta)load_mm_datar6   r+   videosprocess_and_combine_mm_dataflattenr   get_rope_index_glm4v	unsqueezer-   getattrsqueezetolistr   r   )r,   r4   
input_textrequest_objr0   r1   base_outputr=   r;   retr?   r@   	mm_inputsr   r   r   process_mm_data_async>   s:   





	z)Glm4vImageProcessor.process_mm_data_async)__name__
__module____qualname__r   r   r	   modelsr   r   r   strbytesrO   __classcell__r   r   r2   r   r
      s    
 r
   )typingr   r   "sglang.srt.layers.rotary_embeddingr   sglang.srt.models.glm4vr   sglang.srt.models.glm4v_moer   /sglang.srt.multimodal.processors.base_processorr   SGLangBaseProcessorr   sglang.srt.models.glm_ocrr	   ImportErrorr
   r   r   r   r   <module>   s    