o
    پi                     @   sd   d dl mZmZ d dlZd dlmZmZ d dlmZ d dl	m
Z
 d dlmZmZ G dd deZdS )	    )ListUnionN)ModalityMultimodalDataItem)MiniCPMO)MiniCPMV)BaseMultimodalProcessorMultimodalSpecialTokensc                       sR   e Zd ZeegZdZ fddZdee	e
ef  dee	e
ef  fddZ  ZS )MiniCPMMultimodalProcessorTc                    s   t  j|||g|R i | | jj}t|dd | _t|dd | _t|dd | _t|dd | _t|dd | _	t|dd | _
t|dd | _tdd	d
| jd|| _d S )Nslice_start_idslice_end_idaudio_start_idaudio_end_idim_start_id	im_end_idunk_idz(<image>./</image>)z(<audio>./</audio>)z(<video>./</video>))image_tokenaudio_tokenvideo_tokenimage_token_id)super__init__
_processor	tokenizergetattrr   r   r   r   r   r   im_token_idr	   build	mm_tokens)self	hf_configserver_argsr   argskwargsr   	__class__ \/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/multimodal/processors/minicpm.pyr      s"   z#MiniCPMMultimodalProcessor.__init__
image_data
audio_datac              
      sN  | j |||| jd}|d u rd S | j|j|j|jd}|d }|d }	t|tjt	fs5t
dt| t|	tjt	fsFt
dt|	 t|t|	kr\t
dt| dt|	 g }
g }t||	D ].\}}t|t|krt
d	t| d
t| t||D ]\}}|
|g7 }
||g7 }qqe|
}g }|d  }| j|| j| jd}| j|| j| jd}|| t|}t|dkrt||d|itjd}||g7 }d|v r|d d urt|d dkr| jd ur| jd ur| j|| j| jd}nd }t|d gd|d i|tjd}||g7 }|| | j| j| j| j| j| j| jd	S )N)promptr(   r'   multimodal_tokens)
input_textimagesaudiospixel_values	tgt_sizesz*Incorrect type of pixel values. Got type: z*Incorrect type of target sizes. Got type: z#Inconsistent batch lengths, found: z vs. zInconsistent N lengths, found: z vs 	input_ids)r0   mm_start_id	mm_end_idr   tgt_size)featureoffsetsmodel_specific_datamodalityaudio_featuresaudio_feature_lens)r4   r6   r5   r7   )	mm_itemsr0   r   r   r   r   r   r   r   )load_mm_datar   process_mm_datar+   r,   r-   
isinstancetorchTensorlist
ValueErrortypelenzipflattenget_mm_items_offset_by_pairr   r   r   r   extendsortedr   r   IMAGEr   r   AUDIOtolistr   )r   r'   r(   r+   request_objr"   base_outputresr.   r/   pixel_values_flattgt_sizes_flatpixel_btgt_bpixel_ntgt_nitemsr0   image_offsetsslice_offsetsitemaudio_offsetsr%   r%   r&   process_mm_data_async%   s   






z0MiniCPMMultimodalProcessor.process_mm_data_async)__name__
__module____qualname__r   r   modelssupport_dynamic_frame_expansionr   r   r   strbytesrZ   __classcell__r%   r%   r#   r&   r
      s    r
   )typingr   r   r>   "sglang.srt.managers.schedule_batchr   r   sglang.srt.models.minicpmor   sglang.srt.models.minicpmvr   /sglang.srt.multimodal.processors.base_processorr   r	   r
   r%   r%   r%   r&   <module>   s    