o
    پi                     @   s^   d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	 e 
eZG dd deZdS )    N)Modality)MiDashengLMModel)BaseMultimodalProcessorMultimodalSpecialTokensc                       s:   e Zd ZdZegZ fddZ	d	ddZdd Z  Z	S )
MiDashengLMMultimodalProcessorz:Multimodal processor for MiDashengLM audio-language model.c                    s   t  j|||g|R i | d| _td| _| jj}|d| _	|d| _
|d| _t| j| j| j
d|| _| jtjtjd d| jvrT| jd d S d S )	Nz#<|audio_bos|><|AUDIO|><|audio_eos|>z.<\|audio_bos\|>(?:<\|AUDIO\|>)+<\|audio_eos\|>z<|audio_bos|>z	<|AUDIO|>z<|audio_eos|>)audio_tokenaudio_token_regexaudio_token_id)input_valuesaudio_lengthr
   )super__init__AUDIO_TOKENrecompileAUDIO_TOKEN_REGEX
_processor	tokenizerconvert_tokens_to_idsaudio_start_idr	   audio_end_idr   build	mm_tokensATTR_NAME_TO_MODALITYupdater   AUDIOFEATURE_NAMESappend)self	hf_configserver_argsr   argskwargsr   	__class__ `/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/multimodal/processors/midashenglm.pyr      s0   
z'MiDashengLMMultimodalProcessor.__init__Nc           	      K   s   |r||d< |r||d< |r||d< i |d< |d  dd | j}|jd|gddd	|}t| jd
dsFdD ]}||v rE||  ||< q7|S )zGOverride to use correct audio parameter name for MiDashengLM processor.imagesvideosaudioaudio_kwargs
truncationFTpt)textpaddingreturn_tensorskeep_mm_feature_on_device)r
   Nr%   )
setdefaultr   __call__getattrr    cpu)	r   
input_textr'   r(   audiosr"   	processorresultfeature_namer%   r%   r&   process_mm_data2   s,   z.MiDashengLMMultimodalProcessor.process_mm_datac              
      s  t d t d t d|du  t d|  t d |r6| j|s6| j | }t d | j||| jd}|du rJt d dS | || j\}}}t d	t|  t d
t	|
   t d|j  t d| j d| j d| j  t d|| jk    t|D ]S\}}	t d| d|	j  t d| dt|	dd  t d| dt|	dd  t d| dt|	d  t|	dr|	jdurt d| d|	jj  qd|v r"t|dkr"|d }
t|
tjr|
 dkr|
 n|
d  }
|
|d _t d|
 d n.d|v rPt|dkrP|d }|jd kr=|jd! n|jd }
|
|d _t d|
 d" || | j| j| jd#}t d$t|d%  d& |S )'a  Process audio data for MiDashengLM model.

        Args:
            audio_data: Audio input data
            input_text: Text prompt
            **kwargs: Additional arguments

        Returns:
            Dictionary containing processed multimodal data
        zP================================================================================zprocess_mm_data_async calledzaudio_data is not None: Nzinput_text: zAuto-prepended audio token)prompt
audio_datamultimodal_tokenszbase_output is Nonezmm_items count: z
ret keys: zinput_ids shape: zaudio_token_id=z, audio_start_id=z, audio_end_id=z&Count of audio_token_id in input_ids: zmm_item[z] modality: z] pad_value: 	pad_valuezNOT SETz] offsets: offsetsz] has feature: featurez] feature shape: r   r      zSet audio_length=z" (from processor, mel frame count)r
      z (fallback, waveform length))mm_items	input_idsr   r	   r   z
Returning rD   z	 mm_items)loggerinfor   searchr   load_mm_datar   process_and_combine_mm_datalenlistkeysshaper	   r   r   sumitem	enumeratemodalityr3   hasattrr@   
isinstancetorchTensornumelr   ndimtolist)r   r<   r5   r"   base_outputrD   rE   retirP   r   r
   r8   r%   r%   r&   process_mm_data_asyncN   s   










z4MiDashengLMMultimodalProcessor.process_mm_data_async)NNN)
__name__
__module____qualname____doc__r   modelsr   r:   r]   __classcell__r%   r%   r#   r&   r      s    
r   )loggingr   rU   "sglang.srt.managers.schedule_batchr   sglang.srt.models.midashenglmr   /sglang.srt.multimodal.processors.base_processorr   r   	getLoggerr^   rF   r   r%   r%   r%   r&   <module>   s    
