o
    
۾iPA                     @   s  d dl Z d dlZd dlmZmZ d dlmZ d dlZd dl	Z	d dl
mZ d dlmZmZ d dlmZmZ d dlmZ d dlmZmZmZ d d	lmZ d d
lmZmZ d dlmZ d dlm Z m!Z! d dl"m#Z#m$Z$m%Z%m&Z& d dl'm(Z( d dl)m*Z*m+Z+ d dl,m-Z- d dl.m/Z/ d dl0m1Z1 d dl2m3Z3m4Z4 d dl5m6Z6 d dl7m8Z8 ddl9m:Z: ee;Z<dZ=G dd de%Z>G dd de	j?j@ZAde	jBdeCde	jBfd d!ZDG d"d# d#ZEe(jFe>e&e#d$eG d%d& d&e$e!ZGdS )'    N)AsyncGeneratorMapping)Literal)RawAudio)StreamingModeTranscriptionRequest)AudioAudioConfig)support_torch_compile)ModelConfigSpeechToTextConfig
VllmConfig)VLLM_ENGINE_ITERATION_TIMEOUT_S)
PromptTypeTokensPrompt)init_logger)MultiModalEmbeddingsSupportsRealtime)VoxtralDummyInputsBuilderVoxtralForConditionalGenerationVoxtralMultiModalProcessorVoxtralProcessingInfo)MULTIMODAL_REGISTRY)_IBaseMultiModalProcessorCache)MultiModalKwargsOptionalItems)MultiModalDataItems)BaseDummyInputsBuilder)MultiModalPromptUpdatesPlaceholderFeaturesInfo)IntermediateTensors)cached_tokenizer_from_config   )_flatten_embeddings   c                       sz   e Zd Zdddedee dedB ddf fddZd	ed
ee	 de
dededeee	 eeee f f fddZ  ZS )"VoxtralRealtimeMultiModalProcessorNcacheinfodummy_inputsr'   returnc                   s   t  j||d d d S )Nr&   )super__init__)selfr(   r)   r'   	__class__ _/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/voxtral_realtime.pyr,   6   s   z+VoxtralRealtimeMultiModalProcessor.__init__mm_items
prompt_ids	mm_kwargsmm_prompt_updatesis_update_appliedc                 C   s~   | dg }t|dksJ d|| j }|jjj}|d d jjd }	|	|	}
t
ddd|
dg d d}|d|gifS )Naudior"   z:Expected only one audio input for realtime, got mm_kwargs=r   audio_arrays)modalityitem_idx	start_idxtokensis_embed)getlenr(   get_tokenizerinstructaudio_encoderaudio_configdatashapenum_audio_tokensr   )r-   r2   r3   r4   r5   r6   audios	tokenizerrC   num_audio_sampleslengthfeatures_infor0   r0   r1   _maybe_apply_prompt_updates@   s$   


z>VoxtralRealtimeMultiModalProcessor._maybe_apply_prompt_updates)__name__
__module____qualname__r   r   r   r,   r   listintr   r   booltupler   strr   rL   __classcell__r0   r0   r.   r1   r%   5   s0    
r%   c                       sF   e Zd ZdZddededdf fddZd	ejdejfd
dZ	  Z
S )TimeEmbeddingz&Sinusoidal Embedding for encoding time     @dimthetar*   Nc                    s\   t    || _|| _tt| j t| jd 	  | jd  }| j
d|dd d S )N   inv_freqF)
persistent)r+   r,   rX   rY   torchexpmathlogarangefloatregister_buffer)r-   rX   rY   r[   r.   r0   r1   r,   d   s   
zTimeEmbedding.__init__tc                 C   s>   |d }| j j|j|jd}|| }tj| | fddS )N).N)devicedtyperX   )r[   tore   rf   r]   catcossin)r-   rd   r[   embr0   r0   r1   forwardo   s
   zTimeEmbedding.forward)rW   )rM   rN   rO   __doc__rQ   rb   r,   r]   Tensorrn   rU   r0   r0   r.   r1   rV   a   s    rV   input_tensorscalingr*   c                 C   s,   | | }t j|| jd}|d| dS )N)re   r"   rg   )r]   ra   re   	unsqueezeview)rq   rr   baseoffsetsr0   r0   r1   _expand_tensorx   s   rw   c                   @   s   e Zd ZdeddfddZedd Zedd	 Zedefd
dZ	de
defddZdddZdejddfddZdejdB fddZdS )VoxtralRealtimeBufferconfigr*   Nc                 C   s   || _ |j| _|j| _| j j| _| | j| _| | j| _	| d| j j
 | _| | j j}d| _|| j | _t| j | _tj| jtjd| _d| _d S )N  r   rf   )_configstreaming_look_ahead_ms_look_ahead_in_msstreaming_look_back_ms_look_back_in_mssampling_rate_sampling_rate_get_len_in_samples_look_ahead
_look_back
frame_rate_streaming_sizetranscription_delay_ms_start_end_PRE_ALLOCATE_BUFFER_SIZE_IN_S_buffer_sizenpemptyfloat32_buffer_filled_buffer_len)r-   ry   streaming_delayr0   r0   r1   r,      s   

zVoxtralRealtimeBuffer.__init__c                 C   s   t | j| j dS )Nr   )maxr   r   r-   r0   r0   r1   r;      s   zVoxtralRealtimeBuffer.start_idxc                 C   s   | j | j S N)r   r   r   r0   r0   r1   end_idx      zVoxtralRealtimeBuffer.end_idxc                 C   s   | j | jkS r   )r   r   r   r0   r0   r1   is_audio_complete   r   z'VoxtralRealtimeBuffer.is_audio_complete	len_in_msc                 C   s*   | j | d }| sJ |t|}|S )Nrz   )r   
is_integerrQ   )r-   r   	_len_in_slen_in_sr0   r0   r1   r      s   z)VoxtralRealtimeBuffer._get_len_in_samplesc                 C   sp   t j| jt jd}t| j| j d}|dkr#| j| j| j |d |< | `|| _|| _| j| _	| j	| j
 | _d S )Nr{   r   )r   r   r   r   r   r   r;   r   r   r   r   r   )r-   
new_bufferleft_to_copyr0   r0   r1   _allocate_new_buffer   s   
z*VoxtralRealtimeBuffer._allocate_new_bufferr7   c                 C   sP   | j t| }|| jkr|   || j| j | j t| < |  j t|7  _ d S r   )r   r?   r   r   r   )r-   r7   put_end_idxr0   r0   r1   write_audio   s   
z!VoxtralRealtimeBuffer.write_audioc                 C   s8   | j sd S | j| j| j }| j| _|  j| j7  _|S r   )r   r   r;   r   r   r   r   )r-   r7   r0   r0   r1   
read_audio   s   z VoxtralRealtimeBuffer.read_audio)r*   N)rM   rN   rO   r	   r,   propertyr;   r   rR   r   rb   rQ   r   r   r   ndarrayr   r   r0   r0   r0   r1   rx      s    


rx   )r(   r)   c                       st  e Zd ZdZdZdddedef fddZede	e
jd	f d
ejee  dede	ed	f fddZedd Z		d,d	dddejded	B dejd	B dedejf
ddZ				d-dejd	B dejded	B dejd	B dedejeB fddZdeej ejB eejdf B d	B fdd Zeded!edefd"d#Z ed$e
jded%ed&ed	B d!e!d' d(ed)ed	B defd*d+Z"  Z#S ).VoxtralRealtimeGenerationT )prefixvllm_configr   c                   sN   t  j||d |jj rJ dt| jjjd| _	| j
jjj}|j| _d S )N)r   r   zKVoxtral realtime doesn't support full cudagraphs yet. Please use PIECEWISE.rh   )r+   r,   compilation_configcudagraph_modehas_full_cudagraphsrV   ry   text_confighidden_sizetime_embeddingrH   rA   rB   rC   num_delay_tokensn_delay_tokens)r-   r   r   rC   r.   r0   r1   r,      s   
z"VoxtralRealtimeGeneration.__init__audio_streamNinput_streammodel_configr*   c                 C  s   t |}|jj}|j}t|}d}|2 zg3 d H W }	||	 |  }
d ur{|rT| s/J t|
|j	dd}	t
tjt|	d d}|j|}|j}|jd j}
d}ntj| tdI d H }|dd  }d	|
d fi}t||d
V  |  }
d us'q6 d S )NTwavformat)	streamingr7   languager   F)timeoutrg   r7   prompt_token_idsmulti_modal_data)r!   rA   rB   rC   rx   r   r   r   r   r   r   r   ONLINEr   
from_audiomistralencode_transcriptionr<   rG   audio_arrayasynciowait_forr>   r   r   )clsr   r   r   rH   rB   ry   bufferis_first_yieldr7   	new_audiorequest	audio_enc	token_idsall_outputsr   r0   r0   r1   buffer_realtime_audio   s@   
	z/VoxtralRealtimeGeneration.buffer_realtime_audioc                 C   s   | j jjjS r   )rH   rA   rB   rC   r   r0   r0   r1   rC   #  r   z&VoxtralRealtimeGeneration.audio_config)is_multimodalhandle_oov_mm_token	input_idsmultimodal_embeddingsr   r   c                C   s,   |dusJ t |dksJ dt|}|S )z+Pass post-conv embeddings directly as inputNr   zCFor realtime you must provide a multimodal_embedding at every step.)r?   r#   )r-   r   r   r   r   mm_embeds_flatr0   r0   r1   embed_input_ids'  s   z)VoxtralRealtimeGeneration.embed_input_ids	positionsintermediate_tensorsinputs_embedskwargsc                 K   s   |d usJ |d usJ | j jj}||jd | |jd | }t||}| j||}|j\}	}
|	| j dks;J ||	| j |
| j }| 	|}| j
|}|| }tjd| j|j|jd}| |}| j
j|||||d}|S )Nr   r"   r"   )
fill_valuere   rf   )r   t_cond)ry   rC   block_pool_sizert   rE   rw   whisper_encoderdownsample_factorreshapeaudio_language_adapterlanguage_modelr   r]   fullr   re   rf   r   model)r-   r   r   r   r   r   	pool_sizewhisper_positionsaudio_hidden_states
num_tokensaudio_hidden_sizeaudio_text_embedstext_embedstime_tensorr   hidden_statesr0   r0   r1   rn   :  sD   




z!VoxtralRealtimeGeneration.forward.c                    s   j di |}|dusJ ddtjdtdtdtjfdd fd	d
|D } fdd
|D }dd
 |D }jj|}jjj|jfdd
|D dd}jj	j
 fdd
|D }fdd
|D }|S )zATransform audio waveforms -> initial whisper post-conv embeddingsNz;For realtime you must provide an audio input at every step.samplemult_ofposr*   c                 S   sn   |dv sJ || j | |  }dkr5|dkr| |d  n	| d d |d f } | j | dks5J d| | S )Nr   r"   r   z*Sample is empty after truncation with ctx rE   )r   r   r   ctxr0   r0   r1   _truncate_leftx  s   (zBVoxtralRealtimeGeneration.embed_multimodal.<locals>._truncate_leftc                    s"   g | ]} j | j jqS r0   )r   compute_whisper_melspecri   rf   ).0r7   r   r0   r1   
<listcomp>  s    z>VoxtralRealtimeGeneration.embed_multimodal.<locals>.<listcomp>c                    s   g | ]} |d dqS )rZ   r"   r0   r   mel)r   r0   r1   r     s    c                 S   s   g | ]}|j d  qS r   r   r   r0   r0   r1   r     s    c                    s   g | ]}|  qS r0   r0   )r   s)conv_strider0   r1   r     s    r   rh   c                    s   g | ]} |d qS )r   r0   )r   r   )r   r   r0   r1   r     s    
c                    s,   g | ]}| |jd    |jd   qS r   )rt   rE   )r   e)r   r0   r1   r     s    r0   ) _parse_and_validate_audio_arraysr]   rp   rQ   r   forward_convtotal_stridesplitry   rC   r   )r-   r   audio_inputsmel_featuresseq_lensaudio_embeddingsaudio_embeddings_per_sampler0   )r   r   r   r-   r1   embed_multimodaln  sB   


	


z*VoxtralRealtimeGeneration.embed_multimodal	task_typec                 C   s&   t |}|jjj}|j}td |d dS )N)max_audio_clip_ssample_ratemin_energy_split_window_size)r!   rA   rB   rC   r   r   )r   r   r  rH   rC   r  r0   r0   r1   get_speech_to_text_config  s   
z3VoxtralRealtimeGeneration.get_speech_to_text_configr7   
stt_configr   )
transcribe	translaterequest_promptto_languagec                 C   sb   t |}t|t|jdd}t|jt||tj	d}	|j
|	}
t|
jd|
jd j|jfidS )Nr   r   )r   r7   r   r   r7   r   r   )r!   r   rQ   r  r   r   r   r   r   OFFLINErA   r   r   r<   rG   r   )r   r7   r   r	  r   r  r  r  rH   req	tokenizedr0   r0   r1   get_generation_prompt  s   z/VoxtralRealtimeGeneration.get_generation_promptr   )NN)$rM   rN   rO   requires_raw_input_tokensskip_warmup_audio_preprocessingr   rT   r,   classmethodr   r   r   r   QueuerP   rQ   r   r   r   r   rC   r]   rp   r   rR   r   r    objectrn   rS   r  r   r  r   r  rU   r0   r0   r.   r1   r      s    
2


4 
9	r   )Hr   r_   collections.abcr   r   typingr   numpyr   r]   &mistral_common.protocol.instruct.chunkr   -mistral_common.protocol.transcription.requestr   r   &mistral_common.tokens.tokenizers.audior   r	   vllm.compilation.decoratorsr
   vllm.configr   r   r   	vllm.envsr   vllm.inputs.datar   r   vllm.loggerr   %vllm.model_executor.models.interfacesr   r   "vllm.model_executor.models.voxtralr   r   r   r   vllm.multimodalr   vllm.multimodal.cacher   r   vllm.multimodal.inputsr   vllm.multimodal.parser   vllm.multimodal.processingr   $vllm.multimodal.processing.processorr   r   vllm.sequencer    vllm.tokenizersr!   utilsr#   rM   loggerr   r%   nnModulerV   rp   rQ   rw   rx   register_processorr   r0   r0   r0   r1   <module>   sL   ,Q