o
    ۷io`                  
   @   sz  U d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z	 d dl
mZ d dlmZ d dlZd dlZd dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlm Z m!Z! d dl"m#Z# ee$Z%dZ&dZ'e(de(de(de(de(de(de(de(dgZ)dhZ*e+e, e-d< h dZ.e+e, e-d< dZ/dZ0dZ1G dd deeZ2dS )     N)Any)urlparse)urlopen)Request)ResponseStreamingResponse)OpenAIServing)init_logger)random_uuid)
AudioMixin)CreateAudioOpenAICreateSpeechRequest)OmniRequestOutput   i   z127.0.0.0/8z
10.0.0.0/8z172.16.0.0/12z192.168.0.0/16z169.254.0.0/16z::1/128zfc00::/7z	fe80::/10	qwen3_tts_TTS_MODEL_STAGES>   AutoFrenchGermanKoreanChineseEnglishItalianRussianSpanishJapanese
Portuguese_TTS_LANGUAGESi     i   c                       s0  e Zd Z fddZdedB fddZdd Zdefd	d
Zde	e
 fddZdededB fddZdee
ef defddZdefddZdede
dB fddZede
deee ef fddZde
fddZedeedB e
dB f fddZdedee
ef fd d!Z	d%ded"edB fd#d$Z  ZS )&OmniOpenAIServingSpeechc                    sr   t  j|i | |  | _| jd u| _|  | _|  | _t	
dt| j dt| j  d | _|  | _d S )NzLoaded z supported speakers: )super__init___find_tts_stage
_tts_stage_is_tts _compute_max_instructions_length_max_instructions_length_load_supported_speakerssupported_speakersloggerinfolensorted_tts_tokenizer_load_codec_frame_rate_codec_frame_rate)selfargskwargs	__class__ a/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm_omni/entrypoints/openai/serving_speech.pyr!   @   s   


"z OmniOpenAIServingSpeech.__init__returnNc           
   
   C   sB  z[| j jj}tj|dd}tj|rZt|}t	|}W d   n1 s(w   Y  |
d}|
d}|rZ|rZ|dkrZt|t| }td|dd	| d
| d |W S W n tyu } ztd|  W Y d}~nd}~ww z | j jj}	t|	dd}|durtd| d t|W S W dS  ty   Y dS w )zPLoad codec frame rate from speech tokenizer config for prompt length estimation.speech_tokenizerzconfig.jsonNoutput_sample_rateencode_downsample_rater   zLoaded codec frame rate: z.1fz Hz (output_sample_rate=z, encode_downsample_rate=)z>Failed to load codec frame rate from speech tokenizer config: codec_frame_rate_hzz'Using codec frame rate from hf_config: z Hz)engine_clientmodel_configmodelospathjoinexistsopenjsonloadgetfloatr)   r*   	Exceptionwarning	hf_configgetattr)
r0   
model_pathst_config_pathf	st_config	output_sr
downsamplerateerK   r5   r5   r6   r.   R   sH   






z.OmniOpenAIServingSpeech._load_codec_frame_ratec                 C   s@   t | jdd}|du rdS |D ]}t |ddtv r|  S qdS )zHFind and return the TTS stage from the stage list, or None if not found.
stage_listNmodel_stagerL   r=   r   )r0   rU   stager5   r5   r6   r"   q   s   z'OmniOpenAIServingSpeech._find_tts_stagec                 C   sF   t | jdd}|dur|S | jdur!t | jdi }d|v r!|d S tS )zCompute max instructions length with precedence: CLI > stage config > default.

        Called once during initialization; result is cached in self._max_instructions_length.
        tts_max_instructions_lengthNtts_argsmax_instructions_length)rL   r=   r#   _TTS_MAX_INSTRUCTIONS_LENGTH)r0   cli_overriderZ   r5   r5   r6   r%   {   s   
z8OmniOpenAIServingSpeech._compute_max_instructions_lengthc              
   C   s   z-| j jjj}dD ]}t||d}|r$t|tr$dd | D   W S q	t	d W t S  t
yI } zt	d|  W Y d}~t S d}~ww )zHLoad supported speakers (case-insensitive) from the model configuration.)spk_id
speaker_idNc                 S   s   h | ]}|  qS r5   )lower).0speakerr5   r5   r6   	<setcomp>   s    zCOmniOpenAIServingSpeech._load_supported_speakers.<locals>.<setcomp>zBNo speakers found in talker_config (checked spk_id and speaker_id)z+Could not load speakers from model config: )r=   r>   rK   talker_configrL   
isinstancedictkeysr)   rJ   rI   set)r0   rd   	attr_namespeakers_dictrT   r5   r5   r6   r'      s   z0OmniOpenAIServingSpeech._load_supported_speakers	ref_audioc                 C   s.  | j du rdS z|}t|tr,|r,t|dkr!t|d ttfr!n|d }t|tr,|st|tr<t|dkr<|\}}nt|trLt|dkrL|\}}nW dS t|}t|dr]t|}nt|drr|jdkrl|j	d n|j	d }nW dS |dks}|dkrW dS || }t
|| j  W S  ty   Y dS w )a   Estimate ref_code length from ref_audio waveform without running the codec.

        The codec produces one frame per (output_sample_rate / encode_downsample_rate)
        audio samples, so ref_code_len = ceil(duration_seconds * codec_frame_rate).
        N   r   r   __len__shape)r/   re   listr+   intrH   tuplehasattrndimrn   mathceilrI   )r0   rk   itemwavsr	n_samplesdurationr5   r5   r6   _estimate_ref_code_len   s6   





 z.OmniOpenAIServingSpeech._estimate_ref_code_len
tts_paramsc           	   
      s   zJddl m}  jdu r ddlm}  jjj}|j|ddd _ jjj	}|j
}|dp/d	gd }|j|| fd
dt|ddt|dd jdW S  tyc } ztd| W Y d}~dS d}~ww )zHEstimate prompt length so the placeholder matches model-side embeddings.r   )&Qwen3TTSTalkerForConditionalGenerationN)AutoTokenizerTleft)trust_remote_codepadding_side	task_typeCustomVoicec                    s    j | ddd S )NF)padding	input_ids)r-   )tr0   r5   r6   <lambda>   s    z>OmniOpenAIServingSpeech._estimate_prompt_len.<locals>.<lambda>codec_language_idspk_is_dialect)additional_informationr   tokenize_promptr   r   estimate_ref_code_lenz=Failed to estimate TTS prompt length, using fallback 2048: %s   ):vllm_omni.model_executor.models.qwen3_tts.qwen3_tts_talkerr~   r-   transformersr   r=   r>   r?   from_pretrainedrK   rd   rG   /estimate_prompt_len_from_additional_informationrL   r|   rI   r)   rJ   )	r0   r}   r~   r   
model_namerK   rd   r   rT   r5   r   r6   _estimate_prompt_len   s4   





z,OmniOpenAIServingSpeech._estimate_prompt_lenc                 C   s:   t | jdd}|r|D ]}t |dd}|tv r dS qdS )z4Check if the current model is a supported TTS model.rU   NrV   TFrW   )r0   rU   rX   rV   r5   r5   r6   _is_tts_model   s   z%OmniOpenAIServingSpeech._is_tts_modelrequestc                 C   sh  |j pd}|jdur|j |_|jr|j sdS |jdur2|jtvr2d|j ddtt S |dkrS|jdurS| j	rS|j| j	vrSd|j ddt| j	 S |dkrl|j
du r^d	S |j
d
sl|j
dsldS |dkr~|jdurwdS |jdur~dS |dkr|jsdS |jrt|j| jkrd| j dS |jdur|jtk rdt S |jtkrdt S dS )z?Validate TTS request parameters. Returns error message or None.r   NzInput text cannot be emptyzInvalid language 'z'. Supported: z, zInvalid speaker 'Basez0Base task requires 'ref_audio' for voice cloning)zhttp://zhttps://data:zBref_audio must be a URL (http/https) or base64 data URL (data:...)z&'ref_text' is only valid for Base taskz0'x_vector_only_mode' is only valid for Base taskVoiceDesignz>VoiceDesign task requires 'instructions' to describe the voicezInstructions too long (max z characters)z max_new_tokens must be at least zmax_new_tokens cannot exceed )r   voicer`   inputstriplanguager   rB   r,   r(   rk   
startswithref_textx_vector_only_modeinstructionsr+   r&   max_new_tokens_TTS_MAX_NEW_TOKENS_MIN_TTS_MAX_NEW_TOKENS_MAX)r0   r   r   r5   r5   r6   _validate_tts_request   s<   









z-OmniOpenAIServingSpeech._validate_tts_requestref_audio_strc                    sf   t dtddfdd dttjtf f fdd}t }|d|I dH \}}|	 |fS )z;Resolve ref_audio URL/base64 to (wav_samples, sample_rate).urlr7   Nc                    sv   t | j}|stdt|d D ]'}t|d d ddd }t| t	 fddt
D r8td  qd S )	Nz%ref_audio URL must include a hostname   r   %r   c                 3   s    | ]} |v V  qd S Nr5   )ra   netaddrr5   r6   	<genexpr>-  s    zROmniOpenAIServingSpeech._resolve_ref_audio.<locals>._check_ssrf.<locals>.<genexpr>z+ref_audio URL resolves to blocked address: )r   hostname
ValueErrorsocketgetaddrinfostrsplit	ipaddress
ip_addressany_REF_AUDIO_BLOCKED_NETWORKS)r   hostr*   ip_strr5   r   r6   _check_ssrf&  s   

z?OmniOpenAIServingSpeech._resolve_ref_audio.<locals>._check_ssrfc                     s  j dv r:  ttd} | td }t|tkr%tdt dW d    n1 s/w   Y  t|}n 	drV}d|v rM|
ddd }tt|}ntdtj|d	d
d\}}t|tjrv|jdkrvtj|dd}tj|tjdt|fS )N)httphttps)timeoutr   zref_audio URL exceeds z bytesr   ,z4ref_audio must be an http(s) URL or data: base64 URIfloat32F)dtype	always_2dro   )axis)r   )schemer   _REF_AUDIO_TIMEOUT_Sread_REF_AUDIO_MAX_BYTESr+   r   ioBytesIOr   r   base64	b64decodesfre   npndarrayrt   meanasarrayr   rq   )respdatabufb64audiory   r   parsedr   r5   r6   _fetch_sync0  s&   

z?OmniOpenAIServingSpeech._resolve_ref_audio.<locals>._fetch_sync)
r   r   rr   r   r   rq   asyncioget_running_looprun_in_executortolist)r   r   loopwav_npry   r5   r   r6   _resolve_ref_audio!  s    
z*OmniOpenAIServingSpeech._resolve_ref_audio
request_idc              
   C  sx  d}d}z|2 z3 dH W }|  |\}}|du rq|d}|dur=t|tr.|r.|d n|}	t|	dr9|	 nt|	}|| }
t|
trQ|
|d }t|
}n|
dur]|
g}|d7 }ng }|D ]-}t|drr| 	 
  n|}|jdkr}| }t||d	d
ddd}| |jV  qaq6 W dS  tjy   td|   ty } ztd|| W Y d}~dS d}~ww )a  Generate PCM audio chunks for streaming response.

        Handles two audio output modes from the engine:
        - Cumulative mode (list): Engine returns growing list of chunks;
        we emit only the new tail on each iteration.
        - Per-step mode (tensor): Engine returns single tensor per iteration;
        we emit it directly.

        Args:
            generator: Async generator from the engine
            request_id: Request identifier for logging

        Yields:
            Raw PCM bytes for each audio chunk
        r   ]  Nry   ro   rw   r   rH   pcm      ?r   Faudio_tensorsample_rateresponse_formatspeedstream_formatbase64_encodez(Streaming request %s cancelled by clientz-Streaming speech generation failed for %s: %s)_extract_audio_outputrG   re   rp   rs   rw   rq   r+   rH   detachcpunumpyrt   squeezer   create_audio
audio_datar   CancelledErrorr)   r*   rI   	exception)r0   	generatorr   
prev_countsample_rate_valresaudio_output	audio_keysr_rawsr_val	audio_val
new_chunkschunk_tensorchunk_np	audio_objrT   r5   r5   r6   _generate_pcm_chunksH  sV   



 
&z,OmniOpenAIServingSpeech._generate_pcm_chunksc                 C   s\   t | dd}|st | dd}|rt |ddnd}|sdS d|v r"dnd|v r(dnd}||fS )zReturn (audio_output dict, audio key) or (None, None).

        Returns the raw dict so callers can apply their own extraction strategy:
        streaming needs per-chunk delta slicing; non-streaming needs full concatenation.
        multimodal_outputNrequest_output)NNr   model_outputs)rL   )r   mmrokeyr5   r5   r6   r     s   z-OmniOpenAIServingSpeech._extract_audio_outputc                 C   s  i }|j g|d< |jdur|jg|d< ndg|d< |jdur%|jg|d< ndg|d< |jdur6|jg|d< n|d d dkrCd	g|d< |jdurO|jg|d
< ndg|d
< |jdur_|jg|d< |jdurj|jg|d< |jdurv|jg|d< ndg|d< |d d dkrdg|d< |S )zBuild TTS parameters from request.

        Processes each parameter if present, skips if not.
        Values are wrapped in lists as required by the model.
        textNr   r   r   r   rb   r   Vivianinstruct r   r   r   r   r   Tnon_streaming_mode)r   r   r   r   r   r   r   r   )r0   r   paramsr5   r5   r6   _build_tts_params  s2   












z)OmniOpenAIServingSpeech._build_tts_paramsraw_requestc              
      s  |  |I dH }|durtd| |S | jjr| jjdt  }z| jr_| |}|r5| 	|W S | 
|}|jdurQ| |jI dH \}}||gg|d< | |}	dg|	 |d}
ni }d|ji}
td|t|jd	krz|jdd	 d
 n|j|ddgd  | jj}| jj|
||dgd}|jrt| ||ddW S d}|2 z3 dH W }|}q6 |du r| 	dW S | |\}}|du r| 	dW S || }|dd}t|tr|r|d n|}t|dr| nt|}t|trddl}|j|dd}t|dr|   ! " }|j#dkr|$ }t%|||j&p#d|j'p(d|j(dd}| )|}t*|j+|j,dW S  t-j.yK   | 	d Y S  t/yb } z| 	|W  Y d}~S d}~w t0y } zt1d | | 	d!| W  Y d}~S d}~ww )"a  
        Create Speech API similar to OpenAI's API.

        See https://platform.openai.com/docs/api-reference/audio/createSpeech
        for the API specification. This API mimics the OpenAI
        Create Speech API.

        For Qwen3-TTS models, additional parameters are supported:
        - task_type: "CustomVoice", "VoiceDesign", or "Base"
        - language: Language code (e.g., "Chinese", "English", "Auto")
        - voice: Speaker name (e.g., "Vivian", "Ryan") for CustomVoice
        - instructions: Voice style/emotion instructions
        - ref_audio: Reference audio for voice cloning (Base task)
        - ref_text: Transcript of reference audio (Base task)
        - x_vector_only_mode: Use speaker embedding only (Base task)

        Streaming is supported via stream=True with response_format='pcm'.
        Each Code2Wav chunk is yielded as raw PCM bytes as soon as it is decoded.
        NzError with model %szspeech-rk   r   )prompt_token_idsr   promptz,TTS speech request %s: text=%r, task_type=%s2   z...r   unknownr   r   )r  r   sampling_params_listoutput_modalitiesz	audio/pcm)
media_typez#No output generated from the model.z'TTS model did not produce audio output.ry   r   ro   rw   )dimrH   rx   r   Fr   )contentr  zClient disconnectedzSpeech generation failed: %szSpeech generation failed: )2_check_modelr)   errorr=   errored
dead_errorr
   r$   r   create_error_responser  rk   r   r   r   r*   r+   rG   default_sampling_params_listgeneratestreamr   r  r   re   rp   rs   rw   rq   torchcatrH   r   r   r   rt   r   r   r   r   r   r   r   r   r  r   r   r   rI   r   )r0   r   r  error_check_retr   validation_errorr}   wav_listry   ph_lenr  r  r   final_outputr   r   r   r   r   r   r   r"  r  audio_responserT   r5   r5   r6   create_speech  s   




$



z%OmniOpenAIServingSpeech.create_speechr   )__name__
__module____qualname__r!   rH   r.   r"   rq   r%   rh   r   r'   objectr|   rf   r   r   boolr   r   r   staticmethodrr   rp   r   r  r   r  r   r*  __classcell__r5   r5   r3   r6   r   ?   s,    
#
5 &? 9r   )3r   r   r   r   rE   ru   r@   r   typingr   urllib.parser   urllib.requestr   r   r   	soundfiler   fastapir   fastapi.responsesr   r   &vllm.entrypoints.openai.engine.servingr   vllm.loggerr	   
vllm.utilsr
   .vllm_omni.entrypoints.openai.audio_utils_mixinr   +vllm_omni.entrypoints.openai.protocol.audior   r   vllm_omni.outputsr   r+  r)   r   r   
ip_networkr   r   rh   r   __annotations__r   r\   r   r   r   r5   r5   r5   r6   <module>   sN   
 