o
    ۷i                     @   s   d dl Z d dlZd dlZd dlmZ d dlmZmZm	Z	m
Z
mZmZ d dlmZ d dlZd dlZd dlZd dlZd dlmZmZmZ ddlmZmZmZ eeejeejef f Z eee	e f Z!eG dd	 d	Z"G d
d dZ#dS )    N)	dataclass)AnyDictListOptionalTupleUnion)urlparse)
AutoConfig	AutoModelAutoProcessor   )Qwen3TTSConfig Qwen3TTSForConditionalGenerationQwen3TTSProcessorc                   @   sJ   e Zd ZU dZeej ed< ejed< eed< eed< dZ	ee
 ed< dS )VoiceClonePromptItemz
    Container for one sample's voice-clone prompt information that can be fed to the model.

    Fields are aligned with `Qwen3TTSForConditionalGeneration.generate(..., voice_clone_prompt=...)`.
    ref_coderef_spk_embeddingx_vector_only_modeicl_modeNref_text)__name__
__module____qualname____doc__r   torchTensor__annotations__boolr   str r    r    X/home/ubuntu/vllm_env/lib/python3.10/site-packages/qwen_tts/inference/qwen3_tts_model.pyr   (   s   
 
r   c                   @   s  e Zd ZdZdTdedeeeef  fddZ	e
dedd fd	d
Zdee fddZdee fddZdee ddfddZdeee  ddfddZdedefddZdedefddZdedefddZdedeejef fddZd eeee f deeejef  fd!d"Zdedee fd#d$Z d%edefd&d'Z!d%edefd(d)Z"d*edefd+d,Z#d-ee dee$j% fd.d/Z&										dUd0ee d1ee d2ee' d3ee' d4ee' d5ee d6ee d7ee' d8ee' d9ee deeef fd:d;Z(e$) 		<dVd=eeee f d>eeeeee  f  d?eeee f dee* fd@dAZ+dBee* deeef fdCdDZ,e$- 				<		<dWd%eeee f dEeeee f d=eeeee f  d>eeeeee  f  d?eeee f dFeeeeef ee* f  dGedeeej ef fdHdIZ.e$- 		JdXd%eeee f d*eeee f dEeeee f dGedeeej ef f
dKdLZ/e$- 			JdYd%eeee f dMeeee f dEeeee f d*eeeee f  dGedeeej ef fdNdOZ0deee  fdPdQZ1deee  fdRdSZ2dS )ZQwen3TTSModela  
    A HuggingFace-style wrapper for Qwen3 TTS models (CustomVoice/VoiceDesign/Base) that provides:
      - from_pretrained() initialization via AutoModel/AutoProcessor
      - generation APIs for:
          * CustomVoice: generate_custom_voice()
          * VoiceDesign: generate_voice_design()
          * Base: generate_voice_clone() + create_voice_clone_prompt()
      - consistent output: (wavs: List[np.ndarray], sample_rate: int)

    Notes:
      - This wrapper expects the underlying model class to be `Qwen3TTSForConditionalGeneration`
      - Language / speaker validation is done via model methods:
          model.get_supported_languages(), model.get_supported_speakers()
    Nmodelgenerate_defaultsc                 C   sj   || _ || _|p	i | _t|dd | _| jd u r3zt| j| _W d S  ty2   td| _Y d S w d S )Ndevicecpu)	r#   	processorr$   getattrr%   next
parametersStopIterationr   )selfr#   r'   r$   r    r    r!   __init__F   s   

zQwen3TTSModel.__init__pretrained_model_name_or_pathreturnc                 K   sv   t dt ttt ttt tj|fi |}t|ts*t	dt
| dtj|dd}|j}| |||dS )a  
        Load a Qwen3 TTS model and its processor in HuggingFace `from_pretrained` style.

        This method:
          1) Loads config via AutoConfig (so your side can register model_type -> config/model).
          2) Loads the model via AutoModel.from_pretrained(...), forwarding `kwargs` unchanged.
          3) Loads the processor via AutoProcessor.from_pretrained(model_path).
          4) Loads optional `generate_config.json` from the model directory/repo snapshot if present.

        Args:
            pretrained_model_name_or_path (str):
                HuggingFace repo id or local directory of the model.
            **kwargs:
                Forwarded as-is into `AutoModel.from_pretrained(...)`.
                Typical examples: device_map="cuda:0", dtype=torch.bfloat16, attn_implementation="flash_attention_2".

        Returns:
            Qwen3TTSModel:
                Wrapper instance containing `model`, `processor`, and generation defaults.
        	qwen3_ttszAutoModel returned z-, expected Qwen3TTSForConditionalGeneration. T)fix_mistral_regex)r#   r'   r$   )r
   registerr   r   r   r   r   from_pretrained
isinstance	TypeErrortypegenerate_config)clsr.   kwargsr#   r'   r$   r    r    r!   r3   R   s   
zQwen3TTSModel.from_pretrainedc                 C   >   t | jdd }t|r| }|d u rd S tdd |D S d S )Nget_supported_languagesc                 S      g | ]}t | qS r    r   lower.0xr    r    r!   
<listcomp>       z:Qwen3TTSModel._supported_languages_set.<locals>.<listcomp>r(   r#   callableset)r,   langsvr    r    r!   _supported_languages_set{      z&Qwen3TTSModel._supported_languages_setc                 C   r:   )Nget_supported_speakersc                 S   r<   r    r=   r?   r    r    r!   rB      rC   z9Qwen3TTSModel._supported_speakers_set.<locals>.<listcomp>rD   )r,   spksrH   r    r    r!   _supported_speakers_set   rJ   z%Qwen3TTSModel._supported_speakers_set	languagesc                 C   sp   |   }|du r
dS g }|D ]}|du r|| qt| |vr'|| q|r6td| dt| dS )z
        Validate that requested languages are supported by the model.

        Args:
            languages (List[str]): Language names for each sample.

        Raises:
            ValueError: If any language is not supported.
        NzUnsupported languages: . Supported: )rI   appendr   r>   
ValueErrorsorted)r,   rN   	supportedbadlangr    r    r!   _validate_languages   s   


z!Qwen3TTSModel._validate_languagesspeakersc                 C   sn   |   }|du r
dS g }|D ]}|du s|dkrqt| |vr&|| q|r5td| dt| dS )z
        Validate that requested speakers are supported by the Instruct model.

        Args:
            speakers (List[Optional[str]]): Speaker names for each sample.

        Raises:
            ValueError: If any speaker is not supported.
        N zUnsupported speakers: rO   )rM   r   r>   rP   rQ   rR   )r,   rW   rS   rT   spkr    r    r!   _validate_speakers   s   

z Qwen3TTSModel._validate_speakerssc                 C   s2   | drdS d|vrd|vrt|dkrdS dS )Nz
data:audioT/\   F)
startswithlen)r,   r[   r    r    r!   _is_probably_base64   s
   
z!Qwen3TTSModel._is_probably_base64c                 C   s4   zt |}|jdv ot|jW S  ty   Y dS w )N)httphttpsF)r	   schemer   netloc	Exception)r,   r[   ur    r    r!   _is_url   s   zQwen3TTSModel._is_urlb64c                 C   s0   d|v r|  dr|ddd }t|S )N,zdata:   )stripr_   splitbase64	b64decode)r,   ri   r    r    r!   _decode_base64_to_wav_bytes   s   
z)Qwen3TTSModel._decode_base64_to_wav_bytesrA   c                 C   s  |  |r?tj|}| }W d    n1 sw   Y  t|}tj|ddd\}}W d    n1 s9w   Y  n4| |ri| 	|}t|}tj|ddd\}}W d    n1 scw   Y  n
t
j|d dd\}}|jdkrtj|dd}|tjt|fS )	Nfloat32F)dtype	always_2dT)srmonork   axis)rh   urllibrequesturlopenreadioBytesIOsfra   rp   librosaloadndimnpmeanastyperq   int)r,   rA   respaudio_bytesfaudiort   	wav_bytesr    r    r!   _load_audio_to_np   s$   




zQwen3TTSModel._load_audio_to_npaudiosc                 C   s  t |tr|}n|g}g }|D ]G}t |tr|| | qt |trDt|dkrDt |d tjrD||d 	tj
t|d f qt |tjrNtdtdt| t|D ]$\}}|d jdkrtj|d dd	tj
|d< |d |d f||< q[|S )a  
        Normalize audio inputs into a list of (waveform, sr).

        Supported forms:
          - str: wav path / URL / base64 audio string
          - (np.ndarray, sr): waveform + sampling rate
          - list of the above

        Args:
            audios:
                Audio input(s).

        Returns:
            List[Tuple[np.ndarray, int]]:
                List of (float32 waveform, original sr).

        Raises:
            ValueError: If a numpy waveform is provided without sr.
        r   r   rk   z3For numpy waveform input, pass a tuple (audio, sr).zUnsupported audio input type: rv   rw   )r4   listr   rP   r   tupler`   r   ndarrayr   rq   r   rQ   r5   r6   	enumerater   r   )r,   r   itemsoutair    r    r!   _normalize_audio_inputs   s$   

&$z%Qwen3TTSModel._normalize_audio_inputsc                 C   s   t |tr|S |gS N)r4   r   )r,   rA   r    r    r!   _ensure_list
  s   zQwen3TTSModel._ensure_listtextc                 C      d| dS )N<|im_start|>assistant
z!<|im_end|>
<|im_start|>assistant
r    r,   r   r    r    r!   _build_assistant_text     z#Qwen3TTSModel._build_assistant_textc                 C   r   )Nr   <|im_end|>
r    r   r    r    r!   _build_ref_text  r   zQwen3TTSModel._build_ref_textinstructc                 C   r   )Nz<|im_start|>user
r   r    )r,   r   r    r    r!   _build_instruct_text  r   z"Qwen3TTSModel._build_instruct_texttextsc                 C   sV   g }|D ]$}| j |ddd}|d | j}| dkr!|dn|}|| q|S )NptT)r   return_tensorspadding	input_idsrk   r   )r'   tor%   dim	unsqueezerP   )r,   r   r   r   inputinput_idr    r    r!   _tokenize_texts  s   zQwen3TTSModel._tokenize_texts	do_sampletop_ktop_ptemperaturerepetition_penaltysubtalker_dosamplesubtalker_top_ksubtalker_top_psubtalker_temperaturemax_new_tokensc                    s   t ddddddddddd
 dtd	td
tf fdd}t |}|j|d||d||d||d||d||d||d||d||d|	|d|
d
 |S )a  
        Merge user-provided generation arguments with defaults from `generate_config.json`.

        Rule:
          - If the user explicitly passes a value (not None), use it.
          - Otherwise, use the value from generate_config.json if present.
          - Otherwise, fall back to the hard defaults.

        Args:
            do_sample, top_k, top_p, temperature, repetition_penalty,
            subtalker_dosample, subtalker_top_k, subtalker_top_p, subtalker_temperature, max_new_tokens:
                Common generation parameters.
            **kwargs:
                Other arguments forwarded to model.generate().

        Returns:
            Dict[str, Any]: Final kwargs to pass into model.generate().
        T2   g      ?g?g?i   )
r   r   r   r   r   r   r   r   r   r   nameuser_valr/   c                    s(   |d ur|S | j v rj |  S  |  S r   )r$   )r   r   hard_defaultsr,   r    r!   pickL  s
   

z2Qwen3TTSModel._merge_generate_kwargs.<locals>.pickr   r   r   r   r   r   r   r   r   r   )dictr   r   update)r,   r   r   r   r   r   r   r   r   r   r   r9   r   mergedr    r   r!   _merge_generate_kwargs  s6    z$Qwen3TTSModel._merge_generate_kwargsF	ref_audior   r   c              
   C   s  | j jdkrtd| j j d| j j d| j j d| |}t|tr)| |n|gt| }t|tr:| |n|gt| }t|t|ksQt|t|krdtdt| dt| dt| | 	|}g }g }	|D ]\}
}|
|
 |	
| qott|	d	kr| j jj||	d
 d}|j}ng }|D ]\}
}|
| j jj|
|djd
  qg }tt||||D ]R\}\\}
}}}}|s|du s|dkrtd| |
}|| j jkrtj|tjt|| j jd}| j j|| j jd}|
t|rdn||t|t| |d q|S )a  
        Build voice-clone prompt items from reference audio (and optionally reference text) using Base model.

        Modes:
          - x_vector_only_mode=True:
              Only speaker embedding is used to clone voice; ref_text/ref_code are ignored.
              This is mutually exclusive with ICL.
          - x_vector_only_mode=False:
              ICL mode is enabled automatically (icl_mode=True). In this case ref_text is required,
              because the model continues/conditions on the reference text + reference speech codes.

        Batch behavior:
          - ref_audio can be a single item or a list.
          - ref_text and x_vector_only_mode can be scalars or lists.
          - If any of them are lists with length > 1, lengths must match.

        Audio input:
          - str: local wav path / URL / base64
          - (np.ndarray, sr): waveform + sampling rate

        Args:
            ref_audio:
                Reference audio(s) used to extract:
                  - ref_code via `model.speech_tokenizer.encode(...)`
                  - ref_spk_embedding via `model.extract_speaker_embedding(...)` (resampled to 24k)
            ref_text:
                Reference transcript(s). Required when x_vector_only_mode=False (ICL mode).
            x_vector_only_mode:
                Whether to use speaker embedding only. If False, ICL mode will be used.

        Returns:
            List[VoiceClonePromptItem]:
                List of prompt items that can be converted into `voice_clone_prompt` dict.

        Raises:
            ValueError:
                - If x_vector_only_mode=False but ref_text is missing.
                - If batch lengths mismatch.
        basemodel with 
tokenizer_type: 
tts_model_size: 
tts_model_type: z`
does not support create_voice_clone_prompt, Please check Model Card or Readme for more details.zBatch size mismatch: ref_audio=z, ref_text=z, x_vector_only_mode=rk   r   )rt   NrX   zIref_text is required when x_vector_only_mode=False (ICL mode). Bad index=)yorig_sr	target_sr)r   rt   )r   r   r   r   r   )r#   tts_model_typerQ   tokenizer_typetts_model_sizer   r4   r   r`   r   rP   rF   speech_tokenizerencodeaudio_codesr   zipspeaker_encoder_sample_rater   resampler   r   rq   r   extract_speaker_embeddingr   r   )r,   r   r   r   ref_audio_listref_text_list	xvec_list
normalizedref_wavs_for_coderef_sr_for_codewavrt   enc	ref_codesr   r   codertext	xvec_onlywav_resamplespk_embr    r    r!   create_voice_clone_promptc  sf   .
""  

 &
	z'Qwen3TTSModel.create_voice_clone_promptr   c                 C   s8   t dd |D dd |D dd |D dd |D dS )Nc                 S      g | ]}|j qS r    )r   r@   itr    r    r!   rB         zEQwen3TTSModel._prompt_items_to_voice_clone_prompt.<locals>.<listcomp>c                 S   r   r    )r   r   r    r    r!   rB     r   c                 S   r   r    )r   r   r    r    r!   rB     r   c                 S   r   r    )r   r   r    r    r!   rB     r   )r   r   r   r   )r   )r,   r   r    r    r!   #_prompt_items_to_voice_clone_prompt  s   z1Qwen3TTSModel._prompt_items_to_voice_clone_promptlanguagevoice_clone_promptnon_streaming_modec           !         s   j jdkrtd j j d j j d j j d |}	t|tr) |n|dur4|gt|	 ndgt|	 }
t|
dkrMt|	dkrM|
t|	 }
t|	t|
krctd	t|	 d
t|
  	|
 |du r|du rttd j
|||d}t|dkrt|	dkr|t|	 }t|t|	krtdt| dt|	  |}dd |D }n@t|tr|}t|dkrt|	dkr|t|	 }t|t|	krtdt| dt|	  |}dd |D }n|}d} fdd|	D } |}d}|dur3g }t|D ]&\}}|du s|dkr!|d q  |gd }|| q jdi |} j jd||||
|d|\}}g }t|D ]0\}}|dd}|dur||| dur||tj|| |j|gdd qR|| qR j jdd |D \}}g }t|D ]G\}}|dd}|dur|| durt|| jd }t|| jd }t|t|d |jd  } ||| d  q|| q||fS )a  
        Voice clone speech using the Base model.

        You can provide either:
          - (ref_audio, ref_text, x_vector_only_mode) and let this method build the prompt, OR
          - `VoiceClonePromptItem` returned by `create_voice_clone_prompt`, OR
          - a list of `VoiceClonePromptItem` returned by `create_voice_clone_prompt`.
        
        `ref_audio` Supported forms:
        - str: wav path / URL / base64 audio string
        - (np.ndarray, sr): waveform + sampling rate
        - list of the above

        Input flexibility:
          - text/language can be scalar or list.
          - prompt can be single or batch.
          - If batch mode (len(text)>1), lengths must match.

        Args:
            text:
                Text(s) to synthesize.
            language:
                Language(s) for each sample.
            ref_audio:
                Reference audio(s) for prompt building. Required if voice_clone_prompt is not provided.
            ref_text:
                Reference text(s) used for ICL mode (required when x_vector_only_mode=False).
            x_vector_only_mode:
                If True, only speaker embedding is used (ignores ref_text/ref_code).
                If False, ICL mode is used automatically.
            voice_clone_prompt:
                list[VoiceClonePromptItem] from `create_voice_clone_prompt`.
            non_streaming_mode:
                Using non-streaming text input, this option currently only simulates streaming text input when set to `false`, 
                rather than enabling true streaming input or streaming generation.
            do_sample:
                Whether to use sampling, recommended to be set to `true` for most use cases.
            top_k:
                Top-k sampling parameter.
            top_p:
                Top-p sampling parameter.
            temperature:
                Sampling temperature; higher => more random.
            repetition_penalty:
                Penalty to reduce repeated tokens/codes.
            subtalker_dosample:
                Sampling switch for the sub-talker (only valid for qwen3-tts-tokenizer-v2) if applicable.
            subtalker_top_k:
                Top-k for sub-talker sampling (only valid for qwen3-tts-tokenizer-v2).
            subtalker_top_p:
                Top-p for sub-talker sampling (only valid for qwen3-tts-tokenizer-v2).
            subtalker_temperature:
                Temperature for sub-talker sampling (only valid for qwen3-tts-tokenizer-v2).
            max_new_tokens:
                Maximum number of new codec tokens to generate.
            **kwargs:
                Any other keyword arguments supported by HuggingFace Transformers `generate()` can be passed.
                They will be forwarded to the underlying `Qwen3TTSForConditionalGeneration.generate(...)`.

        Returns:
            Tuple[List[np.ndarray], int]:
                (wavs, sample_rate)

        Raises:
            ValueError:
                If batch sizes mismatch or required prompt inputs are missing.
        r   r   r   r   z[
does not support generate_voice_clone, Please check Model Card or Readme for more details.NAutork   Batch size mismatch: text=, language=z<Either `voice_clone_prompt` or `ref_audio` must be provided.)r   r   r   zBatch size mismatch: prompt=z, text=c                 S   r   r    r   r   r    r    r!   rB   >  r   z6Qwen3TTSModel.generate_voice_clone.<locals>.<listcomp>c                 S   r   r    r   r   r    r    r!   rB   G  r   c                       g | ]}  |qS r    r   r@   tr,   r    r!   rB   L      rX   r   )r   ref_idsr   rN   r   r   )r   c                 S      g | ]}d |iqS r   r    r@   cr    r    r!   rB   l      r    )r#   r   rQ   r   r   r   r4   r   r`   rV   r   r   r   r   rP   r   r   generategetr   catr   r%   r   decoder   shapemax)!r,   r   r   r   r   r   r   r   r9   r   rN   prompt_itemsvoice_clone_prompt_dictref_texts_for_idsinput_textsr   r   r   rtref_tok
gen_kwargstalker_codes_list_codes_for_decodecodesref_code_listwavs_allfswavs_outr   ref_len	total_lencutr    r   r!   generate_voice_clone  s   O
8






	(z"Qwen3TTSModel.generate_voice_cloneTc                    s   j jdkrtd j j d j j d j j d |}t|tr) |n|dur4|gt| ndgt| } |}t|dkrRt|dkrR|t| }t|dkrdt|dkrd|t| }t|t|  krtt|ksn td	t| d
t| dt|  	|  
 fdd|D }	g }
|D ]}|du s|dkr|
d q|
 
 |gd  q jdi |} j jd|	|
||d|\}} j jdd |D \}}||fS )a  
        Generate speech with the VoiceDesign model using natural-language style instructions.

        Args:
            text:
                Text(s) to synthesize.
            language:
                Language(s) for each sample.
            instruct:
                Instruction(s) describing desired voice/style. Empty string is allowed (treated as no instruction).
            non_streaming_mode:
                Using non-streaming text input, this option currently only simulates streaming text input when set to `false`, 
                rather than enabling true streaming input or streaming generation.
            do_sample:
                Whether to use sampling, recommended to be set to `true` for most use cases.
            top_k:
                Top-k sampling parameter.
            top_p:
                Top-p sampling parameter.
            temperature:
                Sampling temperature; higher => more random.
            repetition_penalty:
                Penalty to reduce repeated tokens/codes.
            subtalker_dosample:
                Sampling switch for the sub-talker (only valid for qwen3-tts-tokenizer-v2) if applicable.
            subtalker_top_k:
                Top-k for sub-talker sampling (only valid for qwen3-tts-tokenizer-v2).
            subtalker_top_p:
                Top-p for sub-talker sampling (only valid for qwen3-tts-tokenizer-v2).
            subtalker_temperature:
                Temperature for sub-talker sampling (only valid for qwen3-tts-tokenizer-v2).
            max_new_tokens:
                Maximum number of new codec tokens to generate.
            **kwargs:
                Any other keyword arguments supported by HuggingFace Transformers `generate()` can be passed.
                They will be forwarded to the underlying `Qwen3TTSForConditionalGeneration.generate(...)`.

        Returns:
            Tuple[List[np.ndarray], int]:
                (wavs, sample_rate)
        voice_designr   r   r   z\
does not support generate_voice_design, Please check Model Card or Readme for more details.Nr   rk   r   r   , instruct=c                    r   r    r   r   r   r    r!   rB     r   z7Qwen3TTSModel.generate_voice_design.<locals>.<listcomp>rX   r   )r   instruct_idsrN   r   c                 S   r   r   r    r   r    r    r!   rB     r   r    )r#   r   rQ   r   r   r   r4   r   r`   rV   r   rP   r   r   r   r   r   )r,   r   r   r   r   r9   r   rN   	instructsr   r  insr  r  r	  wavsr  r    r   r!   generate_voice_design|  sF   2
8
"&

z#Qwen3TTSModel.generate_voice_designspeakerc              
      s\   j jdkrtd j j d j j d j j d |}t|tr) |n|dur4|gt| ndgt| } |}	 j jdv rHd}t|trR |n|dur]|gt| nd	gt| }
t|d
krvt|d
krv|t| }t|	d
krt|d
kr|	t| }	t|
d
krt|d
kr|
t| }
t|t|  krt|	  krt|
ksn tdt| dt| dt|	 dt|
  	|  
|	   fdd|D }g }|
D ]}|du s|d	kr|d q|  |gd  q jdi |} j jd||||	|d|\}} j jdd |D \}}||fS )a  
        Generate speech with the CustomVoice model using a predefined speaker id, optionally controlled by instruction text.

        Args:
            text:
                Text(s) to synthesize.
            language:
                Language(s) for each sample.
            speaker:
                Speaker name(s). Will be validated against `model.get_supported_speakers()` (case-insensitive).
            instruct:
                Optional instruction(s). If None, treated as empty (no instruction).
            non_streaming_mode:
                Using non-streaming text input, this option currently only simulates streaming text input when set to `false`, 
                rather than enabling true streaming input or streaming generation.
            do_sample:
                Whether to use sampling, recommended to be set to `true` for most use cases.
            top_k:
                Top-k sampling parameter.
            top_p:
                Top-p sampling parameter.
            temperature:
                Sampling temperature; higher => more random.
            repetition_penalty:
                Penalty to reduce repeated tokens/codes.
            subtalker_dosample:
                Sampling switch for the sub-talker (only valid for qwen3-tts-tokenizer-v2) if applicable.
            subtalker_top_k:
                Top-k for sub-talker sampling (only valid for qwen3-tts-tokenizer-v2).
            subtalker_top_p:
                Top-p for sub-talker sampling (only valid for qwen3-tts-tokenizer-v2).
            subtalker_temperature:
                Temperature for sub-talker sampling (only valid for qwen3-tts-tokenizer-v2).
            max_new_tokens:
                Maximum number of new codec tokens to generate.
            **kwargs:
                Any other keyword arguments supported by HuggingFace Transformers `generate()` can be passed.
                They will be forwarded to the underlying `Qwen3TTSForConditionalGeneration.generate(...)`.

        Returns:
            Tuple[List[np.ndarray], int]:
                (wavs, sample_rate)

        Raises:
            ValueError:
                If any speaker/language is unsupported or batch sizes mismatch.
        custom_voicer   r   r   z\
does not support generate_custom_voice, Please check Model Card or Readme for more details.Nr   0b6rX   rk   r   r   z
, speaker=r  c                    r   r    r   r   r   r    r!   rB   2  r   z7Qwen3TTSModel.generate_custom_voice.<locals>.<listcomp>r   )r   r  rN   rW   r   c                 S   r   r   r    r   r    r    r!   rB   F  r   r    )r#   r   rQ   r   r   r   r4   r   r`   rV   rZ   r   rP   r   r   r   r   r   )r,   r   r  r   r   r   r9   r   rN   rW   r  r   r  r  r  r  r	  r  r  r    r   r!   generate_custom_voice  sX   9
8
80*


	z#Qwen3TTSModel.generate_custom_voicec                 C      |   }|du r
dS t|S )a  
        List supported speaker names for the current model.

        This is a convenience wrapper around `model.get_supported_speakers()`.
        If the underlying model does not expose speaker constraints (returns None),
        this method also returns None.

        Returns:
            Optional[List[str]]:
                - A sorted list of supported speaker names (lowercased), if available.
                - None if the model does not provide supported speakers.
        N)rM   rR   r,   rS   r    r    r!   rK   J     z$Qwen3TTSModel.get_supported_speakersc                 C   r  )a  
        List supported language names for the current model.

        This is a convenience wrapper around `model.get_supported_languages()`.
        If the underlying model does not expose language constraints (returns None),
        this method also returns None.

        Returns:
            Optional[List[str]]:
                - A sorted list of supported language names (lowercased), if available.
                - None if the model does not provide supported languages.
        N)rI   rR   r   r    r    r!   r;   ]  r!  z%Qwen3TTSModel.get_supported_languagesr   )
NNNNNNNNNN)NF)NNNFNF)NT)NNT)3r   r   r   r   r   r   r   r   r   r-   classmethodr3   rF   rI   rM   r   rV   rZ   r   ra   rh   bytesrp   r   r   r   r   r   r   	AudioLiker   	MaybeListr   r   r   r   r   r   r   floatr   inference_moder   r   r   no_gradr  r  r  rK   r;   r    r    r    r!   r"   6   s     (		,)	


Dh	
 '^nr"   )$rn   r}   urllib.requestry   dataclassesr   typingr   r   r   r   r   r   urllib.parser	   r   numpyr   	soundfiler   r   transformersr
   r   r   core.modelsr   r   r   r   r   r   r$  r%  r   r"   r    r    r    r!   <module>   s,    