o
    ۷iS=                     @   s   d dl Z d dlZd dlZd dlmZmZmZmZ d dl	m
Z
 d dlZd dlZd dlZd dlZd dlmZ d dlmZmZmZ ddlmZmZmZmZ eeejee eej f ZG dd	 d	ZdS )
    N)ListOptionalTupleUnion)urlparse)pad_sequence)
AutoConfigAutoFeatureExtractor	AutoModel   )Qwen3TTSTokenizerV1ConfigQwen3TTSTokenizerV1ModelQwen3TTSTokenizerV2ConfigQwen3TTSTokenizerV2Modelc                   @   s  e Zd ZdZdd Zededd fddZdedefd	d
Z	dedefddZ
dedefddZdededejfddZdedee deej fddZ		d)dedee defddZdeeej ef fddZdefdd Zdefd!d"Zdefd#d$Zdefd%d&Zdefd'd(ZdS )*Qwen3TTSTokenizera.  
    A wrapper for Qwen3 TTS Tokenizer 25Hz/12Hz with HuggingFace-style loading.

    - from_pretrained(): loads speech tokenizer model via AutoModel and feature_extractor via AutoFeatureExtractor.
    - encode(): supports wav path(s), base64 audio string(s), numpy array(s).
    - decode(): accepts either the raw model encode output, or a minimal dict/list-of-dicts.

    Notes:
    - For numpy array input, you must pass `sr` so the audio can be resampled to model sample rate.
    - Returned audio is float32 numpy arrays and the output sample rate.
    c                 C   s   d | _ d | _d | _d | _d S N)modelfeature_extractorconfigdeviceself r   \/home/ubuntu/vllm_env/lib/python3.10/site-packages/qwen_tts/inference/qwen3_tts_tokenizer.py__init__9   s   
zQwen3TTSTokenizer.__init__pretrained_model_name_or_pathreturnc                 K   s   |  }t dt ttt t dt ttt t||_	tj|fi ||_
|j
j|_t|j
dd|_|jdu rZzt|j
 j|_W |S  tyY   td|_Y |S w |S )a  
        Initialize tokenizer with HuggingFace `from_pretrained` style.

        Args:
            pretrained_model_name_or_path (str):
                HuggingFace repo id or local directory.
            **kwargs (Any):
                Forwarded to `AutoModel.from_pretrained(...)` directly.
                Typical examples: device_map="cuda:0", dtype=torch.bfloat16, attn_implementation="eager".

        Returns:
            Qwen3TTSTokenizer:
                Initialized instance with `model`, `feature_extractor`, `config`.
        qwen3_tts_tokenizer_25hzqwen3_tts_tokenizer_12hzr   Ncpu)r   registerr   r
   r   r   r   r	   from_pretrainedr   r   r   getattrr   next
parametersStopIterationtorch)clsr   kwargsinstr   r   r   r!   ?   s$   

z!Qwen3TTSTokenizer.from_pretrainedsc                 C   s2   | drdS d|vrd|vrt|dkrdS dS )Nz
data:audioT/\   F)
startswithlen)r   r*   r   r   r   _is_probably_base64e   s
   
z%Qwen3TTSTokenizer._is_probably_base64c                 C   s4   zt |}|jdv ot|jW S  ty   Y dS w )N)httphttpsF)r   schemeboolnetloc	Exception)r   r*   ur   r   r   _is_urlm   s   zQwen3TTSTokenizer._is_urlb64c                 C   s0   d|v r|  dr|ddd }t|S )N,zdata:   )stripr.   splitbase64	b64decode)r   r9   r   r   r   _decode_base64_to_wav_bytest   s   
z-Qwen3TTSTokenizer._decode_base64_to_wav_bytesx	target_src           	      C   s"  |  |r?tj|}| }W d   n1 sw   Y  t|}tj|ddd\}}W d   n1 s9w   Y  n4| |ri| 	|}t|}tj|ddd\}}W d   n1 scw   Y  n
t
j|ddd\}}|jdkrtj|dd	}||krt
j|||d
}|tjS )ai  
        Load audio from wav path or base64 string, then resample to target_sr.

        Args:
            x (str):
                A wav file path, or a base64 audio string (raw or data URL).
            target_sr (int):
                Target sampling rate.

        Returns:
            np.ndarray:
                1-D float32 waveform at target_sr.
        Nfloat32F)dtype	always_2dT)srmonor;   axisyorig_srrB   )r8   urllibrequesturlopenreadioBytesIOsfr0   r@   librosaloadndimnpmeanresampleastyperC   )	r   rA   rB   respaudio_bytesfaudiorF   	wav_bytesr   r   r   
load_audioz   s(   




zQwen3TTSTokenizer.load_audioaudiosrF   c                    s   t  jjt|ttjfr|g}t|dkrg S t|d tr* fdd|D S |du r2tdg }|D ]5}t|tjsBt	d|j
dkrNtj|dd	}t |krbtj|tjt |d
}||tj q6|S )ai  
        Normalize all supported input types into a list of 1-D numpy float32 waveforms
        at `self.feature_extractor.sampling_rate`.

        Args:
            audios (AudioInput):
                - str: wav path OR base64 audio string
                - np.ndarray: raw waveform (sr must be provided)
                - list[str] / list[np.ndarray]
            sr (Optional[int]):
                Sampling rate for raw numpy input. Required if input is np.ndarray or list[np.ndarray].

        Returns:
            List[np.ndarray]:
                List of float32 waveforms resampled to model input SR.
        r   c                    s   g | ]	} j |d qS ))rB   )ra   .0rA   r   rB   r   r   
<listcomp>       z=Qwen3TTSTokenizer._normalize_audio_inputs.<locals>.<listcomp>NzIFor numpy waveform input, you must provide `sr` (original sampling rate).zNMixed input types are not supported. Use all paths/base64 or all numpy arrays.r;   rH   rI   rK   )intr   sampling_rate
isinstancestrrX   ndarrayr/   
ValueError	TypeErrorrW   rY   rU   rZ   r[   rC   append)r   rb   rF   outar   re   r   _normalize_audio_inputs   s&   
z)Qwen3TTSTokenizer._normalize_audio_inputsNTreturn_dictc                 C   s   | j ||d}| j|t| jjdd}|| j| jj}t	  | jj
|d d|d d|d}W d   |S 1 sAw   Y  |S )	a  
        Batch-encode audio into discrete codes (and optional conditioning, depending on 25Hz/12Hz).

        Args:
            audios (AudioInput):
                Supported forms:
                - np.ndarray: waveform (requires sr)
                - list[np.ndarray]: waveforms (requires sr)
                - str: wav path OR base64 audio string
                - list[str]: wav paths and/or base64 strings
            sr (Optional[int], default=None):
                Original sampling rate for numpy waveform input.
            return_dict (bool, default=True):
                Forwarded to model.encode(...). If True, returns ModelOutput.

        Returns:
            25Hz:
                Qwen3TTSTokenizerV1EncoderOutput (if return_dict=True) with fields:
                  - audio_codes: List[torch.LongTensor] each (codes_len,)
                  - xvectors:   List[torch.FloatTensor] each (xvector_dim,)
                  - ref_mels:   List[torch.FloatTensor] each (mel_len, mel_dim)
            12Hz:
                Qwen3TTSTokenizerV2EncoderOutput (if return_dict=True) with fields:
                  - audio_codes: List[torch.LongTensor] each (codes_len, num_quantizers)

            If return_dict=False, returns the raw tuple from model.encode.
        )rF   pt)	raw_audiori   return_tensorsinput_valuesr;   padding_maskrs   N)rr   r   rh   ri   tor   r   rD   r&   inference_modeencodesqueeze)r   rb   rF   rs   wavsinputsencr   r   r   r|      s"   !


zQwen3TTSTokenizer.encodec                    s  | j  }ddd t|dr|j}t|dd}t|dd}nEt|tr5|d }|dd}|dd}n/t|tr`dd |D }d|d	 v rNd
d |D nd}d|d	 v r]dd |D nd}nt	dt|t
jr|}| dkrx|d	}n| dkr|d	}|| j}n fdd|D }t|ddd| j}t
  |dkr4|du s|du rtdt|t
jr|}| dkr|d	}|| j| j j}n fdd|D }t
j|d	d| j| j j}t|t
jr|}	|	 dkr|	d	}	|	| j| j j}	n fdd|D }t|dd	d| j| j j}	| j j|||	dd}
|
j}n|dkrE| j j|dd}
|
j}ntd| W d   n	1 sWw   Y  dd |D }|t| j  fS )a4  
        Decode back to waveform.

        Usage:
        1) Pass the raw output of `encode(...)` directly (recommended).
           - 25Hz: expects fields audio_codes, xvectors, ref_mels
           - 12Hz: expects field audio_codes
        2) Pass a dict or list[dict] (minimal form) for custom pipelines:
           - 25Hz dict keys: {"audio_codes", "xvectors", "ref_mels"}
           - 12Hz dict keys: {"audio_codes"}
           Values can be torch tensors or numpy arrays.

        Args:
            encoded (Any):
                - ModelOutput returned by `encode()`, OR
                - dict, OR
                - list[dict]

        Returns:
            Tuple[List[np.ndarray], int]:
                - wavs: list of 1-D float32 numpy arrays
                - sample_rate: int, model output sampling rate
        Nc                 S   s:   t | tjr| S t| } t| }|d ur||}|S r   )rj   r&   TensorrX   asarray
from_numpyrz   )rA   rD   tr   r   r   
_to_tensor   s   


z,Qwen3TTSTokenizer.decode.<locals>._to_tensoraudio_codesxvectorsref_melsc                 S      g | ]}|d  qS )r   r   rd   er   r   r   rf   5      z,Qwen3TTSTokenizer.decode.<locals>.<listcomp>r   c                 S   r   )r   r   r   r   r   r   rf   6  r   c                 S   r   )r   r   r   r   r   r   rf   7  r   z?`encoded` must be an encode output, a dict, or a list of dicts.r;   r   c                       g | ]	} |t jd qS )rD   )r&   long)rd   cr   r   r   rf   H  rg   TrH   )batch_firstpadding_valuer   z/25Hz decode requires `xvectors` and `ref_mels`.c                    r   r   r&   rC   rc   r   r   r   rf   V  rg   )dimc                    r   r   r   )rd   mr   r   r   rf   _  rg   ry   r   zUnknown model type: c                 S   s$   g | ]}| tj   qS r   )rz   r&   rC   detachr   numpy)rd   wr   r   r   rf   l  s   $ r   )r   get_model_typehasattrr   r"   rj   dictgetlistrn   r&   r   r   	unsqueezerz   r   r   r{   rm   rD   stackdecodeaudio_valuesrh   get_output_sample_rate)r   encoded
model_typeaudio_codes_listxvectors_listref_mels_listr   audio_codes_paddedxvectors_batchref_mels_paddeddecwav_tensorsr~   r   r   r   r     sh   





 



 
 
!zQwen3TTSTokenizer.decodec                 C   s
   | j  S )z
        Get the underlying tokenizer model type.

        Returns:
            str: Model type string from `self.model.config.model_type`
                (e.g. "qwen3_tts_tokenizer_25hz" / "qwen3_tts_tokenizer_12hz").
        )r   r   r   r   r   r   r   o  s   
z Qwen3TTSTokenizer.get_model_typec                 C      t | j S )z}
        Get the expected input sample rate for encoding.

        Returns:
            int: Input sample rate (Hz).
        )rh   r   get_input_sample_rater   r   r   r   r   y     z'Qwen3TTSTokenizer.get_input_sample_ratec                 C   r   )z
        Get the output sample rate for decoded waveforms.

        Returns:
            int: Output sample rate (Hz).
        )rh   r   r   r   r   r   r   r     r   z(Qwen3TTSTokenizer.get_output_sample_ratec                 C   r   )z
        Get the encoder downsample rate (waveform samples per code step).

        Returns:
            int: Encode downsample rate.
        )rh   r   get_encode_downsample_rater   r   r   r   r     r   z,Qwen3TTSTokenizer.get_encode_downsample_ratec                 C   r   )z
        Get the decoder upsample rate (waveform samples per code step).

        Returns:
            int: Decode upsample rate.
        )rh   r   get_decode_upsample_rater   r   r   r   r     r   z*Qwen3TTSTokenizer.get_decode_upsample_rate)NT)__name__
__module____qualname____doc__r   classmethodrk   r!   r4   r0   r8   bytesr@   rh   rX   rl   ra   
AudioInputr   r   rr   r|   r   r   r   r   r   r   r   r   r   r   r   r   ,   sN    %
&
3
3
l
			r   ) r>   rR   urllib.requestrN   typingr   r   r   r   urllib.parser   rU   r   rX   	soundfilerT   r&   torch.nn.utils.rnnr   transformersr   r	   r
   corer   r   r   r   rk   rl   r   r   r   r   r   r   <module>   s(   