o
    cir                     @   sp   d Z ddlZddlmZ ddlmZmZmZmZ ddl	Z
ddlZddlZddlmZ eeZG dd dZdS )	z
FasterQwen3TTS: Real-time TTS using CUDA graph capture.

Wrapper class that provides a Qwen3-TTS API while using
CUDA graphs for 6-10x speedup.
    N)Path)	GeneratorOptionalTupleUnion   )suppress_flash_attn_warningc                (   @   s  e Zd ZdZdejdfdedejdefddZ	e
dejd	dfd
ededeeejf dedef
ddZdefddZ						dFdedededededededeeef fddZdGd!eeef d"edeejef fd#d$Z		%	dHded!eeef d&eded'ed(ed)efd*d+Z	,dIdeded-ee d.ee fd/d0Z	,dId(efd1d2Ze 		3			4					dJdeded!eeef d&eded5ededed6ededed'ed(ed)edeeef fd7d8Ze 		3			4			9				%dKdeded!eeef d&eded5ededed6ededed:ed'ed(ed)ed;edeeejeef d,d,f f"d<d=Ze 	,		3			4		dLded-eded.ee ded5ededed6edededeeef fd>d?Z e 	,		3			4			9dMded-eded.ee ded5ededed6ededed:edeeejeef d,d,f fd@dAZ!e 		3			4		dNded.ededed5ededed6edededeeef fdBdCZ"e 		3			4			9dOded.ededed5ededed6ededed:edeeejeef d,d,f fdDdEZ#d,S )PFasterQwen3TTSz
    Qwen3-TTS model with CUDA graphs for real-time inference.
    
    Compatible API with Qwen3TTSModel, but uses CUDA graph
    capture for 6-10x speedup on NVIDIA GPUs.
    cuda   devicedtypemax_seq_lenc                 C   s:   || _ || _|| _|| _|| _|| _d| _d| _i | _d S )Ni.  F)	modelpredictor_graphtalker_graphr   r   r   sample_rate
_warmed_up_voice_prompt_cache)self
base_modelr   r   r   r   r    r   L/home/ubuntu/vllm_env/lib/python3.10/site-packages/faster_qwen3_tts/model.py__init__   s   	
zFasterQwen3TTS.__init__sdpa
model_nameattn_implementationc              
   C   s  t |tr
tt|}|drtj stdt	d|  t
  ddlm} W d   n1 s4w   Y  ddlm} dd	lm} |j||||d
}	|	jj}
|	jjj}|
j}|jj}|j}t	d ||||||dddd}||
j||||d}t	d | |	|||||dS )a  
        Load Qwen3-TTS model and prepare CUDA graphs.

        Args:
            model_name: Model path or HuggingFace Hub ID
            device: Device to use ("cuda" or "cpu")
            dtype: Data type for inference
            attn_implementation: Attention implementation ("sdpa" or "flash_attention_2")
            max_seq_len: Maximum sequence length for static cache
            
        Returns:
            FasterQwen3TTS instance
        r
   zCUDA graphs require CUDA devicezLoading Qwen3-TTS model: r   )Qwen3TTSModelNr   )PredictorGraph)TalkerGraph)
device_maptorch_dtyper   zBuilding CUDA graphs...T2   ?)r   r   	do_sampletop_ktemperature)r   r   r   z3CUDA graphs initialized (will capture on first run))r   r   r   r   r   r   )
isinstancestrgetattrtorch
startswithr
   is_available
ValueErrorloggerinfor   qwen_ttsr   r   r   r   r   from_pretrainedr   talkerconfigtalker_configcode_predictorhidden_size)clsr   r   r   r   r   r   r   r   r   r2   r4   	predictorpred_configtalker_hiddenr   r   r   r   r   r1   1   s`   




zFasterQwen3TTS.from_pretrainedprefill_lenc                 C   sF   | j rdS td | jjdd | jj|dd d| _ td dS )z:Warm up and capture CUDA graphs with given prefill length.NzWarming up CUDA graphs...   )
num_warmup)r;   r=   TzCUDA graphs captured and ready)r   r.   r/   r   capturer   )r   r;   r   r   r   _warmup   s   
zFasterQwen3TTS._warmupEnglishr#   r"   T?textlanguagemax_new_tokensr&   r%   r$   repetition_penaltyreturnc                 C   s   t d)z
        Generate speech from text using default voice.
        
        Not yet implemented - use generate_voice_clone() instead.
        z^Default voice generation not yet implemented. Use generate_voice_clone() with reference audio.)NotImplementedError)r   rB   rC   rD   r&   r%   r$   rE   r   r   r   generate   s   zFasterQwen3TTS.generate      ?	ref_audiosilence_secsc                 C   sd   t jt|ddd\}}|jdkr|jdd}|dkr.tjt|| tjd}t	||g}||fS )a  Load reference audio and optionally append trailing silence.

        The ICL voice-cloning prompt ends with the last codec token of the reference
        audio, so the model's first generated token is conditioned on whatever phoneme
        the reference ends with. Appending a short silence makes the last tokens
        encode silence instead, preventing that phoneme from bleeding into the start
        of the generated speech. Set silence_secs=0 to disable this behavior.
        float32F)r   	always_2dr   )axisr   r   )
sfreadr(   ndimmeannpzerosintrL   concatenate)r   rJ   rK   audiosrsilencer   r   r   _load_ref_audio_with_silence   s   	
z+FasterQwen3TTS._load_ref_audio_with_silenceFref_text	xvec_onlynon_streaming_modeappend_silencec              	   C   s  | j |g}| j |}	t||||f}
|
| jv r"| j|
 \}}np|rO| j jt|ddd}|d j}tdg|gdgdgd}dgt|	 }||f| j|
< nC|rSdnd	}| j	||d
}| j j||d}| j 
|}g }|d j}|r| j |g}|| j |d  n|d ||f| j|
< | j j }| j||	|||dur|gndgd|d\}}}}| js| |jd  |j}|jj}d|_d}|s|dr|d d dur|d d }||||||||fS )a  Prepare inputs for generation (shared by streaming and non-streaming).

        Args:
            xvec_only: When True (default), use only the speaker embedding (x-vector) for voice
                cloning instead of the full ICL acoustic prompt. This prevents the model from
                continuing the reference audio's last phoneme and allows natural language switching.
                When False, the full reference audio codec tokens are included in context (ICL mode).
         T)rJ   r\   x_vector_only_moder   NF)ref_coderef_spk_embeddingra   icl_moderI           )rK   )rJ   r\   Auto)m	input_idsref_idsvoice_clone_prompt	languagesspeakersr^   r   rb   )r   _build_assistant_text_tokenize_textsr(   r   create_voice_clone_promptrc   dictlenr[   #_prompt_items_to_voice_clone_promptr\   _build_ref_textappend_build_talker_inputs_localr   r?   shaper2   r3   r4   rope_deltasget)r   rB   rJ   r\   rC   r]   r^   r_   input_textsrh   	cache_keyvcpri   prompt_itemsspk_embrK   ref_audio_inputrt	ref_textsrg   tietamtthtper2   r3   	ref_codesr   r   r   _prepare_generation   sj   




z"FasterQwen3TTS._prepare_generationNspeakerinstructc              
   C   s   | j |g}| j |}g }|d u s|dkr|d  n|| j | j |gd  | j j }| j||d gd |d ur?|gndg|gd|d\}	}
}}| jsW| |	jd  |j	}|j
j}d |_||||	|
||fS )Nr`   r   rf   F)rg   rh   ri   rj   rk   rl   r^   instruct_idsr   )r   rm   rn   rt   _build_instruct_textru   r   r?   rv   r2   r3   r4   rw   )r   rB   rC   r   r   ry   rh   r   rg   r   r   r   r   r2   r3   r   r   r   _prepare_generation_custom  s.    z)FasterQwen3TTS._prepare_generation_customc	           1      C   s  dd t t|D }	d}
|dur||}
|dur7t|D ]\}}|dur6|	| |j|j | q|du rBdgt| }g }d}tt|||D ]i\}\}}}|
du r|dksb|du red}n>|	 |j
jjvrvtd| d|j
jj|	  }|j tj||jj|jd}n|d | s|d	 | r|
| }nd}|dusJ |	 d
krd}n|	 |j
jjvrtd| d|j
jj|	  }|	 dv r|dvr|j
jj|	  r|j
jj|	  }|j
jj| }|j|j tj|j
j|j
j|j
jgg|jj|jdjddd\}}}|du r*|j
jj|j
jj|j
jjgg}n|j
jj|j
jj||j
jjgg}|j tj||jj|jd}|j tj|j
jj|j
jjgg|jj|jd}|du rrtj||gdd}ntj|| ddd|gdd}|j|j |ddddf }tj|!d|j"d d d|fdd|ddddf  }tj||fdd}|dur|#dddur|d	 | r|j$|ddddf || ddddf |d | %|jj& |||d\}} tj||gdd}ntj||j|j |ddddf |ddddf  gdd}|r|ddddf }tj|tj|j|j |ddddf |fdd|j tj|j
jjg|ddddf j"d d  g|jj|jd ||j tj|j
jjgg|jj|jd gdd}|} ntj|j|j |ddddf |fdd} |	| | ||  qNt|	D ]\}}tjdd |D dd|	|< qtdd |	D }!dd |	D }"dd |"D }#tj'j(j)j*|#ddd}$|$j+dgd}	|	j"d  |	j"d }%}&t,|&!|%d}'|&|! }(|'|(-dk. %|	j})|/ }*d!d |D }+d"d |+D },tj'j(j)j*|+ddd}-tj,t0|,|-jd#!t|,d}.tj|,|-jd#-d}/|.|/k}0|*|-|0< |-}|	|)||fS )$zDLocal copy of upstream talker input building for qwen-tts main repo.c                 S   s   g | ]}g qS r   r   ).0_r   r   r   
<listcomp>9  s    z=FasterQwen3TTS._build_talker_inputs_local.<locals>.<listcomp>Nr`   zSpeaker z not implemented)r   r   ra   rd   autoz	Language )chineser   )r`   Nr<   r   dim   rb   )text_idref_idrb   tts_pad_embedtts_eos_embedr^      c                 S   s   g | ]}|d ur|qS Nr   )r   itemr   r   r   r     s    c                 S      g | ]}|j d  qS )r   rv   r   tr   r   r   r         c                 S      g | ]}| d qS r   squeezer   r   r   r   r     r   c                 S   s   g | ]	}|j d gdqS )r   dims)flipr   r   r   r   r     s    Tre   )batch_firstpadding_valuer   r   c                 S   r   r   r   r   r   r   r   r     r   c                 S   r   r   r   )r   sr   r   r   r     r   )r   )1rangerq   generate_speaker_prompt	enumeratert   r2   text_projectionget_text_embeddingsziplowerr3   r4   spk_idrG   get_input_embeddingsr*   tensorr   r   codec_language_idspk_is_dialecttts_bos_token_idtts_eos_token_idtts_pad_token_idchunkcodec_nothink_idcodec_think_bos_idcodec_think_eos_idcodec_think_idcodec_pad_idcodec_bos_idcatviewexpandrv   rx   generate_icl_prompttoclonennutilsrnnpad_sequencer   arange	unsqueezelongr   max)1r   rg   rh   ri   rj   rk   rl   r^   r   talker_input_embedsvoice_clone_spk_embedsindexinstruct_idtrailing_text_hiddensr   input_idrC   r   speaker_embedr   language_iddialecttts_bos_embedr   codec_prefill_listcodec_input_emebdding_0codec_input_emebdding_1codec_input_emebdding_talker_input_embed_role_talker_input_embedtalker_input_embedicl_input_embedtrailing_text_hiddenoriginal_lengths	sequencessequences_reversedpadded_reversed
batch_sizemax_lenindicesnum_padstalker_attention_maskpad_embedding_vectorsequences_to_padtrailing_text_original_lengthspadded_hiddensarange_tensorlengths_tensorpadding_maskr   r   r   ru   -  sn  
 







	*
 

z)FasterQwen3TTS._build_talker_inputs_localr         ?min_new_tokenstop_pc           (      C   s  ddl m} | j|||||||d\}}}}}}}}|||||||| j| j|||||	|
|d\}}|du rFtd tjdtj	dg| j
fS |j}|dur]||j}tj||gdd	}n|}|d
|di\}}|durt|jd nd}|jd } g }!|D ]7}"t|"dr|"   }"nt|"dr|" n|"}"|dkrt|t| d t|" }#|"|#d }"|!|" q|d }$|$d }%|d d |d  }&|&dkr|%|& nd}'td|%dd|&dd|d dd|'dd	 |!|fS )a9  
        Generate speech with voice cloning using reference audio.

        Args:
            text: Text to synthesize
            language: Target language
            ref_audio: Path to reference audio file
            ref_text: Transcription of reference audio
            max_new_tokens: Maximum tokens to generate
            min_new_tokens: Minimum tokens before EOS is allowed
            temperature: Sampling temperature
            top_k: Top-k sampling
            top_p: Top-p (nucleus) sampling
            do_sample: Whether to sample
            repetition_penalty: Repetition penalty
            xvec_only: When True (default), use only the speaker embedding for voice cloning.
                This prevents phoneme bleed-through from the reference and allows clean
                language switching. Set to False for full ICL mode (reference audio in context).
            non_streaming_mode: Match upstream non-streaming prompt layout. Default True for better non-streaming quality.

        Returns:
            Tuple of ([audio_waveform], sample_rate)
        r   fast_generaterC   r]   r^   r_   r2   r   attention_maskr   r   r3   r   r   rD   r   r&   r%   r   r$   rE   NGeneration returned no tokensrO   r   r   audio_codescpuflattensteps      (@
prefill_ms  decode_s
Generated .2fs audio in s (ms_per_step.1fms/step, RTF: ))rH   r   r   r   r   r.   warningrT   rU   rL   r   speech_tokenizerr   r   r*   r   decoder   rv   hasattrr   r   numpyrV   r   rq   rt   r/   )(r   rB   rC   rJ   r\   rD   r   r&   r%   r   r$   rE   r]   r^   r_   r   rg   r2   r3   r   r   r   r   r   	codec_idstimingr  ref_codes_devcodes_for_decode
audio_listrY   ref_len	total_lenaudio_arraysacutn_stepsaudio_duration
total_timertfr   r   r   generate_voice_clone  sv   )




z#FasterQwen3TTS.generate_voice_clone   
chunk_sizeparity_modec           5      c   s   ddl m}m} | j|||||||d\}}}}}}}}|j}d}t||}g }d}d} |r1|n|}!t|||||||||||	|
||d}"|sQ| j|"d< | j|"d	< |!di |"D ]\}#}$|	|# |#j
d }%tj|dd
}&|&j
d }'| du r|durtj||&j|&gdd
}(n|&}(|d|(di\})}*|)d }+t|+dr|+   }+nt|+dr|+ n|+}+|dur|j
d },|(j
d }-t|,t|-d t|+ }.|+|.d }/n|+}/|/|d }0t|/}|'|krt|/|' } nWtd|'|% | }1|&|1d }2|2j
d |% }3|d|2di\})}*|)d }+t|+dr!|+   }+nt|+dr+|+ n|+}+|3dkrAtt|3|  }4|+|4d }0n|+}0|0|*|$fV  qXdS )a  
        Stream voice-cloned speech generation, yielding audio chunks.

        Same as generate_voice_clone() but yields (audio_chunk, sample_rate, timing)
        tuples every chunk_size codec steps (~chunk_size/12 seconds of audio).

        Args:
            text: Text to synthesize
            language: Target language
            ref_audio: Path to reference audio file
            ref_text: Transcription of reference audio
            max_new_tokens: Maximum tokens to generate
            min_new_tokens: Minimum tokens before EOS is allowed
            temperature: Sampling temperature
            top_k: Top-k sampling
            top_p: Top-p (nucleus) sampling
            do_sample: Whether to sample
            repetition_penalty: Repetition penalty
            chunk_size: Codec steps per chunk (12 = ~1 second)
            xvec_only: When True (default), use only the speaker embedding for voice cloning.
                This prevents phoneme bleed-through from the reference and allows clean
                language switching. Set to False for full ICL mode (reference audio in context).
            non_streaming_mode: When True (default), prefill the full target text before
                streaming decode. Set to False to feed text token-by-token during decode.
            parity_mode: When True, disables CUDA graphs and uses dynamic cache streaming.

        Yields:
            Tuple of (audio_chunk_numpy, sample_rate, timing_dict)
        r   )fast_generate_streamingparity_generate_streamingr      r   N)r2   r   r   r   r   r3   rD   r   r&   r%   r   r$   rE   r"  r   r   r   r   r   r   r   )	streamingr$  r%  r   r  r   rp   r   r   rt   rv   r*   r   r   r   r  r   r  r   r   r  rV   rq   round)5r   rB   rC   rJ   r\   rD   r   r&   r%   r   r$   rE   r"  r]   r^   r_   r#  r$  r%  rg   r2   r3   r   r   r   r   r   r  context_framesmin_calibration_frames	all_codesprev_gen_audio_lensamples_per_frame	stream_fnstream_kwargscodec_chunkr  n_newall_flatn_totalcodes_inputr  rY   rX   r  r  ref_audio_cut	gen_audio	new_audio	ctx_startwindown_ctxctx_samplesr   r   r   generate_voice_clone_streaming|  s   1










z-FasterQwen3TTS.generate_voice_clone_streamingc                 C   s  | j j jdkrtd| j |g | j |g | j j jdv r"d }ddlm} | j||||d\}}}}}}}|||||||| j	| j
|||||	|
|d\}}|d u rdtd tjdtjd	g| jfS |j}|d
|di\}}g }|D ]!}t|dr||    qw|t|dr| n| qw|d }|d }|d d |d  }|dkr|| nd}td|dd|dd|d dd|dd	 ||fS )Ncustom_voice5Loaded model does not support custom voice generation0b6r   r   rB   rC   r   r   r   r   rO   r   r   r   r   r   r  r  r  r  r  r  r  r  r	  r
  r  r  )r   tts_model_typer-   _validate_languages_validate_speakerstts_model_sizerH   r   r   r   r   r.   r  rT   rU   rL   r   r  r  r   r  rt   r   r   r  r/   )r   rB   r   rC   r   rD   r   r&   r%   r   r$   rE   r   rg   r2   r3   r   r   r   r   r  r  r  r  rY   r  r  r  r  r  r  r   r   r   generate_custom_voice  sh   


z$FasterQwen3TTS.generate_custom_voicec           (      c   sx   | j j jdkrtd| j |g | j |g | j j jdv r#d }ddlm} | j||||d\}}}}}}}|j	}d}t
||}g }d}d }|di d	|d
|d|d|d|d|d| jd| jd|d|d|d|d|	d|
d|d|D ]\}}|| |jd }tj|dd}|jd }|d u r|d|di\} }!| d }"t|"dr|"   }"nt|"dr|" n|"}"|"|d  }#t|"}||krt|"| }nWt
d|| | }$||$d  }%|%jd | }&|d|%di\} }!| d }"t|"dr|"   }"nt|"dr|" n|"}"|&dkr1tt|&| }'|"|'d  }#n|"}#|#|!|fV  qd S )Nr=  r>  r?  r   r$  r@  r&  r   r2   r   r   r   r   r3   r   r   rD   r   r&   r%   r   r$   rE   r"  r   r   r   r   r   )r   rA  r-   rB  rC  rD  r'  r$  r   r  r   r   r   rt   rv   r*   r   r  r   r  r   r   r  rq   rV   r(  )(r   rB   r   rC   r   rD   r   r&   r%   r   r$   rE   r"  r$  rg   r2   r3   r   r   r   r   r  r)  r*  r+  prev_audio_lenr-  r0  r  r1  r2  r3  r  rY   rX   r7  r8  r9  r:  r;  r   r   r   generate_custom_voice_streamingc  s   
	





z.FasterQwen3TTS.generate_custom_voice_streamingc                 C   s  | j j jdkrtd| j |g ddlm} | j||d |d\}}}}}}}|||||||| j| j||||||	|
d\}}|d u rTt	
d tjdtjdg| jfS |j}|d	|d
i\}}g }|D ]!}t|drz||    qg|t|dr| n| qg|d }|d }|d d |d  }|d
kr|| nd
}t	d|dd|dd|d dd|dd	 ||fS )Nvoice_design5Loaded model does not support voice design generationr   r   r@  r   r   rO   r   r   r   r   r   r  r  r  r  r  r  r  r  r	  r
  r  r  )r   rA  r-   rB  rH   r   r   r   r   r.   r  rT   rU   rL   r   r  r  r   r  rt   r   r   r  r/   )r   rB   r   rC   rD   r   r&   r%   r   r$   rE   r   rg   r2   r3   r   r   r   r   r  r  r  r  rY   r  r  r  r  r  r  r   r   r   generate_voice_design  sb   


z$FasterQwen3TTS.generate_voice_designc           '      c   sX   | j j jdkrtd| j |g ddlm} | j||d |d\}}}}}}}|j}d}t||}g }d}d }|di d|d	|d
|d|d|d|d| j	d| j
d|d|d|d|d|d|	d|
d|D ]\}}|| |jd }tj|dd}|jd }|d u r|d|di\}} |d }!t|!dr|!   }!nt|!dr|! n|!}!|!|d  }"t|!}||krt|!| }nWtd|| | }#||#d  }$|$jd | }%|d|$di\}} |d }!t|!dr|!   }!nt|!dr|! n|!}!|%dkr!tt|%| }&|!|&d  }"n|!}"|"| |fV  qpd S )NrI  rJ  r   rF  r@  r&  r   r2   r   r   r   r   r3   r   r   rD   r   r&   r%   r   r$   rE   r"  r   r   r   r   r   )r   rA  r-   rB  r'  r$  r   r  r   r   r   rt   rv   r*   r   r  r   r  r   r   r  rq   rV   r(  )'r   rB   r   rC   rD   r   r&   r%   r   r$   rE   r"  r$  rg   r2   r3   r   r   r   r   r  r)  r*  r+  rG  r-  r0  r  r1  r2  r3  r  rY   rX   r7  r8  r9  r:  r;  r   r   r   generate_voice_design_streaming  s   
	





z.FasterQwen3TTS.generate_voice_design_streaming)r@   r   r#   r"   TrA   )rI   )TFTr   )
r   r   r#   r"   r   TrA   TTT)r   r   r#   r"   r   TrA   r!  TTTF)Nr   r   r#   r"   r   TrA   )	Nr   r   r#   r"   r   TrA   r!  )r   r   r#   r"   r   TrA   )r   r   r#   r"   r   TrA   r!  )$__name__
__module____qualname____doc__r*   bfloat16r(   r   rV   r   classmethodr   r1   r?   floatboolr   listrH   r   rT   ndarrayr[   r   r   r   ru   inference_moder   r   rp   r<  rE  rH  rK  rL  r   r   r   r   r	      sr   
R
	
*

Y
.
 a
	

n
	
 	

L	
a	

G	
r	   )rP  loggingpathlibr   typingr   r   r   r   r  rT   	soundfilerP   r*   r   r   	getLoggerrM  r.   r	   r   r   r   r   <module>   s    
