o
    i7                     @   sx  d Z ddlZddlmZ ddlZddlZddlZddlZddl	m
Z
mZ ddlmZmZmZmZ ddlmZ ddlZddlZddlm  m  mZ G dd dejZG dd	 d	ejZee_ee_dd
lmZ ddl m!Z" e# 					d%dee$ de%de&de%de%de%de%deej' fddZ(e
G dd dZ)G dd dZ*de$dee$ fd d!Z+d"d# Z,e-d$kre,  dS dS )&a  
LuxTTS Batched Inference Engine

Supports:
- Dynamic batching: collects requests and processes them in batches
- Async request queue with configurable max batch size and max wait time
- Chunked text streaming: splits long text at sentence boundaries
- FastAPI server for HTTP testing
    N)	dataclassfield)ListOptionalDictTuple)Futurec                   @      e Zd Zdd ZdS )FastSwooshLc                 C   0   t jd|j|jd}t ||d d|  d S )N        dtypedeviceg      @{Gz?gQ?torchtensorr   r   	logaddexpselfxzero r   (/home/ubuntu/LuxTTS/batched_inference.pyforward      zFastSwooshL.forwardN__name__
__module____qualname__r   r   r   r   r   r
          r
   c                   @   r	   )FastSwooshRc                 C   r   )Nr   r         ?r   gtN0z?r   r   r   r   r   r       r   zFastSwooshR.forwardNr   r   r   r   r   r"      r!   r"   )LuxTTS)generate         @r#         ?皙?texts
prompt_rms	num_stepsguidance_scalespeedt_shift
target_rmsreturnc                 C   s   t | }t| j}|
d }|| }|| }||dd}||}|j||||||d||	d	\}}}}|dddd }||	dd}||k rR|||  }g }t
|D ]%}t||  }t|d	 |jd }||d
|f   }|| qX|S )z
    Generate speech for multiple texts in a single batched forward pass.
    Returns list of numpy arrays (one per text).
    g?predict)	tokensprompt_tokensprompt_featuresprompt_features_lensr.   r/   durationnum_stepr-   r         r)   i   N)lennext
parametersr   texts_to_token_idsexpandsamplepermutedecodeclamprangeintitemminshapecpunumpyappend)r*   modelvocoder	tokenizerr5   r6   r7   r+   r,   r-   r.   r/   r0   
batch_sizer   speed_adjusted
all_tokensbatch_prompt_tokensbatch_prompt_featuresbatch_prompt_features_lenspred_features	pred_lens_pred_features_perm	wav_batchresultsinum_mel_framesnum_audio_sampleswav_ir   r   r   batched_generate-   s:   

r`   c                   @   s<   e Zd ZU eed< eedZeed< eejdZ	e
ed< dS )
TTSRequesttext)default_factoryfuture	timestampN)r   r   r    str__annotations__r   r   rd   timere   floatr   r   r   r   ra   p   s   
 ra   c                   @   s   e Zd ZdZ							d'd	ed
edededededefddZd(dededefddZ	dede
fddZdd Zdd Zdd  Zd!ee fd"d#Zdefd$d%Zd&S ))BatchedTTSEnginez
    Batched TTS engine with dynamic batching.

    Collects requests and processes them in batches for maximum GPU utilization.
              I@r&   r'   r#   r(   cudamax_batch_sizemax_wait_msr,   r-   r.   r/   r   c           
      C   s   || _ |d | _|| _|| _|| _|| _t | _d| _	d| _
d| _d| _d| _td t }t|d| _| jj| _| jj| _| jj| _|| _tdt | dd	 tj d
 }	td|	dd d S )Ng     @@Fr   r   zLoading LuxTTS model...)r   zModel loaded in .1fs   @zGPU memory: .2f GB)rn   
max_wait_sr,   r-   r.   r/   queueQueue_queue_runningtotal_requeststotal_batchestotal_audio_secondstotal_gpu_secondsprintrh   r$   _ttsrM   _modelvocos_vocoderrO   
_tokenizer_devicer   rm   memory_allocated)
r   rn   ro   r,   r-   r.   r/   r   t0memr   r   r   __init__~   s,   





zBatchedTTSEngine.__init__   
audio_pathr8   r1   c                 C   sD   | j j||d}|d | _|d | _|d | _|d | _td |S )z Encode a reference audio prompt.)r8   r5   r6   r7   r+   zPrompt encoded.)r   encode_prompt_prompt_tokens_prompt_features_prompt_features_lens_prompt_rmsr~   )r   r   r8   encodedr   r   r   r      s   



zBatchedTTSEngine.encode_promptrb   c                 C   s   t |d}| j| |jS )zBSubmit a TTS request. Returns a Future with the audio numpy array.rb   )ra   rx   putrd   )r   rb   reqr   r   r   submit   s   
zBatchedTTSEngine.submitc                 C   sF   d| _ tj| jdd| _| j  td| j d| jd dd dS )	z"Start the batch processing thread.T)targetdaemonz Batch engine started (max_batch=, max_wait=  .0fzms)N)	ry   	threadingThread_batch_loop_threadstartr~   rn   ru   r   r   r   r   r      s   
$zBatchedTTSEngine.startc                 C   s   d| _ | jjdd dS )z!Stop the batch processing thread.Fr   timeoutN)ry   r   joinr   r   r   r   stop   s   zBatchedTTSEngine.stopc                 C   s   | j rag }z| jjdd}|| W n
 tjy   Y q w t | j }t|| j	k rW|t  }|dkr7n z| jj|d}|| W n
 tjyO   Y nw t|| j	k s,| 
| | j sdS dS )z5Main loop: collect requests into batches and process.r)   r   r   N)ry   rx   getrL   rv   Emptyrh   ru   r<   rn   _process_batch)r   batchfirstdeadline	remainingr   r   r   r   r      s,   
zBatchedTTSEngine._batch_loopr   c                 C   s"  dd |D }t |}zgtj  t }t|| j| j| j| j	| j
| j| j| j| j| j| jd}tj  t | }|  jd7  _|  j|7  _tdd |D }|  j|7  _|  j|7  _t||D ]
\}}	|j|	 qeW dS  ty }
 z|D ]}|j|
 q|W Y d}
~
dS d}
~
ww )zProcess a batch of requests.c                 S   s   g | ]}|j qS r   r   .0rr   r   r   
<listcomp>   s    z3BatchedTTSEngine._process_batch.<locals>.<listcomp>)r*   rM   rN   rO   r5   r6   r7   r+   r,   r-   r.   r/   r;   c                 s   s    | ]	}t |d  V  qdS )逻  N)r<   r   r   r   r   	<genexpr>   s    z2BatchedTTSEngine._process_batch.<locals>.<genexpr>N)r<   r   rm   synchronizerh   r`   r   r   r   r   r   r   r   r,   r-   r.   r/   r{   rz   sumr|   r}   ziprd   
set_result	Exceptionset_exception)r   r   r*   bsr   r[   elapsedtotal_audior   resulter   r   r   r      sF   

zBatchedTTSEngine._process_batchc                 C   sb   | j | j| j td| j | j| j| jtd| j | jtd| j | j td| j tj d d	S )Nr;   gMbP?rr   )	rz   r{   avg_batch_sizetotal_audio_stotal_gpu_savg_rtfeffective_speedthroughput_req_per_speak_vram_gb)rz   r{   maxr|   r}   r   rm   max_memory_allocatedr   r   r   r   stats  s   zBatchedTTSEngine.statsN)rk   rl   r&   r'   r#   r(   rm   )r   )r   r   r    __doc__rF   ri   rf   r   dictr   r   r   r   r   r   r   ra   r   r   r   r   r   r   rj   w   s>    
(
*rj   rb   c                 C   s   t d|  }dd |D S )z"Split text at sentence boundaries.z(?<=[.!?])\s+c                 S   s   g | ]}|  r|qS r   )strip)r   pr   r   r   r     s    z#split_sentences.<locals>.<listcomp>)resplitr   )rb   partsr   r   r   split_sentences  s   r   c                  C   s  dd l } |  }|jddd |jdtdd |jdtd	d |jd
tdd |jdtdg dd | }t|j|j|j	d}|
|j |  g d}td td td td|j d|j d|j	  td |d}|jdd td |jD ]K}d|_d|_d|_d|_tj  td| d g }g }t }	t|D ]}
||
t|  }|t  ||| qg }g }g }t|D ])\}
}|jd d}t }|||
  }|| |||	  |t|d!  qt |	 }| }t |d	d" }t |d#d" }t |d$d" }t!|d" }t"|d" }t#|d" }t#|}td%|d&d' td(|d)   td*|d+ d, td-|| d,d. td/|d0 d&d' td1|d2 d,d3|d4d5 td6|d7 d8d9|d: d;d< td=|d> d4d? td@ tdA|d;dB tdC|d;dB tdD|d;dB tdE|d;dB tdF|d;dB tdG|d;dB tdH q|$  tdI d S )JNr   z--ref-audioz!/home/ubuntu/LuxTTS/ref_audio.wav)defaultz--max-batchrk   )typer   z--max-wait-ms2   z--num-stepsr&   z--concurrency+)r;         rk   @         i  )r   nargsr   )rn   ro   r,   )
z6Hello, this is a test of the batched inference system.z?The quick brown fox jumps over the lazy dog near the riverbank.zGWelcome to the annual technology conference on artificial intelligence.zBMachine learning has transformed how we approach complex problems.zINatural language processing enables computers to understand human speech.zDDeep learning models continue to improve in both speed and accuracy.zLThe future of computing lies in efficient parallel processing architectures.zHVoice synthesis technology has made remarkable progress in recent years.zGToday we demonstrate the power of batched inference for text to speech.zHHigh concurrency serving requires careful optimization of GPU resources.zG
======================================================================zBATCHED INFERENCE BENCHMARKzF======================================================================zConfig: max_batch=r   z
ms, steps=z

Warmup...zWarmup request.   r   r(   r   z
--- Concurrency: z ---x   r   r   _   c   z  Wall time:     z.3frq   z  Batches:       r{   z  Avg batch sz:  r   rp   z  Throughput:    z req/sz  GPU time:      r   z  Audio gen:     r   zs total (avg rs   zs/req)z  Effective RTF: r   z.5fz (r   r   zx realtime)z  Peak VRAM:     r   rt   z0  --- TTFB (time to first byte / full audio) ---z  TTFB min:      msz  TTFB p50:      z  TTFB p95:      z  TTFB p99:      z  TTFB max:      z  TTFB avg:      g333333?z
Done.)%argparseArgumentParseradd_argumentrF   ri   
parse_argsrj   	max_batchro   r,   r   	ref_audior   r~   r   r   rh   sleepconcurrencyrz   r{   r|   r}   r   rm   reset_peak_memory_statsrE   r<   rL   	enumerater   np
percentilerH   r   meanr   )r   parserargsengine
TEST_TEXTSfconcsubmit_timesfuturesr   r\   rb   ttfbs	latenciesaudio_durationsr   	done_timettfb
total_wallr   ttfb_p50ttfb_p95ttfb_p99ttfb_minttfb_maxttfb_avg	avg_audior   r   r   run_benchmark  s    




"r   __main__)r&   r'   r#   r(   r)   ).r   r   torch.nnnnrh   asynciorK   r   r   dataclassesr   r   typingr   r   r   r   concurrent.futuresr   rv   r   zipvoice.models.modules.scalingmodelsmodulesscalingModuler
   r"   SwooshLSwooshRzipvoice.luxvoicer$   zipvoice.modeling_utilsr%   orig_generateinference_moderf   ri   rF   ndarrayr`   ra   rj   r   r   r   r   r   r   r   <module>   sf    

	
B !n
