o
    iO                     @   s  d Z ddlZddlm  mZ ddlZddlZddl	Z	ddl
mZmZ ddlmZ ddlmZ ddlmZ G dd dZeG d	d
 d
ZeG dd dZG dd dZedkrgddlZejddZejddd ejdddd ejdddd ejdddd ejddd  ejd!dd"d ejd#ed$d% ejd&ed'd% ejd(d)d*d+ ejd,d-d  ejd.d)d/d+ e Zej rej!nej"Z#eej$ej%e#d0Z&ej're&'  dS ej(r1e&j)ej(ej*ej+ej,rej,ge-ej( ndd1Z.eej/pd2Z0e0j1dd3 e2e.D ])\Z3Z4e0d4e3d5d6 Z5e46e7e5 e8d7e3 d8e5 d9e4j9d:d;e4j: d<	 qdS ej;rae&j<ej;ej*ej+ej,d=Z=e=6ej> e8d>ej> d9e=j9d:d;e=j: d?e=j?d:d@	 dS e@  dS dS )AaH  
Production inference engine for LeWM TTS.
Features:
  - Batched inference (many concurrent requests in parallel)
  - KV-cache (O(1) per step instead of O(n))
  - FP16
  - Audio prompt support (for multi-speaker / voice cloning)
  - Continuous batching (add/remove requests dynamically)
  - Griffin-Lim + Vocos vocoder pipeline
    N)	dataclassfield)Optional)Path)LeWMTTSc                   @   s"   e Zd ZdZdddZdd ZdS )	VocoderuB   Griffin-Lim → Vocos refinement pipeline for 80-mel → waveform.cpuc                 C   sP   t jjdddd| _t jjddddd	| _d
dlm} |d| _| j	  d S )Ni  P   ]  )n_stftn_melssample_rate      @   g      ?)n_fft
hop_lengthn_iterpowerr   )Vocoszcharactr/vocos-mel-24khz)

torchaudio
transformsInverseMelScaleinv_mel
GriffinLimgriffin_limvocosr   from_pretrainedeval)selfdevicer    r!   )/home/ubuntu/lewm-tts/inference_engine.py__init__   s   zVocoder.__init__c              	   C   s   g }t |jd D ]<}|||d  }t|}| |}| |}t  | |}W d   n1 s6w   Y  ||	d
  q	|S )u=   mel: [B, 80, T] → waveforms list of [T_audio] numpy arrays.r      N)rangeshapetorchexpr   r   no_gradr   appendsqueezenumpy)r   mel	waveformsimm_expspecwavr!   r!   r"   __call__(   s   



zVocoder.__call__N)r   )__name__
__module____qualname____doc__r#   r4   r!   r!   r!   r"   r      s    
r   c                   @   s@   e Zd ZU eed< dZeed< dZeed< dZ	e
ej ed< dS )
TTSRequesttext,  	max_steps        temperatureN
prompt_mel)r5   r6   r7   str__annotations__r<   intr>   floatr?   r   r'   Tensorr!   r!   r!   r"   r9   :   s
   
 r9   c                   @   sP   e Zd ZU ejed< dZeed< dZeed< dZ	e
ed< dd	 Zed
d ZdS )	TTSResultwaveformr
   r   r   steps_takenr=   generation_timec                 C   s&   t | jd}t||| j d S )Nr   )r'   
from_numpyrF   	unsqueezer   saver   )r   pathr3   r!   r!   r"   rK   I   s   zTTSResult.savec                 C   s   t | j| j S N)lenrF   r   )r   r!   r!   r"   durationM   s   zTTSResult.durationN)r5   r6   r7   npndarrayrA   r   rB   rG   rH   rC   rK   propertyrO   r!   r!   r!   r"   rE   B   s   
 
rE   c                   @   sn   e Zd ZdZdejfddZdd Zdd Zdd
dZ			dddZ
			dddZ			dddZdddZdS )InferenceEngineu  
    High-throughput batched TTS inference engine.

    Usage:
        engine = InferenceEngine("checkpoint.pt")

        # Single request
        result = engine.synthesize("नमस्ते भारत")

        # Batched (high throughput)
        results = engine.synthesize_batch(["text1", "text2", ...])

        # With voice prompt
        result = engine.synthesize("text", prompt_audio="ref.wav")

        # Benchmark
        engine.benchmark()
    cudac                 C   s   t t j r	|nd| _|| _t j|| jdd}|d | _t| j| j|| _	| j	
|d  | j	  | jd | _tdd| _d| _td	|  td
| j d|  tdtdd | j	 D d dd d S )Nr   F)map_locationweights_onlyconfigmodeld_model)r    g      @zEngine ready: z
  Device: z	, Dtype: 	  Model: c                 s   s    | ]}|  V  qd S rM   )numel).0pr!   r!   r"   	<genexpr>}       z+InferenceEngine.__init__.<locals>.<genexpr>g    .Az.1fzM params)r'   r    rT   is_availabledtypeloadrW   r   torX   load_state_dictr   rY   r   vocoder_start_scaleprintsum
parameters)r   checkpoint_pathr    ra   ckptr!   r!   r"   r#   h   s   

.zInferenceEngine.__init__c           	      C   s   dd |D }t dd |D }tjt||tj| jd}tjt||tj| jd}t|D ]\}}tj	|tjd||dt|f< d||dt|f< q.t
  | jj||d	}W d   ||fS 1 sgw   Y  ||fS )
u   Encode list of texts → padded embeddings + mask.
        Returns: text_emb [B, T_max, d], text_mask [B, T_max] (True=padding)
        c                 S      g | ]	}t |d qS zutf-8listencoder\   tr!   r!   r"   
<listcomp>       z1InferenceEngine._encode_texts.<locals>.<listcomp>c                 s       | ]}t |V  qd S rM   rN   rq   r!   r!   r"   r^      r_   z0InferenceEngine._encode_texts.<locals>.<genexpr>ra   r    ra   NF	text_mask)maxr'   zerosrN   longr    onesbool	enumeratetensorr)   rX   text_encoder)	r   textstoken_listsmax_lenpaddedmaskr/   tokenstext_embr!   r!   r"   _encode_texts   s    

zInferenceEngine._encode_textsc                 C   sN   | | j | j}t  | j|}W d   |S 1 s w   Y  |S )u   Encode prompt mel → embeddings for AR seeding.
        prompt_mel: [1, 80, T]
        Returns: prompt_embs [1, T_down, d]
        N)rc   r    ra   r'   r)   rX   encode_audio)r   r?   embsr!   r!   r"   _encode_prompt   s   

zInferenceEngine._encode_prompt      @c                 C   sz   t |\}}|dkrt j||d}t|d }|ddd|f }t jjdddddd}||}t|j	d	d
}|S )uO   Load audio file → mel for prompting.
        Returns: mel [1, 80, T]
        r
   Nr$   r   r   r	   g       @)r   r   r   r   r   gh㈵>)min)
r   rb   
functionalresamplerB   r   MelSpectrogramr'   logclamp)r   
audio_pathmax_secondsr3   srmax_samplesmel_transformr-   r!   r!   r"   load_prompt_audio   s   z!InferenceEngine.load_prompt_audior;   r=   Nc                  C   s  | j }| j }tj|tj| jd}	dg| }
dd t|D }dd t|D }|durtdd |D dd	}|dkrtj	|||| j| j
d
}g }t|D ]"\}}|durk|jd }|d ||d|f< || qN|d qNt|D ]8}|dd||d ddf }| jj|||||d\}}t|D ]}||| k r|| |||d   qqu|}| jj|ddddddf ||||d\}}|}|d7 }n	d}d}nd}d}|du rtj|d|| j| j
d
| j }| jj||d||d\}}t|D ]}|| |||d   qd}|dkr|t||  }t|D ]}|| |||d   q!td|D ]}|	 s@ nw| jj|||| ||d\}}|dkr]|t||  }t|D ]Q}|	| sjqa|| |||d   ||
|< ||   }|| | t|| dkr|| dd }t|dt| k r|dkrd|	|< qa|}q7g }t|D ]}tj|| dd}|| q||
fS )a  
        Core batched AR generation with KV-cache.

        Args:
            text_emb: [B, T_text, d]
            text_mask: [B, T_text]
            batch_size: int
            max_steps: int
            temperature: float
            prompt_embs: list of [1, T_prompt, d] or None per request

        Returns:
            all_embs: [B, T_total, d]
            steps_taken: list of int per request
        rw   r   c                 S      g | ]}g qS r!   r!   r\   _r!   r!   r"   rs          z3InferenceEngine._generate_batch.<locals>.<listcomp>c                 S   r   r!   r!   r   r!   r!   r"   rs      r   Nc                 s   s"    | ]}|d ur|j d V  qd S )Nr$   )r&   )r\   er!   r!   r"   r^      s     z2InferenceEngine._generate_batch.<locals>.<genexpr>defaultr    ra   r$   ry      ig{Gz?   F)dim)rY   rX   init_ar_cacher'   r~   r   r    r%   r{   r|   ra   r   r&   r*   predict_next_cachedrandnrf   
randn_likeanynormitemrN   rP   stdmeancat) r   r   rz   
batch_sizer<   r>   prompt_embsdcacheactiverG   all_emb_lists
prev_normsmax_prompt_lenprompt_paddedprompt_lengthsr/   peLrr   step_embr   step_offset	last_predcurrent_emb	start_embstepnext_embr   recentresultsr   r!   r!   r"   _generate_batch   s   














$zInferenceEngine._generate_batchc                 C   s$   | j |g|||r|gnd|dd S )aK  Synthesize single text.

        Args:
            text: Hindi text string
            max_steps: max AR steps
            temperature: sampling temperature (0 = greedy)
            prompt_audio: path to reference audio for voice cloning
            prompt_seconds: max seconds of prompt to use

        Returns: TTSResult
        N)r<   r>   prompt_audiosprompt_secondsr   )synthesize_batch)r   r:   r<   r>   prompt_audior   r!   r!   r"   
synthesize;  s   zInferenceEngine.synthesizec              	   C   sb  t |}t }| |\}}	d}
|dur7g }
|D ]}|dur1| j||d}| |}|
| q|
d qt  | j||	||||
d\}}W d   n1 sSw   Y  g }t	|D ]P}t  | j
|| | j}W d   n1 s{w   Y  | |  }|d }t| }|dkr|| d }|t||| t | d q^|S )aR  Synthesize a batch of texts in parallel.

        Args:
            texts: list of text strings
            max_steps: max AR steps
            temperature: sampling temperature
            prompt_audios: list of audio paths (or None) per text
            prompt_seconds: max seconds of prompt

        Returns: list of TTSResult
        N)r   )r<   r>   r   r   gffffff?)rF   rG   rH   )rN   timer   r   r   r*   r'   r)   r   r%   rX   mel_decoderrc   r    re   rC   r   rP   absr{   rE   )r   r   r<   r>   r   r   Bt0r   rz   r   par-   r   emb_resultsrG   r   r/   wav_listrF   mxr!   r!   r"   r   N  sF   




z InferenceEngine.synthesize_batchc                 C   s  |du rg d}d}d}t dd  t d| j d| j  t d	| jd
  d| jd  d t d|  t d  t ddddddddddddddddddddd t dd dd dd dd dd dd  dd!  |D ]@}z|g| }d"d# |D }td$d% |D }tj||tj| jd&}	tj||tj	| jd&}
t
|D ]\}}tj|tjd'|	|dt|f< d(|
|dt|f< qt  | jj|	|
d)}W d   n1 sw   Y  ||d* kr| j }tj|d+| j| j| jd,| j }|}td-D ]}| jj|||||
d)\}}qtj  | j }tj|d+| j| j| jd,| j }|}tj  t }t|D ]}| jj|||||
d)\}}qBtj  t | }|| | }|||  }|| | }|| }|| }t d|dd|d.d|d/d|d0d|d1d2|d3d2|d4d5 W qw tjjy   t d|ddd6d tj  Y  nw t d  t d7 t d8 t d9 t   dS ):z,Benchmark throughput at various batch sizes.N)r$             r      r   ul   भारत एक महान देश है और हम सब भारतवासी हैं।ggإ?
zF======================================================================u#     LeWM TTS Inference Benchmark — z, rZ   rY   zd, predictor_layerszL predictorz  AR steps: z  Batchz>6 zTime(s)z>8zAudio(s)z>9RTFSpeed
Throughputz>12zPer-reqz>10z------z--------z	---------z------------z
----------c                 S   rl   rm   rn   rq   r!   r!   r"   rs     rt   z-InferenceEngine.benchmark.<locals>.<listcomp>c                 s   ru   rM   rv   rq   r!   r!   r"   r^     r_   z,InferenceEngine.benchmark.<locals>.<genexpr>rw   rx   Fry   r   r$   r      z>8.3fz>9.1fz>8.4fz>7.0fzx z>10.0fz>9.3fsOOMz<  Throughput = total audio-seconds generated per wall-secondz)  Speed = single-request real-time factorz*  Per-req = wall-time per request in batch)rg   r    ra   rW   r{   r'   r|   r}   r~   r   r   r   rN   r)   rX   r   r   r   rY   rf   r%   r   rT   synchronizer   OutOfMemoryErrorempty_cache)r   batch_sizesstepsr:   audio_per_stepr   r   r   r   r   r   r/   r   r   r   startnxtr   r   elapsedtotal_audiortfspeed
throughputper_reqr!   r!   r"   	benchmark  s   "
$2

 







$


zInferenceEngine.benchmark)r   )r;   r=   N)r;   r=   Nr   )Nr;   )r5   r6   r7   r8   r'   float16r#   r   r   r   r   r   r   r   r!   r!   r!   r"   rS   T   s"    


 

?rS   __main__zLeWM TTS Inference Engine)descriptionz--checkpointT)requiredz--textzText to synthesize)r   helpz--texts+z"Multiple texts for batch synthesis)nargsr   z--promptz!Reference audio for voice cloningz--outputz
output.wavr   z--output_dirzOutput dir for batch modez--max_stepsr;   )typer   z--temperaturer=   z--benchmark
store_truezRun throughput benchmark)actionr   z--devicerT   z--fp32zUse FP32 instead of FP16r   )r<   r>   r   output)exist_okbatch_03dz.wavz  [z] u    — z.2fzs, z steps)r<   r>   r   zSaved: z steps, r   )Ar8   r'   torch.nn.functionalnnr   Fr   r,   rP   r   dataclassesr   r   typingr   pathlibr   rX   r   r   r9   rE   rS   r5   argparseArgumentParserparseradd_argumentrB   rC   
parse_argsargsfp32float32r   ra   
checkpointr    enginer   r   r   r<   r>   promptrN   r   
output_dirout_dirmkdirr   r/   rrL   rK   r@   rg   rO   rG   r:   r   resultr   rH   
print_helpr!   r!   r!   r"   <module>   s~        
,