o
    £¬Ái$  ã                   @   sŠ   d Z ddlZddlmZ ddlZddlZddlZddlZddl	m
Z
 ddlmZ G dd„ dƒZG dd„ dƒZd	d
„ ZedkrCeƒ  dS dS )uj   
Inference pipeline for LeWM TTS.
Text â†’ JEPA predictor â†’ mel reconstruction â†’ vocoder â†’ waveform
é    N)ÚPath)ÚLeWMTTSc                   @   s"   e Zd ZdZddd„Zdd	„ Zd
S )ÚGriffinLimVocoderz Griffin-Lim vocoder as fallback.é   é   éP   éÀ]  c                 C   sH   || _ || _|| _tjj|d d ||d| _tjj||ddd| _d S )Né   é   )Ún_stftÚn_melsÚsample_rateé@   g      ð?)Ún_fftÚ
hop_lengthÚn_iterÚpower)	r   r   r   Ú
torchaudioÚ
transformsÚInverseMelScaleÚinverse_melÚ
GriffinLimÚgriffin_lim)Úselfr   r   r   r   © r   ú"/home/ubuntu/lewm-tts/inference.pyÚ__init__   s   
ýüzGriffinLimVocoder.__init__c                 C   s"   t  |¡}|  |¡}|  |¡}|S )N)ÚtorchÚexpr   r   )r   ÚmelÚspecÚwaveformr   r   r   Ú__call__%   s   


zGriffinLimVocoder.__call__N)r   r   r   r   )Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r"   r   r   r   r   r      s    
r   c                   @   s\   e Zd ZdZddd„Zdd„ Ze ¡ dd	d
„ƒZe ¡ dd„ ƒZ	e ¡ dd„ ƒZ
dd„ ZdS )ÚLeWMTTSInferencezFull inference pipeline.Úcudac                 C   s   t  t j ¡ r	|nd¡| _t j|| jdd}|d }t|ƒ | j¡| _| j |d ¡ | j 	¡  t
dddd	d
| _|| _d| _td|› ƒ d S )NÚcpuF)Úmap_locationÚweights_onlyÚconfigÚmodelr   r   r   r   )r   r   r   r   g      @zModel loaded from )r   Údevicer(   Úis_availableÚloadr   Útor-   Úload_state_dictÚevalr   Úvocoderr,   Ú_start_emb_scaleÚprint)r   Úcheckpoint_pathr.   Úckptr,   r   r   r   r   /   s   
ÿzLeWMTTSInference.__init__c                 C   s,   t | d¡ƒ}tj|tjd d¡ | j¡S )Nzutf-8)Údtyper   )ÚlistÚencoder   ÚtensorÚlongÚ	unsqueezer1   r.   )r   ÚtextÚtokensr   r   r   Útext_to_tokensH   s   zLeWMTTSInference.text_to_tokensé,  ç        c                 C   s   |   |¡}| j |¡}| jd }tjdd|| jd| j }|g}g }	t|ƒD ]M}
tj	|dd}| j 
||¡}|dkrEt |¡| }|| }| |¡ | ¡  ¡ }|	 |¡ t|	ƒdkrs|	dd… }t |¡d	t |¡ k rs|
d
krs nq&tj	|dd}| j |¡}|  | ¡ ¡}| ¡  ¡ dfS )z@
        Generate speech from text using AR prediction.
        Úd_modelr
   )r.   ©Údimr   é   iìÿÿÿNg{®Gáz„?é   r   )rA   r-   Útext_encoderr,   r   Úrandnr.   r5   ÚrangeÚcatÚpredict_nextÚ
randn_likeÚappendÚnormÚitemÚlenÚnpÚstdÚmeanÚmel_decoderr4   r)   ÚsqueezeÚnumpy)r   r?   Ú	max_stepsÚtemperatureÚtext_tokensÚtext_embÚdÚ	start_embÚall_embeddingsÚ
prev_normsÚstepÚcontextÚnext_embÚnoiserP   ÚrecentÚ
audio_embsr   r!   r   r   r   Ú
synthesizeL   s0   



 €zLeWMTTSInference.synthesizec                 C   sX   t j|dd d¡ | j¡}| j |¡\}}}| j |¡}|  | 	¡ ¡}| 
¡  ¡ dfS )u   
        Teacher-forced reconstruction: encode real mel â†’ decode back.
        Tests mel decoder + vocoder quality without AR drift.
        T©r+   r   r   )r   r0   r>   r1   r.   r-   Úaudio_encoderrV   r4   r)   rW   rX   )r   Úmel_pathr   ÚzÚmuÚlogvarÚ	mel_reconr!   r   r   r   Úreconstructw   s
   zLeWMTTSInference.reconstructc                 C   sº   t j|dd d¡ | j¡}| j |¡\}}}t jddt j| jd}| j 	|¡}| j 
|dd…dd…f |¡}t j|dd…dd…f |gdd}	| j |	¡}
|  |
 ¡ ¡}| ¡  ¡ d	fS )
zœ
        Hybrid: encode real mel, run predictor on real embeddings,
        decode predicted embeddings. Tests prediction quality without AR drift.
        Trh   r   r
   )r9   r.   NéÿÿÿÿrE   r   )r   r0   r>   r1   r.   r-   ri   Úzerosr=   rI   Ú	predictorrL   rV   r4   r)   rW   rX   )r   rj   r   rk   rl   rm   r[   r\   Ú	predictedÚcombinedrn   r!   r   r   r   Úreconstruct_with_predictionŠ   s   "z,LeWMTTSInference.reconstruct_with_predictionc                 C   sB   t  |¡ d¡}t |||¡ td|› dt|ƒ| d›dƒ d S )Nr   zSaved: z (z.2fzs))r   Ú
from_numpyr>   r   Úsaver6   rR   )r   r!   ÚsrÚoutput_pathÚ
wav_tensorr   r   r   Ú
save_audio¥   s   $zLeWMTTSInference.save_audioN)r(   )rB   rC   )r#   r$   r%   r&   r   rA   r   Úno_gradrg   ro   ru   r{   r   r   r   r   r'   ,   s    
*

r'   c                  C   s"  t  ¡ } | jddd | jdd dd | jdd dd | jd	g d
¢dd | jddd | jdtdd | jdtdd |  ¡ }t|jƒ}|jdkr^|j	sQJ dƒ‚| 
|j	|j|j¡\}}n)|jdkrs|jsjJ dƒ‚| |j¡\}}n|jdkr‡|jsJ dƒ‚| |j¡\}}| |||j¡ d S )Nz--checkpointT)Úrequiredz--textzText to synthesize)ÚdefaultÚhelpz--melz Mel path for reconstruction testz--mode)rg   ro   Úpredictrg   )Úchoicesr~   z--outputz
output.wav)r~   z--max_stepsrB   )Útyper~   z--temperaturerC   zNeed --text for synthesisro   zNeed --mel for reconstructionr€   zNeed --mel for prediction test)ÚargparseÚArgumentParserÚadd_argumentÚintÚfloatÚ
parse_argsr'   Ú
checkpointÚmoder?   rg   rY   rZ   r   ro   ru   r{   Úoutput)ÚparserÚargsÚttsr!   rx   r   r   r   Úmain«   s,   ÿ



r   Ú__main__)r&   r   Útorch.nnÚnnr   Újsonrƒ   rX   rS   Úpathlibr   r-   r   r   r'   r   r#   r   r   r   r   Ú<module>   s    
ÿ