o
    1åÂi,  ã                   @   sl   d Z ddlZddlZddlZddlZddlmZ ddlm	Z	m
Z
 G dd„ dƒZdd„ Zed	kr4eƒ  dS dS )
u¯   
Inference for LeWM TTS v5 (codec-based JEPA with duration predictor).
Text â†’ duration predict â†’ expand text â†’ JEPA predict â†’ proj_out â†’ EnCodec decode â†’ waveform
é    N)ÚPath)Ú	LeWMTTSv5Úlength_regulatec                   @   s~   e Zd Zddd„Zdd„ Ze ¡ ddd	„ƒZe ¡ 	
	ddd„ƒZe ¡ dd„ ƒZ	e ¡ dd„ ƒZ
e ¡ dd„ ƒZdd„ Zd
S )ÚLeWMTTSv5InferenceÚcudac                 C   s¬   t  t j ¡ r	|nd¡| _t j|| jdd}|d }t|ƒ | j¡| _| j |d ¡ | j 	¡  ddl
m} | ¡ | _| j d¡ | j 	¡  | j¡ || _td	|› ƒ d S )
NÚcpuF)Úmap_locationÚweights_onlyÚconfigÚmodelr   )ÚEncodecModelg      @zv5 model loaded from )ÚtorchÚdevicer   Úis_availableÚloadr   Útor   Úload_state_dictÚevalÚencodecr   Úencodec_model_24khzÚcodecÚset_target_bandwidthr
   Úprint)ÚselfÚcheckpoint_pathr   Úckptr
   r   © r   ú%/home/ubuntu/lewm-tts/inference_v5.pyÚ__init__   s   

zLeWMTTSv5Inference.__init__c                 C   s,   t | d¡ƒ}tj|tjd d¡ | j¡S )Nzutf-8)Údtyper   )ÚlistÚencoder   ÚtensorÚlongÚ	unsqueezer   r   )r   ÚtextÚtokensr   r   r   Útext_to_tokens#   s   z!LeWMTTSv5Inference.text_to_tokensç      ð?ç        c                 C   s¾  |   |¡}| j |¡}| j |¡}| ¡ |  ¡  ¡ jdd}| ¡  	¡ }t
dt|ƒ› d|› d|d d›dƒ t||ƒ}| jj | j¡}	| j ¡ }
|	g}|d	d	…d
d…f }| j |	|d
|
¡\}}
|d
kro|t |¡|  }| |¡ td|ƒD ]@}||jd k r|d	d	…||d …f }ntjdd|jd | jd}| j ||||
¡\}}
|d
kr´|t |¡|  }| |¡ qytj|dd}| j |¡}| dd¡}| j |¡}| ¡  ¡  ¡ }|dfS )z8AR synthesis with duration predictor for text alignment.é   ©ÚminzText: u    chars â†’ ú	 frames (éK   ú.2fús)Nr   éÿÿÿÿ©r   ©Údimé   éÀ]  ) r'   r   Útext_encoderÚpredict_durationsÚfloatÚroundr#   ÚclampÚsumÚitemr   Úlenr   Ú	start_embr   r   Úinit_ar_cacheÚpredict_next_cachedr   Ú
randn_likeÚappendÚrangeÚshapeÚzerosÚcatÚproj_outÚ	transposer   ÚdecoderÚsqueezer   Únumpy)r   r%   Úduration_scaleÚtemperatureÚtext_tokensÚtext_embÚ	durationsÚtotal_framesÚtext_emb_expandedr?   ÚcacheÚall_embeddingsÚ
text_frameÚnext_embÚstepÚ	pred_embsÚ	codec_embÚwaveformr   r   r   Ú
synthesize'   s:   
&


zLeWMTTSv5Inference.synthesizeNc                  C   sº  ddl m} t |¡\}}|||ddƒ | j¡}| j | d¡¡}	|	j	d }
| j
 |	 dd¡¡}|r7|| }n|}|  |¡}| j
 |¡}| j
 |¡}| ¡ |  ¡  ¡ jdd}| ¡  ¡ }t||ƒ}t||
 |d ƒ}td|
› d|
d	 d
›d|› d|d	 d
›d|› 
ƒ | j
 ¡ }| j
j | j¡}tj||dd…dd…f gdd}t|
ƒD ]:}|dd…||d …f }||j	d k rÉ|dd…||d …f }ntjdd|j	d | jd}| j
 ||||¡\}}q§|g}|dd…dd…f }t|ƒD ]H}|
| }||j	d k r|dd…||d …f }ntjdd|j	d | jd}| j
 ||||¡\}}|dkr4|t  |¡|  }| !|¡ |}qótj|dd}| j
 "|¡ dd¡}| j #|¡ $¡  %¡  &¡ }|dfS )zCAR synthesis with audio prompt as prefix for voice/style reference.r   ©Úconvert_audior6   r*   r5   r+   zPrompt: r-   r.   r/   zs) | Generate: zs) | Total text frames: Nr1   r3   r2   )'Úencodec.utilsr^   Ú
torchaudior   r   r   r   Úencoderr$   rE   r   Úproj_inrI   r'   r7   r8   r9   r:   r#   r;   r<   r=   r   Úmaxr   r@   r?   r   rG   rD   rF   rA   rB   rC   rH   rJ   rK   r   rL   ) r   r%   Úprompt_audio_pathÚprompt_textrM   rN   r^   ÚwavÚsrÚprompt_codecÚT_promptÚprompt_zÚ	full_textrO   rP   rQ   rR   rS   Ú
gen_framesrT   r?   Úprompt_inputrX   Ú	frame_embrV   rW   rU   Úcur_embÚabs_steprY   rZ   r[   r   r   r   Úsynthesize_promptedY   s`   



ÿÿþ
"

z&LeWMTTSv5Inference.synthesize_promptedc           	      C   sŠ   ddl m} t |¡\}}|||ddƒ | j¡}| j | d¡¡}| j	 
| dd¡¡}| j	 |¡ dd¡}| j |¡ ¡  ¡  ¡ }|dfS )uJ   Encode audio â†’ proj_in â†’ proj_out â†’ decode. Tests roundtrip quality.r   r]   r6   r*   r5   )r_   r^   r`   r   r   r   r   ra   r$   r   rb   rI   rH   rJ   rK   r   rL   )	r   Ú
audio_pathr^   rf   rg   rZ   ÚzÚcodec_reconr[   r   r   r   Úreconstruct¢   s   zLeWMTTSv5Inference.reconstructc                 C   sb   ddl m} t |¡\}}|||ddƒ | j¡}| j | d¡¡}| j 	|¡ 
¡  ¡  ¡ }|dfS )z3Pure EnCodec roundtrip (no JEPA). Baseline quality.r   r]   r6   r*   )r_   r^   r`   r   r   r   r   ra   r$   rJ   rK   r   rL   )r   rr   r^   rf   rg   rZ   r[   r   r   r   Úreconstruct_codec_only¯   s   z)LeWMTTSv5Inference.reconstruct_codec_onlyc                 C   s¨  ddl m} ddlm} t |¡\}}|||ddƒ | j¡}| j 	| 
d¡¡}|jd }| j | dd¡¡}	|  |¡}
| j |
¡}tj|
jd g| jd}tj|g| jd}|||ƒ}t||ƒ}|jd |krr|dd…d|…f }n"|jd |k r”tjd||jd  |jd	 | jd}tj||gdd
}|	jd }| jj |d	d	¡}tj||	dd…dd	…f gdd
}| j ||¡}| j |¡ dd¡}| j |¡ ¡  ¡  ¡ }|dfS )u`   Encode audio â†’ proj_in â†’ teacher-forced predict with text alignment â†’ proj_out â†’ decode.r   r]   )Úcompute_uniform_durationsr6   r*   r5   r2   Nr1   r3   )r_   r^   Úmodel_v5rw   r`   r   r   r   r   ra   r$   rE   r   rb   rI   r'   r7   r   r"   r   rF   rG   r?   ÚexpandÚ	predictorrH   rJ   rK   r   rL   )r   rr   r%   r^   rw   rf   rg   rZ   ÚTrs   rO   rP   Útext_lenÚ	audio_lenrQ   rS   ÚpadÚBÚstartÚ	input_embÚ	predictedrt   r[   r   r   r   Úreconstruct_with_predictionº   s6   



ÿ
"z.LeWMTTSv5Inference.reconstruct_with_predictionc                 C   sd   t  |¡ ¡ }|dkr|| d }t |¡ d¡}t |||¡ td|› dt	|ƒ| d›dƒ d S )Nr   gffffffî?zSaved: z (r/   r0   )
ÚnpÚabsrc   r   Ú
from_numpyr$   r`   Úsaver   r>   )r   r[   rg   Úoutput_pathÚmxÚ
wav_tensorr   r   r   Ú
save_audioÞ   s   $zLeWMTTSv5Inference.save_audio)r   )r(   r)   )Nr(   r)   )Ú__name__Ú
__module__Ú__qualname__r   r'   r   Úno_gradr\   rq   ru   rv   rƒ   r‹   r   r   r   r   r      s     
1ÿH



#r   c                  C   sT  t  ¡ } | jddd | jdd d | jdd dd | jd	g d
¢dd | jddd | jdtdd | jdtdd |  ¡ }t|jƒ}|jdkr]|jsPJ dƒ‚| 	|j|j
|j¡\}}nC|jdkrr|jsiJ dƒ‚| |j¡\}}n.|jdkr‡|js~J dƒ‚| |j¡\}}n|jdkr |jr’|js–J dƒ‚| |j|j¡\}}| |||j¡ d S )Nz--checkpointT)Úrequiredz--text)Údefaultz--audiozAudio path for reconstruction)r‘   Úhelpz--mode)r\   ru   Ú
codec_onlyÚpredictr\   )Úchoicesr‘   z--outputzoutput_v5.wavz--duration_scaler(   )Útyper‘   z--temperaturer)   zNeed --textru   zNeed --audior“   r”   zNeed --audio and --text)ÚargparseÚArgumentParserÚadd_argumentr9   Ú
parse_argsr   Ú
checkpointÚmoder%   r\   rM   rN   Úaudioru   rv   rƒ   r‹   Úoutput)ÚparserÚargsÚttsr[   rg   r   r   r   Úmainç   s2   ÿ




r¢   Ú__main__)Ú__doc__r   r`   rL   r„   r—   Úpathlibr   rx   r   r   r   r¢   rŒ   r   r   r   r   Ú<module>   s     Y
ÿ