o
    d¸Áiˆ  ã                   @   sF  d Z ddlZddlZddlZddlZddlZddlmZ G dd„ dƒZ	e
dkr¡e ¡ Zejddd	 ejd
dd ejddd ejdg d¢dd ejddd ejdedd ejdedd e ¡ Ze	ejƒZejdkr|e ejejej¡\ZZnejdkrŠe ej¡\ZZnejdkr—e ej¡\ZZe eeej¡ dS dS )z&Inference for LeWM TTS v2 (DAC-based).é    N)ÚLeWMTTSc                   @   sX   e Zd Zddd„Zdd„ Ze ¡ ddd	„ƒZe ¡ d
d„ ƒZe ¡ dd„ ƒZ	dd„ Z
dS )ÚLeWMTTSInferenceÚcudac                 C   s   t  t j ¡ r	|nd¡| _t j|| jdd}|d }t|ƒ | j¡| _| j |d ¡ | j 	¡  || _
tdƒ tjjdd}tj |¡| _| j 	¡  | j | j¡| _t  ¡ ' t  d	d	d
¡ | j¡}| j |d
¡}| j |¡}| ¡  ¡ | _W d   ƒ n1 s€w   Y  td| jd›ƒ d S )NÚcpuF)Úmap_locationÚweights_onlyÚconfigÚmodelzLoading DAC 24kHz...Ú24khz)Ú
model_typeé   éÀ]  zModel loaded. Start scale: z.3f)ÚtorchÚdevicer   Úis_availableÚloadr   Útor	   Úload_state_dictÚevalr   ÚprintÚdacÚutilsÚdownloadÚDACÚ	dac_modelÚno_gradÚrandnÚ
preprocessÚencoderÚstdÚitemÚ_start_scale)ÚselfÚcheckpoint_pathr   Úckptr   Ú
model_pathÚdummyÚz© r(   ú%/home/ubuntu/lewm-tts/v2/inference.pyÚ__init__   s&   


üzLeWMTTSInference.__init__c                 C   s,   t | d¡ƒ}tj|tjd d¡ | j¡S )Nzutf-8)Údtyper   )ÚlistÚencoder   ÚtensorÚlongÚ	unsqueezer   r   )r"   ÚtextÚtokensr(   r(   r)   Útext_to_tokens#   s   zLeWMTTSInference.text_to_tokenséô  ç        c                 C   s,  |   |¡}| j |¡}tjddd| jd| j }| j |¡}|g}g }	t|ƒD ]K}
tj	|dd}| j 
||¡}|dkrD|t |¡|  }| |¡ | ¡  ¡ }|	 |¡ t|	ƒdkrr|	dd … }t |¡dt |¡ k rr|
d	krr nq'tj	|dd}| j |¡}| dd
¡}| j |¡}| ¡  ¡  ¡ dfS )Nr   i   )r   )Údimr   é   iìÿÿÿg{®Gáz„?é   é   r   )r3   r	   Útext_encoderr   r   r   r!   Údac_in_projÚrangeÚcatÚpredict_nextÚ
randn_likeÚappendÚnormr    ÚlenÚnpr   ÚmeanÚlatents_to_dacÚ	transposer   ÚdecodeÚsqueezer   Únumpy)r"   r1   Ú	max_stepsÚtemperatureÚtext_tokensÚtext_embÚ	start_dacÚ	start_embÚall_embsÚ
prev_normsÚstepÚcontextÚnext_embrA   ÚrecentÚ
audio_embsÚdac_latentsÚwaveformr(   r(   r)   Ú
synthesize'   s0   


 €zLeWMTTSInference.synthesizec           	      C   sŠ   t  |¡\}}| d¡ | j¡}| j ||¡}| j |¡}| dd¡}| j	 
|¡}| j	 |¡}| dd¡}| j |¡}| ¡  ¡  ¡ dfS )u4   DAC encode â†’ project through model â†’ DAC decode.r   r   r9   r   )Ú
torchaudior   r0   r   r   r   r   r   rF   r	   r;   rE   rG   rH   r   rI   )	r"   Ú
audio_pathÚwavÚsrr'   Úz_inÚhÚz_outrX   r(   r(   r)   ÚreconstructG   s   zLeWMTTSInference.reconstructc                 C   sZ   t  |¡\}}| d¡ | j¡}| j ||¡}| j |¡}| j |¡}| 	¡  
¡  ¡ dfS )u-   Pure DAC encode â†’ decode. Baseline quality.r   r   )rZ   r   r0   r   r   r   r   r   rG   rH   r   rI   )r"   r[   r\   r]   r'   rX   r(   r(   r)   Úreconstruct_directW   s   z#LeWMTTSInference.reconstruct_directc                 C   sB   t  |¡ d¡}t |||¡ td|› dt|ƒ| d›dƒ d S )Nr   zSaved: z (z.2fzs))r   Ú
from_numpyr0   rZ   Úsaver   rB   )r"   rX   r]   Úoutput_pathÚ
wav_tensorr(   r(   r)   Ú
save_audioa   s   $zLeWMTTSInference.save_audioN)r   )r4   r5   )Ú__name__Ú
__module__Ú__qualname__r*   r3   r   r   rY   ra   rb   rg   r(   r(   r(   r)   r      s    


	r   Ú__main__z--checkpointT)Úrequiredz--text)Údefaultz--audioz--mode)rY   ra   Údac_baselinerY   )Úchoicesrm   z--outputz
output.wavz--max_stepsr4   )Útyperm   z--temperaturer5   ra   rn   ) Ú__doc__r   rZ   r   rI   rC   Úargparser	   r   r   rh   ÚArgumentParserÚparserÚadd_argumentÚintÚfloatÚ
parse_argsÚargsÚ
checkpointÚttsÚmoderY   r1   rJ   rK   r\   r]   ra   Úaudiorb   rg   Úoutputr(   r(   r(   r)   Ú<module>   s:    \ÿ



í