o
    ó¸Áiþ!  ã                   @   s   d Z ddlZddlZddlmZ ddlm  mZ G dd„ dejƒZ	G dd„ dejƒZ
G dd„ dejƒZG d	d
„ d
ejƒZdd„ Zddd„ZdS )u
  
LeWM TTS v2 â€” DAC-based JEPA Text-to-Speech

Architecture:
  - DAC encoder (frozen): audio â†’ 1024-dim continuous latents @ 75Hz
  - Linear proj: 1024 â†’ 256 (d_model)
  - TextEncoder: byte-level transformer
  - JEPAPredictor: causal transformer decoder with cross-attention to text
  - Linear proj: 256 â†’ 1024 (back to DAC space)
  - DAC decoder (frozen): latents â†’ waveform

  Two losses: next-embedding prediction (MSE) + Gaussian regularizer (KL)
  NO mel decoder needed â€” DAC handles audio reconstruction.
é    Nc                       s&   e Zd Zd‡ fdd„	Zdd„ Z‡  ZS )ÚSinusoidalPositionalEncodingé    c                    s¬   t ƒ  ¡  t ||¡}tjd|tjd d¡}t t d|d¡ ¡ t 	d¡ |  ¡}t 
|| ¡|d d …dd d…f< t || ¡|d d …dd d…f< |  d| d¡¡ d S )Nr   )Údtypeé   é   g     ˆÃ@Úpe)ÚsuperÚ__init__ÚtorchÚzerosÚarangeÚfloatÚ	unsqueezeÚexpÚmathÚlogÚsinÚcosÚregister_buffer)ÚselfÚd_modelÚmax_lenr   ÚpositionÚdiv_term©Ú	__class__© ú!/home/ubuntu/lewm-tts/v2/model.pyr	      s   
(  z%SinusoidalPositionalEncoding.__init__c                 C   s    || j d d …d |jd …f  S )Nr   )r   Úshape)r   Úxr   r   r   Úforward    s    z$SinusoidalPositionalEncoding.forward)r   ©Ú__name__Ú
__module__Ú__qualname__r	   r    Ú__classcell__r   r   r   r   r      s    	r   c                       s(   e Zd Zd	‡ fdd„	Zd
dd„Z‡  ZS )ÚTextEncoderé   é   çš™™™™™¹?c                    st   t ƒ  ¡  || _t ||¡| _t|dd| _t |¡| _	tj
|||d |ddd}tj||d| _t ||¡| _d S )Ni   ©r   r(   TÚgelu©r   ÚnheadÚdim_feedforwardÚdropoutÚbatch_firstÚ
activation©Ú
num_layers)r   r	   r   ÚnnÚ	EmbeddingÚ
char_embedr   Ú	pos_embedÚDropoutr/   ÚTransformerEncoderLayerÚTransformerEncoderÚtransformerÚLinearÚproj)r   Ú
vocab_sizer   r-   r3   r/   Úencoder_layerr   r   r   r	   %   s   
ýzTextEncoder.__init__Nc                 C   sB   |   |¡t | j¡ }|  |¡}|  |¡}| j||d}|  |¡S )N)Úsrc_key_padding_mask)r6   r   Úsqrtr   r7   r/   r;   r=   )r   Útext_tokensÚ	text_maskr   r   r   r   r    4   s
   


zTextEncoder.forward)r'   r'   r(   r(   r)   ©Nr!   r   r   r   r   r&   $   s    r&   c                       s(   e Zd Zd
‡ fdd„	Zddd	„Z‡  ZS )ÚJEPAPredictorr'   r(   é   r)   c                    s„   t ƒ  ¡  t ||¡| _t|dd| _t |¡| _tj	|||d |ddd}tj
||d| _t t ||¡t ¡ t ||¡¡| _d S )Ni   r*   r(   Tr+   r,   r2   )r   r	   r4   r<   Úaudio_input_projr   r7   r8   r/   ÚTransformerDecoderLayerÚTransformerDecoderr;   Ú
SequentialÚGELUÚoutput_proj)r   r   r-   r3   r/   Údecoder_layerr   r   r   r	   =   s   
ý


ýzJEPAPredictor.__init__Nc           	      C   sf   |   |¡}|  |¡}|  |¡}|jd }tjtj|||jtjddd}| j	|||||d}|  
|¡S )Nr   )Údevicer   )Údiagonal)Útgt_maskÚtgt_key_padding_maskÚmemory_key_padding_mask)rG   r7   r/   r   r
   ÚtriuÚonesrN   Úboolr;   rL   )	r   Ú	audio_embÚtext_embÚ
audio_maskrC   r   ÚTÚcausal_maskÚ	predictedr   r   r   r    O   s   



ÿü
zJEPAPredictor.forward)r'   r(   rF   r)   ©NNr!   r   r   r   r   rE   <   s    rE   c                       s<   e Zd ZdZ‡ fdd„Zddd„Zddd„Zd	d
„ Z‡  ZS )ÚLeWMTTSu  
    LeWM TTS v2 with DAC latents.

    Training:
      - DAC encodes audio â†’ 1024-dim latents (frozen)
      - Project to d_model, predict next embedding, KL regularize
    Inference:
      - Text â†’ predict DAC latents autoregressively â†’ DAC decode â†’ waveform
    c           	         sâ   t ƒ  ¡  | dd¡}| dd¡}| dd¡}| dd¡}| dd¡}| d	d
¡}| dd¡}| dd¡| _| dd¡| _t ||¡| _t ||¡| _t ||¡| _	t ||¡| _
t|||||d| _t||||d| _|| _d S )Nr   r'   r-   r(   Údac_dimé   Útext_vocab_sizeÚtext_encoder_layersÚpredictor_layersrF   r/   r)   Ú	kl_weightÚrecon_weightg      $@)r>   r   r-   r3   r/   )r   r-   r3   r/   )r   r	   Úgetrc   rd   r4   r<   Údac_in_projÚdac_out_projÚproj_muÚproj_logvarr&   Útext_encoderrE   Ú	predictorÚconfig)	r   rl   r   r-   r^   Ú
text_vocabÚtext_layersrb   r/   r   r   r   r	   m   s.   
þþ
zLeWMTTS.__init__Nc                 C   s  |   |¡}|  |¡}|  |¡}| jr#t d| ¡}|t |¡|  }	n|}	|  ||¡}
|	dd…dd…f }|	dd…dd…f }|durN|dd…dd…f }nd}|  ||
||¡}|durz|  	d¡}t
j|| || dd| ¡ |jd  d  }nt
 ||¡}|dur­|  	d¡ ¡ }dd| | d	¡ | ¡   }||  ¡ | ¡ |jd  d  }ndt d| | d	¡ | ¡  ¡ }|  |¡}|durè|  	d¡ ¡ }t
j|| || dd| ¡ |jd  d  }nt
 ||¡}|| j|  | j|  }||||d
œS )u  
        Args:
            dac_latents: [B, T, 1024] â€” continuous DAC encoder output
            text_tokens: [B, T_text] â€” byte-level text tokens
            latent_mask: [B, T] bool, True = padding
            text_mask: [B, T_text] bool, True = padding
        g      à?Néÿÿÿÿr   Úsum)Ú	reductiong:Œ0âŽyE>g      à¿r   )Ú
total_lossÚprediction_lossÚkl_lossÚ
recon_loss)rf   rh   ri   Útrainingr
   r   Ú
randn_likerj   rk   r   ÚFÚmse_lossrp   r   r   ÚpowÚmeanrg   rc   rd   )r   Údac_latentsrB   Úlatent_maskrC   ÚhÚmuÚlogvarÚstdÚzrW   Ú	input_embÚ
target_embÚ	pred_maskr[   Úvalidrs   Úklrt   Ú	dac_reconru   rr   r   r   r   r    Œ   sT   
	

ÿþ$$
ÿþüzLeWMTTS.forwardc                 C   s$   | j |||d}|dd…dd…f S )z2AR inference: predict next embedding from context.)rC   Nro   )rk   )r   rV   rW   rC   r[   r   r   r   Úpredict_nextÕ   s   zLeWMTTS.predict_nextc                 C   s
   |   |¡S )z8Project model-space embeddings back to DAC latent space.)rg   )r   r‚   r   r   r   Úlatents_to_dacÚ   s   
zLeWMTTS.latents_to_dacr\   rD   )	r"   r#   r$   Ú__doc__r	   r    r‰   rŠ   r%   r   r   r   r   r]   b   s    


Ir]   c                 C   s   t dd„ |  ¡ D ƒƒS )Nc                 s   s    | ]
}|j r| ¡ V  qd S rD   )Úrequires_gradÚnumel)Ú.0Úpr   r   r   Ú	<genexpr>à   s   € z#count_parameters.<locals>.<genexpr>)rp   Ú
parameters)Úmodelr   r   r   Úcount_parametersß   s   r“   c              	   C   sH   | d u rdddddddddœ} t | ƒ}tdt|ƒd d	›d
ƒ || fS )Nr'   r(   r_   rF   r)   )r   r-   r^   r`   ra   rb   r/   rc   zLeWM TTS v2 (DAC): g    €„.Az.2fzM trainable parameters)r]   Úprintr“   )rl   r’   r   r   r   Úbuild_modelã   s   ø
r•   rD   )r‹   r   r
   Útorch.nnr4   Útorch.nn.functionalÚ
functionalrx   ÚModuler   r&   rE   r]   r“   r•   r   r   r   r   Ú<module>   s    &}