o
    a_Êi)˜  ã                   @   sú   d Z ddlZddlZddlZddlmZ ddlm  mZ ddl	Z	G dd„ dej
ƒZG dd„ dej
ƒZG dd„ dej
ƒZG d	d
„ d
ej
ƒZG dd„ dej
ƒZG dd„ dej
ƒZG dd„ dej
ƒZG dd„ dej
ƒZG dd„ dej
ƒZdd„ Zddd„ZdS )ub  
LeWM TTS Model â€” JEPA-based Text-to-Speech

Architecture:
  - TextEncoder: Character-level transformer for Hindi/Devanagari text
  - AudioEncoder: 1D CNN + Transformer that encodes mel spectrograms to embeddings
    - Returns intermediate layer outputs for multi-scale targets
  - JEPAPredictor: Transformer that predicts next audio embedding given text + audio context
  - SpeakerConditioner: FiLM-based speaker conditioning (scale + shift)
  - MelDecoder: FiLM-conditioned decoder with speaker-aware residual blocks
  - Losses: prediction (MSE + cosine), KL, reconstruction, spectral, speaker consistency
é    Nc                       s(   e Zd ZdZ‡ fdd„Zdd„ Z‡  ZS )ÚFiLMConditioneruà   Feature-wise Linear Modulation: x â†’ gamma * x + beta.
    Learns per-speaker scale (gamma) and shift (beta) from a conditioning vector.
    Much more expressive than simple addition â€” can reshape the entire distribution.c                    sV   t ƒ  ¡  t ||d ¡| _tjj| jjdd tj | jj	¡ d| jj	j
d |…< d S )Né   ç{®Gáz”?)Ústdç      ð?)ÚsuperÚ__init__ÚnnÚLinearÚprojÚinitÚnormal_ÚweightÚzeros_ÚbiasÚdata)ÚselfÚcond_dimÚchannels©Ú	__class__© ú/home/ubuntu/lewm-tts/model.pyr      s
   
zFiLMConditioner.__init__c                 C   sr   |   |¡}|jddd\}}| ¡ dkr3|jd |jd kr)| d¡}| d¡}n
| d¡}| d¡}|| | S )uÃ   
        Args:
            x: [B, T, d] or [B, d, T] (set channel_last accordingly)
            cond: [B, d_cond] â€” speaker conditioning vector
        Returns: modulated x, same shape
        r   éÿÿÿÿ©Údimé   é   )r   Úchunkr   ÚshapeÚ	unsqueeze)r   ÚxÚcondÚparamsÚgammaÚbetar   r   r   Úforward&   s   



zFiLMConditioner.forward©Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r&   Ú__classcell__r   r   r   r   r      s    	r   c                       s,   e Zd ZdZd
‡ fdd„	Zddd	„Z‡  ZS )ÚTextEncoderz3Character-level transformer encoder for Hindi text.é   é   çš™™™™™¹?c                    st   t ƒ  ¡  || _t ||¡| _t|dd| _t |¡| _	tj
|||d |ddd}tj||d| _t ||¡| _d S )Ni   ©Úmax_lenr/   TÚgelu©Úd_modelÚnheadÚdim_feedforwardÚdropoutÚbatch_firstÚ
activation©Ú
num_layers)r   r   r5   r	   Ú	EmbeddingÚ
char_embedÚSinusoidalPositionalEncodingÚ	pos_embedÚDropoutr8   ÚTransformerEncoderLayerÚTransformerEncoderÚtransformerr
   r   )r   Ú
vocab_sizer5   r6   r<   r8   Úencoder_layerr   r   r   r   A   s   
üzTextEncoder.__init__Nc                 C   sF   |   |¡t | j¡ }|  |¡}|  |¡}| j||d}|  |¡}|S )zÔ
        Args:
            text_tokens: [B, T_text] long tensor of byte-level token IDs
            text_mask: [B, T_text] bool, True = padding
        Returns:
            text_emb: [B, T_text, d_model]
        ©Úsrc_key_padding_mask)r>   ÚmathÚsqrtr5   r@   r8   rD   r   )r   Útext_tokensÚ	text_maskr!   r   r   r   r&   S   s   


zTextEncoder.forward)r.   r.   r/   r/   r0   ©Nr'   r   r   r   r   r-   >   s    r-   c                       s0   e Zd ZdZ		d‡ fdd„	Zdd
d„Z‡  ZS )ÚAudioEncoderzÑ
    Encodes mel spectrograms into a sequence of audio embeddings.
    Uses 1D CNN for downsampling + Transformer for contextualization.
    Returns intermediate layer outputs for multi-scale EMA targets.
    éd   r.   r/   r0   c                    s(  t ƒ  ¡  ˆ | _|| _|| _|dkr/t tj|ˆ dddt ¡ tjˆ ˆ ddddt ¡ ¡| _	n(t tj|ˆ dddt ¡ tjˆ ˆ ddddt ¡ tjˆ ˆ ddddt ¡ ¡| _	t
ˆ dd	| _t ˆ¡| _t ‡ ‡‡fd
d„t|ƒD ƒ¡| _t ‡ fdd„t|ƒD ƒ¡| _t ˆ ˆ ¡| _t ˆ ˆ ¡| _d S )Nr   é   r   ©Úkernel_sizeÚpaddingr/   r   ©rR   ÚstriderS   é   r1   c              
      s&   g | ]}t jˆ ˆˆ d  ˆddd‘qS )r/   Tr3   r4   )r	   rB   ©Ú.0Ú_©r5   r8   r6   r   r   Ú
<listcomp>‰   s    úüÿz)AudioEncoder.__init__.<locals>.<listcomp>c                    s   g | ]}t  ˆ ¡‘qS r   )r	   Ú	LayerNormrW   )r5   r   r   r[   ”   s    
ÿ)r   r   r5   Údownsample_factorr<   r	   Ú
SequentialÚConv1dÚGELUÚconv_prer?   r@   rA   r8   Ú
ModuleListÚrangeÚlayersÚlayer_normsr
   Úproj_muÚproj_logvar)r   Ún_melsr5   r6   r<   r]   r8   r   rZ   r   r   l   s:   
üú	
ù
ÿzAudioEncoder.__init__NFc                 C   s  |   |¡}| dd¡}|dur(|jd }|dd…dd| j…f dd…d|…f }|  |¡}|  |¡}g }tt| j| j	ƒƒD ]\}\}}	|||d}|rR| 
|	|ƒ¡ q=|  |¡}
|  |¡}| jrst d| ¡}t |¡}|
||  }n|
}|r}||
||fS ||
|fS )uã  
        Args:
            mel: [B, n_mels, T_mel] â€” log mel spectrogram
            mel_mask: [B, T_mel] bool, True = padding (before downsampling)
            return_intermediates: if True, return list of per-layer outputs
        Returns:
            z: [B, T_down, d_model] â€” sampled latent embeddings
            mu: [B, T_down, d_model]
            logvar: [B, T_down, d_model]
            intermediates: list of [B, T_down, d_model] (only if return_intermediates)
        r   r   NrG   ç      à?)ra   Ú	transposer   r]   r@   r8   Ú	enumerateÚziprd   re   Úappendrf   rg   ÚtrainingÚtorchÚexpÚ
randn_like)r   ÚmelÚmel_maskÚreturn_intermediatesr!   ÚT_downÚintermediatesÚiÚlayerÚnormÚmuÚlogvarr   ÚepsÚzr   r   r   r&   œ   s.   

(

€



zAudioEncoder.forward)rO   r.   r/   r/   r/   r0   )NFr'   r   r   r   r   rN   e   s    ÿ0rN   c                       sP   e Zd ZdZd‡ fdd„	Zdd	d
„Zdd„ Zddd„Zdd„ Zddd„Z	‡  Z
S )ÚJEPAPredictorzù
    Predicts next audio embedding given:
    - Text context (from TextEncoder)
    - Previous audio embeddings (from AudioEncoder)

    Uses cross-attention to condition on text, and causal self-attention
    over the audio embedding sequence.
    r.   r/   é   r0   c                    sŠ   t ƒ  ¡  || _t ||¡| _t|dd| _t |¡| _	tj
|||d |ddd}tj||d| _t t ||¡t ¡ t ||¡¡| _d S )NrV   r1   r/   Tr3   r4   r;   )r   r   r5   r	   r
   Úaudio_input_projr?   r@   rA   r8   ÚTransformerDecoderLayerÚTransformerDecoderrD   r^   r`   Úoutput_proj)r   r5   r6   r<   r8   Údecoder_layerr   r   r   r   ×   s"   
ü


ýzJEPAPredictor.__init__Nc           	      C   sj   |   |¡}|  |¡}|  |¡}|jd }tjtj|||jtjddd}| j	|||||d}|  
|¡}|S )uu  
        Args:
            audio_emb: [B, T_audio, d_model] â€” audio embeddings (from encoder)
            text_emb: [B, T_text, d_model] â€” text embeddings
            audio_mask: [B, T_audio] bool, True = padding
            text_mask: [B, T_text] bool, True = padding
        Returns:
            predicted: [B, T_audio, d_model] â€” predicted next embeddings
        r   )ÚdeviceÚdtype)Údiagonal)Útgt_maskÚtgt_key_padding_maskÚmemory_key_padding_mask)r€   r@   r8   r   ro   ÚtriuÚonesr…   ÚboolrD   rƒ   )	r   Ú	audio_embÚtext_embÚ
audio_maskrL   r!   ÚTÚcausal_maskÚ	predictedr   r   r   r&   ï   s   




ÿü
zJEPAPredictor.forwardc                 C   sp  |j }|j}|| }|jd }	|jjddd\}
}}|jjddd\}}}t ||
|¡}t |||¡}t |||¡}|durX|jd dkrXtj	||gdd}tj	||gdd}n|}|}| 
|	d||¡ dd¡}| 
|	d||¡ dd¡}| 
|	d||¡ dd¡}t || dd¡¡|d	  }tj|dd}t ||¡}| dd¡ ¡  
|	d|¡}t ||jj|jj¡}|||fS )
z÷Run multi-head attention with KV cache.
        q_input: [B, 1, d], kv_input: [B, 1, d] (new token for K,V)
        cache_k, cache_v: [B, T_prev, d] projected keys/values
        Returns: output [B, 1, d], updated cache_k, updated cache_v
        r   r   r   Nr   r   r   éþÿÿÿri   )Ú	embed_dimÚ	num_headsr   Úin_proj_weightr   Úin_proj_biasÚFÚlinearro   ÚcatÚviewrj   ÚmatmulÚsoftmaxÚ
contiguousÚout_projr   r   )r   ÚmhaÚq_inputÚkv_inputÚcache_kÚcache_vÚdr6   Úhead_dimÚBÚWqÚWkÚWvÚbqÚbkÚbvÚqÚk_newÚv_newÚkÚvÚk_mhÚv_mhÚattnÚoutr   r   r   Ú_cached_mha  s.   

zJEPAPredictor._cached_mhac                 C   sd  |j }|j}|| }	|jd }
|jjddd\}}}|jjddd\}}}t |||¡}|du r?t |||¡}t |||¡}n||}}| |
d||	¡ 	dd¡}| |
d||	¡ 	dd¡}| |
d||	¡ 	dd¡}t
 || 	dd¡¡|	d	  }|durˆ| | d¡ d¡td
ƒ¡}tj|dd}t
 ||¡}| 	dd¡ ¡  |
d|¡}t ||jj|jj¡}|||fS )z€Cross-attention with cached memory K,V (computed once).
        memory_mask: [B, T_text] bool, True = padding (applied as -inf).r   r   r   Nr   r   r   r”   ri   z-inf)r•   r–   r   r—   r   r˜   r™   rš   rœ   rj   ro   r   Úmasked_fillr    Úfloatrž   rŸ   r    r   r   )r   r¡   r¢   Úmemoryr¤   r¥   Úmemory_maskr¦   r6   r§   r¨   r©   rª   r«   r¬   r­   r®   r¯   r²   r³   r´   rµ   r¶   r·   r   r   r   Ú_cached_cross_attn7  s,   


z JEPAPredictor._cached_cross_attnc                 C   s&   dg| dg| dg| dg| dœS )zInitialize empty KV cache.N)Úself_kÚself_vÚcross_kÚcross_vr   ©r   r<   r   r   r   Ú
init_cacheZ  s
   üzJEPAPredictor.init_cachec              
   C   s  |   |¡}|| jjdd…||d …f  }t| jjƒD ]l\}}|  |j|||d | |d | ¡\}	|d |< |d |< | || 	|	¡ ¡}| j
|j|||d | |d | |d\}
|d |< |d |< | || |
¡ ¡}| | | | |¡¡¡¡}| || |¡ ¡}q|  |¡|fS )uà   Process one token with KV cache.
        new_emb: [B, 1, d_model] â€” raw audio embedding (not projected)
        text_mask: [B, T_text] bool, True = padding
        Returns: predicted [B, 1, d_model], updated cache
        Nr   r¾   r¿   rÀ   rÁ   )r¼   )r€   r@   Úperk   rD   rd   r¸   Ú	self_attnÚnorm1Údropout1r½   Úmultihead_attnÚnorm2Údropout2Úlinear2r8   r:   Úlinear1Únorm3Údropout3rƒ   )r   Únew_embr   Ústep_idxÚcacherL   r!   rw   rx   Úsa_outÚca_outÚff_outr   r   r   Úinference_stepc  s   
 ÿþzJEPAPredictor.inference_step)r.   r/   r   r0   )NNrM   )r(   r)   r*   r+   r   r&   r¸   r½   rÃ   rÕ   r,   r   r   r   r   r~   Í   s    	

*#	r~   c                       s&   e Zd Zd‡ fdd„	Zdd„ Z‡  ZS )r?   é    c                    s¬   t ƒ  ¡  t ||¡}tjd|tjd d¡}t t d|d¡ ¡ t 	d¡ |  ¡}t 
|| ¡|d d …dd d…f< t || ¡|d d …dd d…f< |  d| d¡¡ d S )Nr   )r†   r   r   g     ˆÃ@rÄ   )r   r   ro   ÚzerosÚarangerº   r    rp   rI   ÚlogÚsinÚcosÚregister_buffer)r   r5   r2   rÄ   ÚpositionÚdiv_termr   r   r   r   …  s   
(  z%SinusoidalPositionalEncoding.__init__c                 C   s    || j d d …d |jd …f  S )Nr   )rÄ   r   )r   r!   r   r   r   r&   Ž  s    z$SinusoidalPositionalEncoding.forward)rÖ   )r(   r)   r*   r   r&   r,   r   r   r   r   r?   „  s    	r?   c                       s,   e Zd ZdZd‡ fdd„	Zd	dd„Z‡  ZS )
ÚFiLMResConvBlockz…Residual conv block with FiLM speaker conditioning.
    When no speaker conditioning is provided, behaves like a normal ResConvBlock.r   Nc                    s^   t ƒ  ¡  tj||||d d| _tj||||d d| _|d u| _| jr-t||ƒ| _d S d S )Nr   )rS   )	r   r   r	   r_   Úconv1Úconv2Úhas_filmr   Úfilm)r   r   rR   r   r   r   r   r   ˜  s   

ÿzFiLMResConvBlock.__init__c                 C   s<   t  |  |¡¡}|  |¡}| jr|dur|  ||¡}|| S )z+x: [B, C, T], spk_cond: [B, d_cond] or NoneN)r™   r3   rà   rá   râ   rã   )r   r!   Úspk_condÚhr   r   r   r&      s
   
zFiLMResConvBlock.forward)r   NrM   r'   r   r   r   r   rß   ”  s    rß   c                       s,   e Zd ZdZd‡ fdd„	Zdd	d
„Z‡  ZS )Ú
MelDecoderzhDecodes latent embeddings back to mel spectrograms.
    Speaker-conditioned via FiLM in residual blocks.r.   rO   r/   r   c                    s  t ƒ  ¡  |d }|dkr|nd }t t ||¡t ¡ ¡| _|dk| _| jr,t||ƒ| _	|dkrJtj
||dddd| _t|d|d| _d | _d | _n&tj
||dddd| _t|d|d| _tj
||dddd| _t|d|d| _tj||ddd| _t|d	|d| _tj||ddd| _d S )
Nr   r   r/   rT   r   )rR   r   rP   rQ   é   )r   r   r	   r^   r
   r`   r   râ   r   Ú
input_filmÚConvTranspose1dÚup1rß   Úres1Úup2Úres2r_   Úrefine_convÚ
refine_resÚout_conv)r   r5   rh   Úupsample_factorÚ
n_speakersÚhiddenr   r   r   r   r   ­  s,   

þ
zMelDecoder.__init__Nc                 C   s˜   |   |¡}| jr|dur|  ||¡}| dd¡}t |  |¡¡}|  ||¡}| jdur9t |  |¡¡}|  	||¡}t |  
|¡¡}|  ||¡}|  |¡S )uR   z: [B, T_down, d_model], spk_cond: [B, d_model] or None â†’ mel: [B, n_mels, T_up]Nr   r   )r   râ   rè   rj   r™   r3   rê   rë   rì   rí   rî   rï   rð   )r   r}   rä   r!   r   r   r   r&   Ì  s   


zMelDecoder.forward)r.   rO   r/   r   rM   r'   r   r   r   r   ræ   ©  s    ræ   c                       s2   e Zd ZdZd	‡ fdd„	Zdd„ Zdd„ Z‡  ZS )
ÚMultiResolutionSpectralLosszäComputes spectral convergence + log magnitude loss at multiple STFT resolutions.
    This forces the mel decoder to produce sharp, well-defined spectral structure
    rather than blurry averages that Vocos can't decode properly.©)é@   é   rö   )é€   é    rø   )r.   rö   r.   c                    s   t ƒ  ¡  || _d S rM   )r   r   Úresolutions)r   rú   r   r   r   r   ç  s   

z$MultiResolutionSpectralLoss.__init__c                 C   sÄ   t j||jd}|j\}}}	| || |	¡}
| || |	¡}t j|
||||dd}t j|||||dd}| ¡ }| ¡ }t j|| ddt j|ddd  }t 	t  
|d ¡t  
|d ¡¡}|| S )zECompute spectral convergence + log magnitude loss for one resolution.©r…   T)ÚwindowÚreturn_complexÚfro)Úpç:Œ0âŽyE>gH¯¼šò×z>)ro   Úhann_windowr…   r   ÚreshapeÚstftÚabsry   r™   Úl1_lossrÙ   )r   ÚpredÚtargetÚn_fftÚ
hop_lengthÚ
win_lengthrü   r¨   ÚMr‘   Ú	pred_flatÚtarget_flatÚ	pred_stftÚtarget_stftÚpred_magÚ
target_magÚsc_lossÚlog_mag_lossr   r   r   Ú
_stft_lossì  s$   ÿÿ$þz&MultiResolutionSpectralLoss._stft_lossc              	   C   sf   t jd|jd}d}| jD ]\}}}|jd |kr(||  |||||¡ }|d7 }q|dkr1|| }|S )zpred, target: [B, n_mels, T]ç        rû   r   r   r   )ro   Útensorr…   rú   r   r  )r   r  r  ÚlossÚcountr  ÚhopÚwinr   r   r   r&   	  s   €z#MultiResolutionSpectralLoss.forward)rõ   )r(   r)   r*   r+   r   r  r&   r,   r   r   r   r   rô   â  s
    rô   c                       sv   e Zd ZdZ‡ fdd„Zdd„ Zdd„ Ze ¡ dd	„ ƒZ	d
d„ Z
ddd„Zdd„ Zddd„Zdd„ Zddd„Z‡  ZS )ÚLeWMTTSaÐ  
    Complete LeWM TTS model with speaker-preserving JEPA.

    Speaker fidelity is enforced through 4 mechanisms:
      1. FiLM conditioning (scale+shift) on latents and targets
      2. FiLM-conditioned MelDecoder (speaker-aware reconstruction)
      3. Multi-scale EMA targets (retain acoustic detail across layers)
      4. Cosine + MSE prediction loss (preserves direction = speaker identity)
      5. Speaker consistency classifier on raw encoder output
    c                    s:  t ƒ  ¡  | dd¡}| dd¡}| dd¡}| dd¡}| dd¡}| d	d¡}| d
d¡}| dd¡}	| dd¡| _| dd¡}
t|||||	d| _| dd¡}t||||||	d| _t||||	d| _	t
||||
d| _| dd¡| _t | j¡| _| j ¡ D ]}d|_q‚| dd¡| _| dd ¡| _t ||¡| _|
| _|
dkrÊt |
|¡| _t||ƒ| _t||ƒ| _t t ||¡t ¡ t ||
¡¡| _nd | _d | _d | _d | _t  t! "dd|¡d ¡| _#| dd¡| _$| d d¡| _%| d!d"¡| _&| d#d"¡| _'| d$d¡| _(t)ƒ | _*| d%d&¡| _+| d'd¡| _,|| _-d S )(Nr5   r.   r6   r/   rh   rO   Útext_vocab_sizeÚtext_encoder_layersÚaudio_encoder_layersÚpredictor_layersr   r8   r0   Ú	kl_weightg{®Gáz„?rò   r   )rE   r5   r6   r<   r8   r]   )rh   r5   r6   r<   r]   r8   )r5   r6   r<   r8   )r5   rh   rñ   rò   Úrecon_weightr   FÚ	ema_decayçV-²ïï?Úema_target_layersr   Úpred_weightç      $@Úcosine_weightÚinput_noiser  Úscheduled_sampling_rateÚ	free_bitsÚspectral_weightri   Úspeaker_weight).r   r   Úgetr   r-   Útext_encoderrN   Úaudio_encoderr~   Ú	predictorræ   Úmel_decoderr!  ÚcopyÚdeepcopyÚema_audio_encoderÚ
parametersÚrequires_gradr"  r$  r	   r
   Úema_target_projrò   r=   Úspeaker_embedr   Úlatent_filmÚtarget_filmr^   r`   Úspeaker_classifierÚ	Parameterro   ÚrandnÚ	start_embr%  r'  r(  r)  r*  rô   Úspectral_loss_fnr+  r,  Úconfig)r   r@  r5   r6   rh   Ú
text_vocabÚtext_layersÚaudio_layersr  r8   rò   r]   rÿ   r   r   r   r   %  sx   
þýþþ

ý
zLeWMTTS.__init__c                 C   s    | j dur|dur|   |¡S dS )z>Get speaker conditioning vector. Returns [B, d_model] or None.N)r8  )r   Ú
speaker_idr   r   r   Ú_get_speaker_cond‚  s   
zLeWMTTS._get_speaker_condc                 C   s   |dur|dur|||ƒS |S )z7Apply FiLM conditioning to tensor. No-op if no speaker.Nr   )r   r!   rä   Úfilm_moduler   r   r   Ú_apply_speaker_filmˆ  s   
zLeWMTTS._apply_speaker_filmc                 C   sD   t | j ¡ | j ¡ ƒD ]\}}|j | j¡j|jd| j d qdS )z"Update EMA target encoder weights.r   )ÚalphaN)rl   r/  r5  r4  r   Úmul_r"  Úadd_)r   Úp_onlineÚp_emar   r   r   Ú
update_emaŽ  s
   
ÿ"þzLeWMTTS.update_emac                    sœ   t  ¡ ; | j ¡  | j||dd\}}}‰ | j ¡  | jdur+‡ fdd„| jD ƒ}nˆ }t j|ddjdd}W d  ƒ n1 sBw   Y  |  |¡}|S )zÐCompute multi-scale EMA targets by averaging intermediate layer outputs.
        data2vec-style: captures both low-level acoustic detail (early layers)
        and high-level semantic structure (late layers).T)rt   Nc                    s   g | ]}ˆ | ‘qS r   r   )rX   rw   ©rv   r   r   r[   ¢  s    z7LeWMTTS._compute_multiscale_targets.<locals>.<listcomp>r   r   )	ro   Úno_gradr4  ÚevalÚtrainr$  ÚstackÚmeanr7  )r   rr   rs   rY   Úmu_emaÚselectedÚ
target_embr   rN  r   Ú_compute_multiscale_targets•  s   

ÿ

ò
z#LeWMTTS._compute_multiscale_targetsNc           9   	   C   s˜  |   |¡}|  ||¡}|dur|  ||| j¡}|  ||¡\}}	}
|jd }|jd }|	}|  ||| j¡}t ¡  | j 	¡  |  ||¡\}}}| j 
¡  |}W d  ƒ n1 sXw   Y  |  ||| j¡}| j |dd¡}tj||dd…dd…f gdd}| jr©| jdkr©t |¡}tj||d |jd |jd| j |dd…dd…f< || }|durá|dd…dd| jj…f dd…d|…f }tj|dtj|jd}tj||dd…dd…f gdd}|}nd}d}|  ||||¡}| jr6| jdkr6t ¡ * tj|||jd| jk }d|dd…df< | d¡}t || ¡ |¡}W d  ƒ n	1 s)w   Y  |  ||||¡}|dur™|  d¡ ¡ }| ¡ |jd  d	 }|| d
 |  ¡ | }||  d|jd ¡}||  d|jd ¡} |  d¡ d¡dk}!|! !¡ rdt"j#||! | |! dd $¡  }"n)tj%d|jd}"n t" &||¡}dt"j#| d|jd ¡| d|jd ¡dd $¡  }"|| j'|"  }#dd|
 |	 (d
¡ |
 )¡   }$tj*|$| j+ dd}%|dur|	jd }&|dd…dd| jj…f dd…d|&…f }'|'  d¡ ¡ }(|%|(  ¡ |( ¡ |	jd  d	  })n|% $¡ })|  ,||¡}*|  ,| ¡ |¡}+|jd
 },|*jd
 }-t-|,|-|+jd
 ƒ}.|dd…dd…d|.…f }|*dd…dd…d|.…f }*|+dd…dd…d|.…f }+|durm|dd…d|.…f }|dur©|  d¡ ¡ }/t"j.|*|/ ||/ dd|/ ¡ |jd  d	  }0t"j.|+|/ ||/ dd|/ ¡ |jd  d	  }1nt" .|*|¡}0t" .|+|¡}1|0d|1  }0|durË|  /|*|/ ||/ ¡}2n|  /|*|¡}2tj%d|jd}3| j0dur(|dur(|dur|dd…dd| jj…f dd…d|…f }4|4  d¡ ¡ }5||5 jdd|5jddd	  }6n|j$dd}6|  0|6¡}7t" 1|7|¡}3| j2|# | j3|)  | j4|0  | j5|2  | j6|3  }8|8|#||"|)|0|2|3dœS )a2  
        Args:
            mel: [B, n_mels, T_mel]
            text_tokens: [B, T_text]
            mel_mask: [B, T_mel] bool
            text_mask: [B, T_text] bool
            speaker_id: [B] long tensor of speaker IDs (optional)
        Returns:
            loss_dict with all individual losses
        Nr   r   r   r   rû   )r†   r…   Fr   r   r   r  g      à¿)ÚminÚsum)Ú	reductionri   )Ú
total_lossÚprediction_lossÚmse_lossÚcosine_lossÚkl_lossÚ
recon_lossÚspectral_lossÚspeaker_loss)7rE  r.  rG  r9  r/  r   ro   rO  r4  rP  rQ  r:  r>  Úexpandr›   rn   r(  Ú
zeros_liker=  r…   r]   r×   r   r0  r)  Úrandr    ÚwhereÚdetachrº   rY  r  ÚsqueezeÚanyr™   Úcosine_similarityrS  r  r]  r'  Úpowrp   Úclampr*  r1  rX  r  r?  r;  Úcross_entropyr%  r   r!  r+  r,  )9r   rr   rK   rs   rL   rD  rä   r   r}   rz   r{   r¨   ru   Úmu_rawrY   rT  rV  ÚstartÚ	input_embÚnoiseÚds_maskÚ
start_maskÚ	pred_maskÚ	loss_maskr“   Úmask_ssÚmask_ss_expÚmixed_inputÚvalidÚn_validr]  r  Útgt_flatÚ
valid_flatr^  r\  Ú
kl_per_dimÚkl_freeÚ	T_down_klÚ
ds_mask_klÚvalid_klr_  Ú	mel_reconÚmel_pred_reconÚT_melÚT_reconÚT_minÚ	valid_melr`  Úpred_recon_lossra  rb  Úds_mask_spkÚ	valid_spkÚ	mu_pooledÚ
spk_logitsr[  r   r   r   r&   ­  s   





ü"
ÿÿ*"

ú

ÿþýü

*$



ÿþÿþ

*"
ÿþýüøzLeWMTTS.forwardc                 C   s   |   |¡\}}}|S )z0Encode mel to latent embeddings (for inference).)r/  )r   rr   r}   rz   rY   r   r   r   Úencode_audiog  s   zLeWMTTS.encode_audioc                 C   s$   | j |||d}|dd…dd…f S )z.Predict next audio embedding autoregressively.©rL   Nr   )r0  )r   rŽ   r   rL   r“   r   r   r   Úpredict_nextl  s   zLeWMTTS.predict_nextc                 C   s   t | jjjƒ}| j |¡S )z1Initialize KV cache for autoregressive inference.)Úlenr0  rD   rd   rÃ   rÂ   r   r   r   Úinit_ar_cacheq  s   zLeWMTTS.init_ar_cachec                 C   s   | j j|||||dS )zEPredict next embedding using KV cache. O(1) per step instead of O(n).rŽ  )r0  rÕ   )r   rÏ   r   rÐ   rÑ   rL   r   r   r   Úpredict_next_cachedv  s   zLeWMTTS.predict_next_cached)NNNrM   )r(   r)   r*   r+   r   rE  rG  ro   rO  rM  rW  r&   r  r  r‘  r’  r,   r   r   r   r   r    s    ]

 ;
r  c                 C   s   t dd„ |  ¡ D ƒƒS )Nc                 s   s    | ]
}|j r| ¡ V  qd S rM   )r6  Únumel)rX   rÿ   r   r   r   Ú	<genexpr>|  s   € z#count_parameters.<locals>.<genexpr>)rY  r5  )Úmodelr   r   r   Úcount_parameters{  s   r–  c                 C   sT   | d u rdddddddddddd	d
ddœ} t | ƒ}tdt|ƒd d›dƒ || fS )Nr.   r/   rO   r   r0   gš™™™™™©?r&  r   r#  g       @r  )r5   r6   rh   r  r  r  r  r8   r   r%  r'  r"  r*  r(  zLeWM TTS model: g    €„.Az.2fzM parameters)r  Úprintr–  )r@  r•  r   r   r   Úbuild_model  s&   òr˜  rM   )r+   rI   r2  ro   Útorch.nnr	   Útorch.nn.functionalÚ
functionalr™   Ú
torchaudioÚModuler   r-   rN   r~   r?   rß   ræ   rô   r  r–  r˜  r   r   r   r   Ú<module>   s*    &'h 897  d