o
    <šÊiwg  ã                   @   sò   d Z ddlZddlZddlZddlmZ ddlm  mZ G dd„ dej	ƒZ
G dd„ dej	ƒZG dd„ dej	ƒZG d	d
„ d
ej	ƒZG dd„ dej	ƒZG dd„ dej	ƒZG dd„ dej	ƒZG dd„ dej	ƒZG dd„ dej	ƒZdd„ Zddd„ZdS )u—  
LeWM TTS v6 â€” JEPA + VQ: Discrete token prediction for stable AR synthesis.

Key insight: continuous AR prediction accumulates errors every step â†’ noise.
Discrete tokens can't drift â€” a predicted token is always a valid codebook entry.

Architecture:
  - TextEncoder: byte-level transformer (same as before)
  - AudioEncoder: CNN + Transformer â†’ continuous embeddings (same as before)
  - VectorQuantizer: continuous â†’ discrete codebook indices + quantized embeddings
  - JEPAPredictor: predicts next codebook INDEX (classification, not regression)
  - MelDecoder: codebook embeddings â†’ mel spectrogram
  - Vocos: mel â†’ waveform (external, frozen)
é    Nc                       s2   e Zd ZdZd‡ fdd„	Zdd	„ Zd
d„ Z‡  ZS )ÚVectorQuantizerz¶
    Discretizes continuous embeddings into codebook indices.
    Uses EMA codebook updates (no codebook loss needed in optimizer).
    Straight-through estimator for gradients.
    é   é   ç      Ð?ç®Gáz®ï?c                    sd   t ƒ  ¡  || _|| _|| _|| _|  dt ||¡¡ |  dt 	|¡¡ |  d| j
 ¡ ¡ d| _d S )NÚcodebookÚ	ema_countÚema_sumF)ÚsuperÚ__init__Úd_modelÚn_codesÚcommitment_weightÚ	ema_decayÚregister_bufferÚtorchÚrandnÚonesr   ÚcloneÚ_initialized)Úselfr   r   r   r   ©Ú	__class__© ú!/home/ubuntu/lewm-tts/model_vq.pyr       s   

zVectorQuantizer.__init__c                 C   sh   | j rdS t|jd | jƒ}tj|jd |jdd|… }||  ¡ | jd|…< | j	 
| j¡ d| _ dS )z;Initialize codebook from first batch using k-means++ style.Nr   ©ÚdeviceT)r   ÚminÚshaper   r   Úrandpermr   Údetachr   r	   Úcopy_)r   Úflat_zÚnÚindicesr   r   r   Ú_init_codebook-   s   
zVectorQuantizer._init_codebookc                 C   s  |j \}}}| d|¡}| jr| js|  |¡ | d¡jdddd| | j ¡   | j d¡jddd ¡  }|j	dd}| j| }| jrÞt
 ¡  t || j¡ ¡ }	|	jdd}
|	 ¡ | }| j | j¡j|
d| j d | j | j¡j|d| j d | j ¡ }| jd	 || jd	   | }| j | j| d¡ ¡ |
dk}| ¡  ¡ }|dkrÏt
jd|j d |f|jd
}||  ¡ | j|< ||  ¡ | j|< d| j|< W d  ƒ n1 sÙw   Y  t || ¡ ¡}|||  ¡  }| |||¡}| ||¡}||| j| fS )a  
        Args:
            z: [B, T, d_model] continuous embeddings
        Returns:
            z_q: [B, T, d_model] quantized embeddings (straight-through)
            indices: [B, T] codebook indices
            commit_loss: scalar commitment loss
        éÿÿÿÿé   é   T)ÚdimÚkeepdim©r)   r   )Úalphagñhãˆµøä>r   ç      ð?N)r   ÚreshapeÚtrainingr   r%   ÚpowÚsumr   ÚtÚargminr   Úno_gradÚFÚone_hotr   Úfloatr   Úmul_r   Úadd_r	   r!   Ú	unsqueezeÚitemÚrandintr   r    Úmse_lossr   )r   ÚzÚBÚTÚDr"   Údistsr$   Úz_qr6   ÚcountsÚsumsr#   Úcount_smoothÚdeadÚn_deadÚrand_idxÚcommit_lossr   r   r   Úforward7   sH   	
ÿþÿ



€ëzVectorQuantizer.forward)r   r   r   r   )Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r%   rK   Ú__classcell__r   r   r   r   r      s
    
r   c                       ó&   e Zd Zd‡ fdd„	Zdd„ Z‡  ZS )ÚSinusoidalPositionalEncodingé    c                    s¬   t ƒ  ¡  t ||¡}tjd|tjd d¡}t t d|d¡ ¡ t 	d¡ |  ¡}t 
|| ¡|d d …dd d…f< t || ¡|d d …dd d…f< |  d| d¡¡ d S )Nr   )Údtyper(   r'   g     ˆÃ@Úpe)r
   r   r   ÚzerosÚaranger7   r:   ÚexpÚmathÚlogÚsinÚcosr   )r   r   Úmax_lenrU   ÚpositionÚdiv_termr   r   r   r   y   s   
(  z%SinusoidalPositionalEncoding.__init__c                 C   s    || j d d …d |jd …f  S )Nr(   )rU   r   ©r   Úxr   r   r   rK   ‚   s    z$SinusoidalPositionalEncoding.forward)rS   ©rL   rM   rN   r   rK   rP   r   r   r   r   rR   x   s    	rR   c                       s(   e Zd Zd	‡ fdd„	Zd
dd„Z‡  ZS )ÚTextEncoderr   é   çš™™™™™¹?c                    st   t ƒ  ¡  || _t ||¡| _t|dd| _t |¡| _	tj
|||d |ddd}tj||d| _t ||¡| _d S )Ni   ©r]   rd   TÚgelu©r   ÚnheadÚdim_feedforwardÚdropoutÚbatch_firstÚ
activation©Ú
num_layers)r
   r   r   ÚnnÚ	EmbeddingÚ
char_embedrR   Ú	pos_embedÚDropoutrk   ÚTransformerEncoderLayerÚTransformerEncoderÚtransformerÚLinearÚproj)r   Ú
vocab_sizer   ri   ro   rk   Úencoder_layerr   r   r   r   ‡   s   

þzTextEncoder.__init__Nc                 C   sB   |   |¡t | j¡ }|  |¡}|  |¡}| j||d}|  |¡S )N©Úsrc_key_padding_mask)rr   rY   Úsqrtr   rs   rk   rw   ry   )r   Útext_tokensÚ	text_maskra   r   r   r   rK   ”   s
   


zTextEncoder.forward)r   r   rd   rd   re   ©Nrb   r   r   r   r   rc   †   s    rc   c                       s,   e Zd Z		d
‡ fdd„	Zddd	„Z‡  ZS )ÚAudioEncoderéd   r   rd   re   c                    sþ   t ƒ  ¡  || _|| _|dkr,t tj||dddt ¡ tj||ddddt ¡ ¡| _n(t tj||dddt ¡ tj||ddddt ¡ tj||ddddt ¡ ¡| _t	|dd	| _
t |¡| _tj|||d |d
dd}tj||d| _t ||¡| _d S )Nr'   é   é   ©Úkernel_sizeÚpaddingrd   r(   ©r‡   Ústriderˆ   é   rf   Trg   rh   rn   )r
   r   r   Údownsample_factorrp   Ú
SequentialÚConv1dÚGELUÚconv_prerR   rs   rt   rk   ru   rv   rw   rx   ry   )r   Ún_melsr   ri   ro   rŒ   rk   r{   r   r   r   r      s*   
þý
þzAudioEncoder.__init__Nc                 C   s|   |   |¡}| dd¡}|d ur(|jd }|d d …d d | j…f d d …d |…f }|  |¡}|  |¡}| j||d}|  |¡S )Nr(   r'   r|   )r   Ú	transposer   rŒ   rs   rk   rw   ry   )r   ÚmelÚmel_maskra   ÚT_downr   r   r   rK   ¸   s   

(


zAudioEncoder.forward)rƒ   r   rd   rd   rd   re   r   rb   r   r   r   r   r‚   œ   s
    ÿr‚   c                       sP   e Zd ZdZd‡ fdd„	Zdd
d„Zdd„ Zddd„Zdd„ Zddd„Z	‡  Z
S )ÚTokenPredictorz«
    Causal transformer that predicts next CODEBOOK INDEX.
    Input: quantized embeddings (codebook lookups)
    Output: logits over codebook entries [B, T, n_codes]
    r   rd   é   r   re   c                    s   t ƒ  ¡  || _|| _t ||¡| _t|dd| _t 	|¡| _
tj|||d |ddd}tj||d| _t t ||¡t ¡ t ||¡¡| _d S )Nr‹   rf   rd   Trg   rh   rn   )r
   r   r   r   rp   rx   Ú
input_projrR   rs   rt   rk   ÚTransformerDecoderLayerÚTransformerDecoderrw   r   r   Úoutput_head)r   r   ri   ro   r   rk   Údecoder_layerr   r   r   r   Í   s    

þ


ýzTokenPredictor.__init__Nc                 C   sf   |   |¡}|  |¡}|  |¡}|jd }tjtj|||jtjddd}| j	|||||d}|  
|¡S )Nr(   )r   rT   )Údiagonal)Útgt_maskÚtgt_key_padding_maskÚmemory_key_padding_mask)r˜   rs   rk   r   r   Útriur   r   Úboolrw   r›   )r   Ú	audio_embÚtext_embÚ
audio_maskr€   ra   r@   Úcausal_maskr   r   r   rK   ã   s   



 ü
zTokenPredictor.forwardc                 C   s&   d g| d g| d g| d g| dœS )N)Úself_kÚself_vÚcross_kÚcross_vr   )r   ro   r   r   r   Ú
init_cacheô   s
   üzTokenPredictor.init_cachec              
   C   s  |   |¡}|| jjdd…||d …f  }t| jjƒD ]k\}}|  |j|||d | |d | ¡\}	|d |< |d |< | || 	|	¡ ¡}|  
|j|||d | |d | |¡\}
|d |< |d |< | || |
¡ ¡}| | | | |¡¡¡¡}| || |¡ ¡}q|  |¡|fS )zLOne-step cached inference. Returns logits [B, 1, n_codes] and updated cache.Nr(   r§   r¨   r©   rª   )r˜   rs   rU   Ú	enumeraterw   ÚlayersÚ_cached_mhaÚ	self_attnÚnorm1Údropout1Ú_cached_cross_attnÚmultihead_attnÚnorm2Údropout2Úlinear2rk   rm   Úlinear1Únorm3Údropout3r›   )r   Únew_embr¤   Ústep_idxÚcacher€   ra   ÚiÚlayerÚsa_outÚca_outÚff_outr   r   r   Úinference_stepü   s   
 ÿþzTokenPredictor.inference_stepc                 C   s~  |j |j}}|| }|jd }	|jjddd\}
}}|jjddd\}}}t ||
|¡}t |||¡}t |||¡}|d urO|jd dkrOtj	||gddn|}|d ure|jd dkretj	||gddn|}| 
|	d||¡ dd¡}| 
|	d||¡ dd¡}| 
|	d||¡ dd¡}t || dd¡¡|d  }t tj|dd|¡}| dd¡ ¡  
|	d|¡}t ||jj|jj¡||fS )	Nr   r…   r+   r(   r'   r&   éþÿÿÿç      à?)Ú	embed_dimÚ	num_headsr   Úin_proj_weightÚchunkÚin_proj_biasr5   Úlinearr   ÚcatÚviewr’   ÚmatmulÚsoftmaxÚ
contiguousÚout_projÚweightÚbias)r   ÚmhaÚqÚkvÚcache_kÚcache_vÚdri   Úhead_dimr?   ÚWqÚWkÚWvÚbqÚbkÚbvÚq_projÚk_newÚv_newÚkÚvÚk_mhÚv_mhÚattnÚoutr   r   r   r®     s"   
,,zTokenPredictor._cached_mhac                 C   s^  |j |j}}|| }	|jd }
|jjddd\}}}|jjddd\}}}t |||¡}|d u r@t |||¡}t |||¡}n||}}| |
d||	¡ 	dd¡}| |
d||	¡ 	dd¡}| |
d||	¡ 	dd¡}t
 || 	dd¡¡|	d  }|d ur‰| | d¡ d¡td	ƒ¡}t
 tj|dd|¡}| 	dd¡ ¡  |
d|¡}t ||jj|jj¡||fS )
Nr   r…   r+   r(   r'   r&   rÃ   rÄ   ú-inf)rÅ   rÆ   r   rÇ   rÈ   rÉ   r5   rÊ   rÌ   r’   r   rÍ   Úmasked_fillr:   r7   rÎ   rÏ   rÐ   rÑ   rÒ   )r   rÓ   rÔ   ÚmemoryrÖ   r×   Úmemory_maskrØ   ri   rÙ   r?   rÚ   rÛ   rÜ   rÝ   rÞ   rß   rà   rã   rä   rå   ræ   rç   rè   r   r   r   r²   ,  s&   

z!TokenPredictor._cached_cross_attn)r   rd   r—   r   re   ©NNr   )rL   rM   rN   rO   r   rK   r«   rÂ   r®   r²   rP   r   r   r   r   r–   Æ   s    

r–   c                       rQ   )ÚResConvBlockr…   c              
      sF   t ƒ  ¡  t tj||||d dt ¡ tj||||d d¡| _d S )Nr'   )rˆ   )r
   r   rp   r   rŽ   r   Únet)r   Úchannelsr‡   r   r   r   r   I  s   

ýzResConvBlock.__init__c                 C   s   ||   |¡ S r   )rï   r`   r   r   r   rK   Q  s   zResConvBlock.forward)r…   rb   r   r   r   r   rî   H  s    rî   c                       s&   e Zd Zd‡ fdd„	Zdd„ Z‡  ZS )	Ú
MelDecoderr   rƒ   rd   c                    sî   t ƒ  ¡  |d }t t ||¡t ¡ ¡| _|dkr1t tj||ddddt ¡ t|dd¡| _	n't tj||ddddt ¡ t|ddtj||ddddt ¡ t|dd¡| _	t tj
||dddt ¡ t|d	dtj
||ddd¡| _d S )
Nr'   rd   r(   r‰   r…   )r‡   r„   r†   é   )r
   r   rp   r   rx   r   ry   ÚConvTranspose1drî   ÚupsamplerŽ   rè   )r   r   r‘   Úupsample_factorÚhiddenr   r   r   r   V  s&   

þ

ü

ýzMelDecoder.__init__c                 C   s&   |   |¡ dd¡}|  |¡}|  |¡S )Nr(   r'   )ry   r’   rô   rè   )r   r>   ra   r   r   r   rK   o  s   

zMelDecoder.forward)r   rƒ   rd   rb   r   r   r   r   rñ   U  s    rñ   c                       s.   e Zd Zd‡ fdd„	Zdd„ Zdd„ Z‡  ZS )	ÚMultiResolutionSpectralLoss©)é@   é   rù   )é€   é    rû   )r   rù   r   c                    s   t ƒ  ¡  || _d S r   )r
   r   Úresolutions)r   rý   r   r   r   r   x  s   

z$MultiResolutionSpectralLoss.__init__c                 C   sÀ   t j||jd}|j\}}}	t j| || |	¡||||dd}
t j| || |	¡||||dd}t j| ¡ |
 ¡  ddt j| ¡ ddd  }t 	t  
|
 ¡ d ¡t  
| ¡ d ¡¡}|| S )Nr   T)ÚwindowÚreturn_complexÚfro)Úpç:Œ0âŽyE>gH¯¼šò×z>)r   Úhann_windowr   r   Ústftr.   ÚnormÚabsr5   Úl1_lossrZ   )r   ÚpredÚtargetÚn_fftÚ
hop_lengthÚ
win_lengthrþ   r?   ÚMr@   Ú	pred_stftÚtarget_stftÚscÚlog_magr   r   r   Ú
_stft_loss|  s   ""0(z&MultiResolutionSpectralLoss._stft_lossc              	   C   s`   t jd|jd}d}| jD ]\}}}|jd |kr(||  |||||¡ }|d7 }q|t|dƒ S )Nç        r   r   r'   r(   )r   Útensorr   rý   r   r  Úmax)r   r  r	  ÚlossÚcountr
  ÚhopÚwinr   r   r   rK   …  s   €z#MultiResolutionSpectralLoss.forward)rø   )rL   rM   rN   r   r  rK   rP   r   r   r   r   r÷   w  s    	r÷   c                       sn   e Zd ZdZ‡ fdd„Zddd„Zdd„ Ze ¡ 	
	ddd„ƒZ	e ¡ dd„ ƒZ
e ¡ 	
	ddd„ƒZ‡  ZS )Ú	LeWMTTSvqun  
    JEPA + VQ TTS model.

    Training flow:
      mel â†’ AudioEncoder â†’ continuous z â†’ VQ â†’ discrete indices + quantized z_q
      [start, z_q[:-1]] + text â†’ TokenPredictor â†’ logits â†’ cross_entropy(logits, indices)
      z_q â†’ MelDecoder â†’ mel_recon â†’ L1 + spectral loss

    Inference flow:
      text â†’ TextEncoder â†’ text_emb
      start_emb â†’ Predictor â†’ logits â†’ argmax â†’ codebook lookup â†’ next input â†’ repeat
      all codebook embeddings â†’ MelDecoder â†’ mel â†’ Vocos â†’ waveform

    No continuous drift possible â€” every predicted embedding is a valid codebook entry.
    c                    sR  t ƒ  ¡  | dd¡}| dd¡}| dd¡}| dd¡}| dd¡}| d	d¡}| d
d¡}| dd¡}	| dd¡}
| dd¡}t|||||	d| _t||||||	d| _t||
| dd¡d| _t	||||
|	d| _
t|||d| _t t dd|¡d ¡| _| dd¡| _| dd¡| _| dd¡| _tƒ | _| dd¡| _|| _|| _d S ) Nr   r   ri   rd   r‘   rƒ   Útext_vocab_sizeÚtext_encoder_layersÚaudio_encoder_layersÚpredictor_layersr—   rk   re   r   r   rŒ   )rz   r   ri   ro   rk   )r‘   r   ri   ro   rŒ   rk   r   r   )r   r   r   )r   ri   ro   r   rk   )r   r‘   rõ   r(   g{®Gáz”?Úpred_weightr-   Úrecon_weightÚspectral_weightrÄ   Úlabel_smoothing)r
   r   Úgetrc   Útext_encoderr‚   Úaudio_encoderr   Úvqr–   Ú	predictorrñ   Úmel_decoderrp   Ú	Parameterr   r   Ú	start_embr  r   r!  r÷   Úspectral_loss_fnr"  ÚconfigrŒ   )r   r,  r   ri   r‘   Ú
text_vocabÚtext_layersÚaudio_layersr  rk   r   rŒ   r   r   r   r   ¢  sN   
þý
þþÿ
zLeWMTTSvq.__init__Nc           !      K   s  |j d }|  ||¡}|  ||¡}|  |¡\}	}
}|	j d }| j |dd¡}tj||	d d …d d…f gdd}|d urn|d d …d d | j…f d d …d |…f }tj	|dtj
|jd}tj||d d …d d…f gdd}|}nd }d }|  ||||¡}|d ur£| }|| }|
| }|j d dkrštj||| jd}ntjd|jd}ntj| d|j d ¡|
 d¡| jd}|  |	¡}|j d	 }|j d	 }t||ƒ}|d d …d d …d |…f }|d d …d d …d |…f }|d urò|d d …d |…f }|d ur|  d¡ ¡ }tj|| || d
d| ¡ |j d  d  }nt ||¡}|d ur-|  || || ¡}n|  ||¡}t ¡ . |d urR|j d dkrR|jdd|k ¡  ¡ }n|jdd|
k ¡  ¡ }W d   ƒ n	1 siw   Y  | j| | | j|  | j|  } | |||||dœS )Nr   r(   r&   r+   )rT   r   )r"  r  r   r'   r1   )Ú	reductionr  )Ú
total_lossÚ
token_lossrJ   Ú
recon_lossÚspectral_lossÚtoken_accuracy)r   r$  r%  r&  r*  Úexpandr   rË   rŒ   rV   r¢   r   r'  r5   Úcross_entropyr"  r  r.   r(  r   r:   r7   r  r1   r+  r4   ÚargmaxÚmeanr  r   r!  )!r   r“   r   r”   r€   Úkwargsr?   r¤   Úz_continuousrC   r$   rJ   r•   ÚstartÚ	input_embÚds_maskÚ
start_maskÚ	pred_maskÚ	loss_maskÚlogitsÚvalidÚlogits_flatÚtargets_flatr2  Ú	mel_reconÚT_melÚT_reconÚT_minÚ	valid_melr3  r4  Ú	token_accr1  r   r   r   rK   Õ  s†   

"("ÿþ
	



ÿþ

€üÿþýúzLeWMTTSvq.forwardc                 C   s   |   |¡}|  |¡\}}}|S )u&   Encode mel â†’ discrete token indices.)r%  r&  )r   r“   r>   Ú_r$   r   r   r   Úencode_to_tokens6  s   
zLeWMTTSvq.encode_to_tokensé,  r-   é2   c                 C   sâ   |   ||¡}| j t| jjjƒ¡}| j}g }	t|ƒD ]O}
| j |||
||¡\}}| 	d¡}|t
|dƒ }|dkrQ|j|dd\}}tdƒ|||dd…dd…f k < tj|dd}t |d¡}|	 |¡ | jj| }qtj|	ddS )u,   AR token generation: text â†’ token indices.r(   r  r   r&   r+   ré   N)r$  r'  r«   Úlenrw   r­   r*  ÚrangerÂ   Úsqueezer  Útopkr7   r5   rÎ   r   ÚmultinomialÚappendr&  r   rË   )r   r   Ú	max_stepsÚtemperatureÚtop_kr€   r¤   r¼   ÚembÚall_indicesÚsteprB  Ú	topk_valsrL  ÚprobsÚidxr   r   r   Úsynthesize_tokens<  s$   
ÿ
 
zLeWMTTSvq.synthesize_tokensc                 C   s   | j j| }|  |¡S )u*   Convert token indices â†’ mel spectrogram.)r&  r   r(  )r   r$   rC   r   r   r   Útokens_to_mel\  s   
zLeWMTTSvq.tokens_to_melc                 C   s    |   |||||¡}|  |¡|fS )u'   Full pipeline: text â†’ tokens â†’ mel.)r_  r`  )r   r   rV  rW  rX  r€   r$   r   r   r   Úsynthesize_melb  s   
ÿzLeWMTTSvq.synthesize_melrí   )rN  r-   rO  N)rL   rM   rN   rO   r   rK   rM  r   r4   r_  r`  ra  rP   r   r   r   r   r  ‘  s    
3aÿ
ÿr  c                 C   s   t dd„ |  ¡ D ƒƒS )Nc                 s   s    | ]
}|j r| ¡ V  qd S r   )Úrequires_gradÚnumel)Ú.0r  r   r   r   Ú	<genexpr>m  s   € z#count_parameters.<locals>.<genexpr>)r1   Ú
parameters)Úmodelr   r   r   Úcount_parametersl  s   rh  c                 C   sV   | d u rddddddddddddd	ddd
œ} t | ƒ}tdt|ƒd d›dƒ || fS )Nr   rd   rƒ   r—   r   re   r-   rÄ   r   )r   ri   r‘   r  r  r  r  r   rk   r  r   r!  r   r"  rŒ   zLeWM TTS VQ model: g    €„.Az.2fzM parameters)r  Úprintrh  )r,  rg  r   r   r   Úbuild_model_vqp  s(   ñrj  r   )rO   rY   Úcopyr   Útorch.nnrp   Útorch.nn.functionalÚ
functionalr5   ÚModuler   rR   rc   r‚   r–   rî   rñ   r÷   r  rh  rj  r   r   r   r   Ú<module>   s&    _* " \