o
    Š«Êi^,  ã                   @   s   d Z ddlZddlZddlmZ ddlm  mZ G dd„ dejƒZ	G dd„ dejƒZ
G dd„ dejƒZG d	d
„ d
ejƒZdd„ Zddd„ZdS )u£  
LeWM TTS v6 â€” Predict pre-trained EnCodec tokens.

The key insight from failed experiments:
- Continuous AR prediction drifts (noise)
- VQ learned from scratch + prediction = too many moving targets
- Solution: use a FROZEN pre-trained codec (EnCodec). The codebook is fixed and good.
  The model only needs to learn: text â†’ token sequence. Standard next-token prediction.

Architecture:
  - TextEncoder: byte-level transformer
  - TokenPredictor: causal transformer, cross-attends to text, outputs logits over 1024 EnCodec codes
  - EnCodec decoder (frozen): tokens â†’ waveform (no mel decoder needed!)

This is essentially a small language model that speaks EnCodec.
é    Nc                       s&   e Zd Zd‡ fdd„	Zdd„ Z‡  ZS )ÚSinusoidalPositionalEncodingé    c                    s¬   t ƒ  ¡  t ||¡}tjd|tjd d¡}t t d|d¡ ¡ t 	d¡ |  ¡}t 
|| ¡|d d …dd d…f< t || ¡|d d …dd d…f< |  d| d¡¡ d S )Nr   )Údtypeé   é   g     ˆÃ@Úpe)ÚsuperÚ__init__ÚtorchÚzerosÚarangeÚfloatÚ	unsqueezeÚexpÚmathÚlogÚsinÚcosÚregister_buffer)ÚselfÚd_modelÚmax_lenr   ÚpositionÚdiv_term©Ú	__class__© ú!/home/ubuntu/lewm-tts/model_v6.pyr	      s   
(  z%SinusoidalPositionalEncoding.__init__c                 C   s    || j d d …d |jd …f  S )Nr   )r   Úshape)r   Úxr   r   r   Úforward"   s    z$SinusoidalPositionalEncoding.forward)r   ©Ú__name__Ú
__module__Ú__qualname__r	   r    Ú__classcell__r   r   r   r   r      s    	r   c                       s(   e Zd Zd	‡ fdd„	Zd
dd„Z‡  ZS )ÚTextEncoderé   é   çš™™™™™¹?c                    st   t ƒ  ¡  || _t ||¡| _t|dd| _t |¡| _	tj
|||d |ddd}tj||d| _t ||¡| _d S )Ni   ©r   r(   TÚgelu©r   ÚnheadÚdim_feedforwardÚdropoutÚbatch_firstÚ
activation©Ú
num_layers)r   r	   r   ÚnnÚ	EmbeddingÚ
char_embedr   Ú	pos_embedÚDropoutr/   ÚTransformerEncoderLayerÚTransformerEncoderÚtransformerÚLinearÚproj)r   Ú
vocab_sizer   r-   r3   r/   Úlayerr   r   r   r	   '   s   

þzTextEncoder.__init__Nc                 C   sB   |   |¡t | j¡ }|  |¡}|  |¡}| j||d}|  |¡S )N)Úsrc_key_padding_mask)r6   r   Úsqrtr   r7   r/   r;   r=   )r   ÚtokensÚmaskr   r   r   r   r    4   s
   


zTextEncoder.forward)r'   r'   r(   r(   r)   ©Nr!   r   r   r   r   r&   &   s    r&   c                       sH   e Zd ZdZd‡ fdd„	Zdd
d„Zdd„ Zddd„Zddd„Z‡  Z	S )ÚTokenPredictorzMCausal transformer: predicts next EnCodec token given previous tokens + text.r'   r(   é   é   r)   c                    sš   t ƒ  ¡  || _|| _t |d |¡| _|| _t|dd| _	t 
|¡| _tj|||d |ddd}tj||d| _t t ||¡t ¡ t ||¡¡| _d S )	Nr   i   r*   r(   Tr+   r,   r2   )r   r	   r   Ún_codesr4   r5   Útoken_embedÚbos_idr   r7   r8   r/   ÚTransformerDecoderLayerÚTransformerDecoderr;   Ú
Sequentialr<   ÚGELUÚoutput_head)r   r   r-   r3   rH   r/   r?   r   r   r   r	   ?   s"   

þ


ýzTokenPredictor.__init__Nc                 C   sf   |   |¡}|  |¡}|  |¡}|jd }tjtj|||jtjddd}| j	|||||d}|  
|¡S )u¬   
        token_ids: [B, T] â€” EnCodec token IDs (with BOS prepended)
        text_emb: [B, T_text, d] â€” from TextEncoder
        Returns: logits [B, T, n_codes]
        r   )Údevicer   )Údiagonal)Útgt_maskÚtgt_key_padding_maskÚmemory_key_padding_mask)rI   r7   r/   r   r
   ÚtriuÚonesrP   Úboolr;   rO   )r   Ú	token_idsÚtext_embÚ
token_maskÚ	text_maskr   ÚTÚcausalr   r   r   r    X   s   



 ü
zTokenPredictor.forwardc                 C   s2   t | jjƒ}d g| d g| d g| d g| dœS )N)Úself_kÚself_vÚcross_kÚcross_v)Úlenr;   Úlayers)r   Únr   r   r   Ú
init_cacheo   s   þzTokenPredictor.init_cachec              
   C   s&  |   |¡}|| jjdd…||d …f  }t| jjƒD ]k\}}|  |j|||d | |d | ¡\}	|d |< |d |< | || 	|	¡ ¡}|  |j
|||d | |d | |¡\}
|d |< |d |< | || |
¡ ¡}| | | | |¡¡¡¡}| || |¡ ¡}q|  |¡ d¡}||fS )ul   
        token_id: [B, 1] long â€” single token
        Returns: logits [B, n_codes], updated cache
        Nr   r^   r_   r`   ra   )rI   r7   r   Ú	enumerater;   rc   Ú_cached_attnÚ	self_attnÚnorm1Údropout1Úmultihead_attnÚnorm2Údropout2Úlinear2r/   r1   Úlinear1Únorm3Údropout3rO   Úsqueeze)r   Útoken_idrY   ÚstepÚcacher[   r   Úir?   Úsa_outÚca_outÚffÚlogitsr   r   r   Úinference_stepv   s    
 ÿþzTokenPredictor.inference_stepc                 C   s¼  |j |j}}|| }	|jd }
|jjddd\}}}|jjddd\}}}t |||¡}|d u r@t |||¡}t |||¡}n4t |||¡}t |||¡}|jd dkro|jd dkrotj	||gdd}tj	||gdd}n||}}| 
|
d||	¡ dd¡}| 
|
d||	¡ dd¡}| 
|
d||	¡ dd¡}t || dd¡¡|	d  }|d ur¸| | d¡ d¡td	ƒ¡}t tj|dd|¡}| dd¡ ¡  
|
d|¡}t ||jj|jj¡||fS )
Nr   é   ©Údimr   éÿÿÿÿr   éþÿÿÿg      à?ú-inf)Ú	embed_dimÚ	num_headsr   Úin_proj_weightÚchunkÚin_proj_biasÚFÚlinearr
   ÚcatÚviewÚ	transposeÚmatmulÚmasked_fillr   r   ÚsoftmaxÚ
contiguousÚout_projÚweightÚbias)r   ÚmhaÚqÚkvÚckÚcvrC   ÚdÚnhÚhdÚBÚWqÚWkÚWvÚbqÚbkÚbvÚqpÚkÚvÚknÚvnÚkmÚvmÚaÚor   r   r   rg      s0   

zTokenPredictor._cached_attn)r'   r(   rF   rG   r)   ©NNrD   )
r"   r#   r$   Ú__doc__r	   r    re   r{   rg   r%   r   r   r   r   rE   <   s    

rE   c                       s@   e Zd ZdZ‡ fdd„Zddd„Ze ¡ 		dd
d„ƒZ‡  Z	S )Ú	LeWMTTSv6uÀ   
    Text â†’ EnCodec tokens. Simple next-token prediction.
    No encoder, no VQ, no mel decoder. Just predict the right token.
    EnCodec decoder handles tokenâ†’waveform at inference.
    c              	      s¢   t ƒ  ¡  | dd¡}| dd¡}| dd¡}t| dd¡||| dd¡| d	d
¡d| _t||| dd¡|| d	d
¡d| _|| _|| _| dd
¡| _	|| _
d S )Nr   r'   r-   r(   rH   rG   Útext_vocab_sizeÚtext_encoder_layersr/   r)   )r>   r   r-   r3   r/   Úpredictor_layersrF   )r   r-   r3   rH   r/   Úlabel_smoothing)r   r	   Úgetr&   Útext_encoderrE   Ú	predictorrH   rJ   r±   Úconfig)r   rµ   r   r-   rH   r   r   r   r	   º   s(   



ü

ü
zLeWMTTSv6.__init__Nc                 K   s@  |j \}}|  ||¡}tj|df| jtj|jd}	tj|	|dd…dd…f gdd}
|durKtj|dtj	|jd}tj||dd…dd…f gdd}nd}|  
|
|||¡}|dure| }|| }|| }n| d| j¡}| d¡}tj||| jd}t ¡  |jdd|k ¡  ¡ }W d  ƒ n1 s•w   Y  |||dœS )u   
        token_ids: [B, T] â€” EnCodec token indices (ground truth)
        text_tokens: [B, T_text] â€” byte-level text
        r   ©r   rP   Nr   r}   )r±   )Ú
total_lossÚ
token_lossÚtoken_accuracy)r   r³   r
   ÚfullrJ   ÚlongrP   r‰   r   rW   r´   ÚreshaperH   r‡   Úcross_entropyr±   Úno_gradÚargmaxr   Úmean)r   rX   Útext_tokensrZ   r[   Úkwargsr›   r\   rY   ÚbosÚ	input_idsÚbos_maskÚ
input_maskrz   ÚvalidÚlogits_flatÚtargets_flatr¸   Ú	token_accr   r   r   r    Ò   s2   
"$

ÿ
ÿýzLeWMTTSv6.forwardéî  çš™™™™™é?é2   c                 C   sà   |   ||¡}| j ¡ }|jd }tj|df| jtj|jd}	g }
t	|ƒD ]D}| j 
|	||||¡\}}|t|dƒ }|dkrV|j|dd\}}tdƒ|||dd…dd…f k < tj|dd}t |d¡}	|
 |	¡ q$tj|
ddS )	u/   AR generation: text â†’ EnCodec token sequence.r   r   r¶   g:Œ0âŽyE>r   r}   r   N)r³   r´   re   r   r
   rº   rJ   r»   rP   Úranger{   ÚmaxÚtopkr   r‡   rŽ   ÚmultinomialÚappendr‰   )r   rÁ   Ú	max_stepsÚtemperatureÚtop_kr[   rY   ru   r›   ÚtokenÚ
all_tokensrt   rz   Ú	topk_valsÚ_Úprobsr   r   r   Úgenerate   s   

 zLeWMTTSv6.generater«   )rË   rÌ   rÍ   N)
r"   r#   r$   r¬   r	   r    r
   r¾   rÛ   r%   r   r   r   r   r­   ³   s    
.ÿr­   c                 C   s   t dd„ |  ¡ D ƒƒS )Nc                 s   s    | ]
}|j r| ¡ V  qd S rD   )Úrequires_gradÚnumel)Ú.0Úpr   r   r   Ú	<genexpr>  s   € z#count_parameters.<locals>.<genexpr>)ÚsumÚ
parameters)Úmodelr   r   r   Úcount_parameters  s   rä   c              	   C   sH   | d u rdddddddddœ} t | ƒ}tdt|ƒd d	›d
ƒ || fS )Nr'   r(   rG   rF   r)   )r   r-   rH   r®   r¯   r°   r/   r±   zLeWM TTS v6: g    €„.Az.2fzM parameters)r­   Úprinträ   )rµ   rã   r   r   r   Úbuild_model_v6  s   ø
ræ   rD   )r¬   r   r
   Útorch.nnr4   Útorch.nn.functionalÚ
functionalr‡   ÚModuler   r&   rE   r­   rä   ræ   r   r   r   r   Ú<module>   s    wh