o
    |ºÊir7  ã                   @   s¢   d Z ddlZddlZddlmZ ddlm  mZ G dd„ dejƒZ	G dd„ dejƒZ
G dd„ dejƒZG d	d
„ d
ejƒZG dd„ dejƒZdd„ Zddd„ZdS )uÒ   
LeWM TTS v7 â€” AR (level 1) + NAR (levels 2-8) on frozen EnCodec tokens.
Multi-speaker ready via speaker embeddings.

Inference: text + speaker_id â†’ AR tokens â†’ NAR tokens â†’ EnCodec decode â†’ waveform
é    Nc                       s&   e Zd Zd‡ fdd„	Zdd„ Z‡  ZS )ÚSinusoidalPEé    c                    sª   t ƒ  ¡  t ||¡}t d|¡ ¡  d¡}t t d|d¡ ¡ t 	d¡ |  ¡}t 
|| ¡|d d …dd d…f< t || ¡|d d …dd d…f< |  d| d¡¡ d S )Nr   é   é   g     ˆÃ@Úpe)ÚsuperÚ__init__ÚtorchÚzerosÚarangeÚfloatÚ	unsqueezeÚexpÚmathÚlogÚsinÚcosÚregister_buffer)ÚselfÚdÚmax_lenr   ÚposÚdiv©Ú	__class__© ú!/home/ubuntu/lewm-tts/model_v7.pyr      s   
(  zSinusoidalPE.__init__c                 C   s    || j d d …d |jd …f  S ©Nr   )r   Úshape)r   Úxr   r   r   Úforward   s    zSinusoidalPE.forward)r   ©Ú__name__Ú
__module__Ú__qualname__r   r    Ú__classcell__r   r   r   r   r      s    	r   c                       s(   e Zd Zd	‡ fdd„	Zd
dd„Z‡  ZS )ÚTextEncoderé   é   çš™™™™™¹?c                    sn   t ƒ  ¡  || _t ||¡| _t|ƒ| _t |¡| _	tj
|||d |ddd}t ||¡| _t ||¡| _d S )Nr(   TÚgelu©Úbatch_firstÚ
activation)r   r   r   ÚnnÚ	EmbeddingÚembedr   r   ÚDropoutÚdropÚTransformerEncoderLayerÚTransformerEncoderÚtfÚLinearÚproj)r   Úvocabr   ÚnheadÚlayersÚdropoutÚlayerr   r   r   r      s   

zTextEncoder.__init__Nc                 C   s:   |   |¡t | j¡ }|  |  |¡¡}|  | j||d¡S )N)Úsrc_key_padding_mask)r0   r   Úsqrtr   r2   r   r7   r5   )r   ÚtokensÚmaskr   r   r   r   r    '   s   zTextEncoder.forward)r'   r'   r(   r(   r)   ©Nr!   r   r   r   r   r&      s    
r&   c                       sH   e Zd ZdZd‡ fdd„	Zdd
d„Zdd„ Zddd„Zddd„Z‡  Z	S )ÚARModelz5Causal transformer predicting level-1 EnCodec tokens.r'   r(   é   é   r)   c                    s”   t ƒ  ¡  || _|| _|| _t |d |¡| _t|ƒ| _	t 
|¡| _tj|||d |ddd}t ||¡| _t t ||¡t ¡ t ||¡¡| _d S ©Nr   r(   Tr*   r+   )r   r   r   Ún_codesÚbos_idr.   r/   Ú	tok_embedr   r   r1   r2   ÚTransformerDecoderLayerÚTransformerDecoderr5   Ú
Sequentialr6   ÚGELUÚhead)r   r   r9   r:   rF   r;   r<   r   r   r   r   2   s   

(zARModel.__init__Nc                 C   s^   |   |  |  |¡¡¡}|jd }tjtj|||jtjddd}| j	|||||d}|  
|¡S )zEtoken_ids: [B, T] with BOS prepended. Returns logits [B, T, n_codes].r   )ÚdeviceÚdtype)Údiagonal)Útgt_maskÚtgt_key_padding_maskÚmemory_key_padding_mask)r2   r   rH   r   r	   ÚtriuÚonesrN   Úboolr5   rM   )r   Ú	token_idsÚtext_embÚtok_maskÚ	text_maskr   ÚTÚcausalr   r   r   r    @   s   
 ÿ
zARModel.forwardc                 C   s2   t | jjƒ}d g| d g| d g| d g| dœS )N)ÚskÚsvÚckÚcv)Úlenr5   r:   )r   Únr   r   r   Ú
init_cacheK   s   &zARModel.init_cachec              
   C   s  |   |¡| jjdd…||d …f  }t| jjƒD ]k\}}|  |j|||d | |d | ¡\}	|d |< |d |< | || |	¡ ¡}|  |j	|||d | |d | |¡\}
|d |< |d |< | 
|| |
¡ ¡}| | | | |¡¡¡¡}| || |¡ ¡}q|  |¡ d¡|fS )zESingle token step. tok_id: [B,1]. Returns logits [B, n_codes], cache.Nr   r]   r^   r_   r`   )rH   r   Ú	enumerater5   r:   Ú_attnÚ	self_attnÚnorm1Údropout1Úmultihead_attnÚnorm2Údropout2Úlinear2r;   r-   Úlinear1Únorm3Údropout3rM   Úsqueeze)r   Útok_idrX   Ústep_idxÚcacherZ   r   Úir<   ÚsaÚcaÚffr   r   r   ÚstepO   s   &ÿÿzARModel.stepc                 C   sº  |j |j|j |j }}}	|jd }
|j dd¡\}}}|j dd¡\}}}t |||¡}|d u rBt |||¡t |||¡}}n4t |||¡t |||¡}}|jd dkrq|jd dkrqt 	||gd¡t 	||gd¡}}n||}}| 
|
d||	¡ dd¡}| 
|
d||	¡ dd¡}| 
|
d||	¡ dd¡}t || dd¡¡|	d  }|d urº| | d¡ d¡tdƒ¡}t t |d¡|¡ dd¡ ¡  
|
d|¡}t ||jj|jj¡||fS )	Nr   é   r   éÿÿÿÿr   éþÿÿÿg      à?ú-inf)Ú	embed_dimÚ	num_headsr   Úin_proj_weightÚchunkÚin_proj_biasÚFÚlinearr	   ÚcatÚviewÚ	transposeÚmatmulÚmasked_fillr   r   ÚsoftmaxÚ
contiguousÚout_projÚweightÚbias)r   ÚmhaÚqÚkvr_   r`   r@   r   ÚnhÚhdÚBÚWqÚWkÚWvÚbqÚbkÚbvÚqpÚkÚvÚknÚvnÚkmÚvmÚaÚor   r   r   re   ]   s&   
 $
*zARModel._attn)r'   r(   rC   rD   r)   ©NNrA   )
r"   r#   r$   Ú__doc__r   r    rc   rx   re   r%   r   r   r   r   rB   /   s    

rB   c                       s>   e Zd ZdZd‡ fdd	„	Zddd„Ze ¡ ddd„ƒZ‡  Z	S )ÚNARModelzWBidirectional transformer: given level-1 tokens + text, predict levels 2-8 in parallel.r'   r(   é   rD   rC   r)   c                    sœ   t ƒ  ¡  || _|d | _t ||¡| _t|ƒ| _t 	|¡| _
t ||¡| _tj|||d |ddd}t ||¡| _t t ||¡t ¡ t ||¡¡| _d S rE   )r   r   rF   Ú	n_predictr.   r/   rH   r   r   r1   r2   Úlevel_embedrI   rJ   r5   rK   r6   rL   rM   )r   r   r9   r:   rF   Ún_rvqr;   r<   r   r   r   r   z   s   


(zNARModel.__init__Nc                 C   sV   |   |¡}|| jj|d   d¡ d¡ }|  |  |¡¡}| j||||d}|  |¡S )uÜ   
        level1_tokens: [B, T] â€” level 1 codebook indices
        text_emb: [B, T_text, d]
        target_level: int 1-7 (which level to predict, 0-indexed from level 2)
        Returns: logits [B, T, n_codes]
        r   r   ©rR   rS   )rH   r¨   rŒ   r   r2   r   r5   rM   )r   Úlevel1_tokensrX   Útarget_levelrZ   rY   r   r   r   r   r    Œ   s   
 
ÿ
zNARModel.forwardc                 C   sÂ   |j \}}tj|d|tj|jd}||dd…df< |  |¡}tdƒD ]<}	|| jj|	d   	d¡ 	d¡ }
|  
|
¡}
| j|
|||d}
|  |
¡}|jdd	}||dd…|	d f< ||  |¡ }q"|S )
z<Predict all 7 levels in 7 forward passes. Returns [B, 8, T].rC   ©rO   rN   Nr   é   r   rª   rz   ©Údim)r   r	   r
   ÚlongrN   rH   Úranger¨   rŒ   r   r   r5   rM   Úargmax)r   r«   rX   rZ   rY   r“   r[   Ú	all_codesÚaccumÚlvlr   Úlogitsr?   r   r   r   Úgenerate_all_levelsš   s   

 

ÿ
zNARModel.generate_all_levels)r'   r(   r¦   rD   rC   r)   r£   )
r"   r#   r$   r¤   r   r    r	   Úno_gradr¸   r%   r   r   r   r   r¥   w   s    
r¥   c                       sH   e Zd Z‡ fdd„Zdd„ Z		ddd„Ze ¡ 	
	ddd„ƒZ‡  Z	S )Ú	LeWMTTSv7c                    sø   t ƒ  ¡  | dd¡}| dd¡}| dd¡}| dd¡}| d	d
¡}| dd¡}t| dd¡||| dd¡|d| _t||| dd¡||d| _t||| dd¡|||d| _|| _	|d
krgt
 ||¡| _nd | _|| _|| _| dd¡| _|| _d S )NÚd_modelr'   r9   r(   rF   rD   r©   rC   Ú
n_speakersr   r;   r)   Útext_vocab_sizeÚtext_encoder_layers)r8   r   r9   r:   r;   Ú	ar_layers)r   r9   r:   rF   r;   Ú
nar_layersr¦   )r   r9   r:   rF   r©   r;   Úlabel_smoothing)r   r   Úgetr&   Útext_encoderrB   Úarr¥   Únarr¼   r.   r/   Úspeaker_embedrF   r©   rÁ   Úconfig)r   rÇ   r   r9   rF   r©   r¼   r;   r   r   r   r   µ   s2   
þÿÿ
zLeWMTTSv7.__init__c                 C   s.   | j d ur|d ur|   |¡ d¡}|| }|S r   )rÆ   r   )r   rX   Ú
speaker_idÚspkr   r   r   Ú_add_speakerÓ   s   zLeWMTTSv7._add_speakerNc                 C   sV  |j \}}}|dd…df }	|  ||¡}
|  |
|¡}
tj|df| jjtj|	jd}tj	||	dd…dd…f gdd}|dur[tj
|dtj|	jd}tj	||dd…dd…f gdd}nd}|  ||
||¡}|durz| }tj|| |	| | jd}ntj| d| j¡|	 d¡| jd}t d|d d¡ ¡ }|dd…|d f }|	}|  ||
|||¡}|durÀ| }tj|| || | jd}ntj| d| j¡| d¡| jd}t ¡ A |durù||  d¡|	| k ¡  ¡ }||  d¡|| k ¡  ¡ }n| d¡|	k ¡  ¡ }| d¡|k ¡  ¡ }W d  ƒ n	1 sw   Y  || }|||||d	œS )
ua   
        all_tokens: [B, n_rvq, T] â€” all 8 RVQ levels
        text_tokens: [B, T_text]
        Nr   r   r­   rz   r¯   )rÁ   )r   )Ú
total_lossÚar_lossÚnar_lossÚar_accÚnar_acc)r   rÃ   rÊ   r	   ÚfullrÄ   rG   r±   rN   r„   r
   rV   r‚   Úcross_entropyrÁ   ÚreshaperF   ÚrandintÚitemrÅ   r¹   r³   r   Úmean)r   Ú
all_tokensÚtext_tokensÚ
token_maskrZ   rÈ   r“   r©   r[   Úlevel1rX   ÚbosÚar_inputÚbos_maskÚar_maskÚ	ar_logitsÚvalidrÌ   r¶   ÚtargetÚnar_input_tokensÚ
nar_logitsrÍ   rÎ   rÏ   Útotalr   r   r   r    Ù   sV   "$ÿÿÿÿ
 €úþzLeWMTTSv7.forwardéî  çš™™™™™é?é2   c                 C   sü   |   ||¡}|  ||¡}|jd }| j ¡ }	tj|df| jjtj|j	d}
g }t
|ƒD ]A}| j |
|||	|¡\}}	|t|dƒ }|dkr]|j|dd\}}tdƒ|||dd…dd…f k < t t |d¡d¡}
| |
¡ q+tj|dd}| j |||¡}|S )	u'   Full pipeline: text â†’ 8-level tokens.r   r   r­   g:Œ0âŽyE>rz   r¯   r|   N)rÃ   rÊ   r   rÄ   rc   r	   rÐ   rG   r±   rN   r²   rx   ÚmaxÚtopkr   Úmultinomialr‚   r‰   Úappendr„   rÅ   r¸   )r   r×   Ú	max_stepsÚtemperatureÚtop_krZ   rÈ   rX   r“   rs   ÚtokrÙ   rx   r·   Útopk_vÚ_r´   r   r   r   Úgenerate  s"   

 zLeWMTTSv7.generate)NNN)rä   rå   ræ   NN)
r"   r#   r$   r   rÊ   r    r	   r¹   rñ   r%   r   r   r   r   rº   ´   s    
ÿ?ÿrº   c                 C   s   t dd„ |  ¡ D ƒƒS )Nc                 s   s    | ]
}|j r| ¡ V  qd S rA   )Úrequires_gradÚnumel)Ú.0Úpr   r   r   Ú	<genexpr>6  s   € z#count_parameters.<locals>.<genexpr>)ÚsumÚ
parameters)Úmodelr   r   r   Úcount_parameters5  s   rú   c                 C   sN   | d u rddddddddddddœ} t | ƒ}td	t|ƒd
 d›dƒ || fS )Nr'   r(   rD   rC   r¦   r)   r   )r»   r9   rF   r©   r½   r¾   r¿   rÀ   r;   rÁ   r¼   zLeWM TTS v7: g    €„.Az.2fzM params)rº   Úprintrú   )rÇ   rù   r   r   r   Úbuild_model_v79  s   ûrü   rA   )r¤   r   r	   Útorch.nnr.   Útorch.nn.functionalÚ
functionalr‚   ÚModuler   r&   rB   r¥   rº   rú   rü   r   r   r   r   Ú<module>   s    H= 