o
    ’ÜÂiK  ã                   @   s¼   d Z ddlZddlZddlZddlmZ ddlm  mZ G dd„ dej	ƒZ
ddd„Zdd„ ZG d	d
„ d
ej	ƒZG dd„ dej	ƒZG dd„ dej	ƒZG dd„ dej	ƒZdd„ Zddd„ZdS )u€  
LeWM TTS v5 â€” JEPA with EnCodec backend.

Architecture:
  - EnCodec encoder/decoder (frozen): audio â†” 128d embeddings @ 75Hz
  - Simple linear projections: 128d â†” d_model (no lossy compression)
  - TextEncoder: byte-level transformer for Hindi text
  - DurationPredictor: predicts per-character duration
  - JEPAPredictor: causal transformer with frame-level text conditioning
é    Nc                       s,   e Zd ZdZd‡ fdd„	Zdd	d
„Z‡  ZS )ÚDurationPredictorz<Predicts log-duration per text token. Similar to FastSpeech.é   é   é   çš™™™™™¹?c              	      sn   t ƒ  ¡  g }t|ƒD ]}| tj||||d dt ¡ t |¡t |¡g¡ qt 	|¡| _
t |d¡| _d S )Nr   )Úpaddingé   )ÚsuperÚ__init__ÚrangeÚextendÚnnÚConv1dÚReLUÚ	LayerNormÚDropoutÚ
ModuleListÚconvsÚLinearÚproj)ÚselfÚd_modelÚkernel_sizeÚ
num_layersÚdropoutÚlayersÚ_©Ú	__class__© ú!/home/ubuntu/lewm-tts/model_v5.pyr
      s   
üzDurationPredictor.__init__Nc                 C   s´   |  dd¡}tdt| jƒdƒD ]0}| j| |ƒ}| j|d  |ƒ}|  dd¡}| j|d  |ƒ}| j|d  |ƒ}|  dd¡}q|  dd¡}|  |¡ d¡}|durX| |d¡}|S )	z[
        text_emb: [B, T_text, d_model]
        Returns: log_durations [B, T_text]
        r   r   r   é   r   éÿÿÿÿNç        )Ú	transposer   Úlenr   r   ÚsqueezeÚmasked_fill)r   Útext_embÚ	text_maskÚxÚiÚlog_durr   r   r    Úforward$   s   zDurationPredictor.forward)r   r   r   r   ©N)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r
   r-   Ú__classcell__r   r   r   r    r      s    r   c                 C   s¢   | j \}}}|dur| |d¡}|jdd ¡  ¡ }tj|||| jd}t|ƒD ]&}|| }	|	 ¡  ¡ }
|
dkr9q(tj	| | |	dd}|||d|j d …f< q(|S )zô
    Expand text embeddings by repeating each token according to its duration.
    text_emb: [B, T_text, d_model]
    durations: [B, T_text] integer durations
    Returns: expanded [B, T_audio, d_model], where T_audio = max(sum(durations))
    Nr   r   ©Údim©Údevice)
Úshaper'   ÚsumÚmaxÚitemÚtorchÚzerosr7   r   Úrepeat_interleave)r(   Ú	durationsr)   ÚBÚT_textÚDÚmax_audio_lenÚexpandedÚbÚdurs_bÚtotalÚexp_br   r   r    Úlength_regulate8   s   rI   c           
      C   s˜   | j d }|  ¡  ¡ }tj||tj| jd}t|ƒD ]/}| |  ¡ }||  ¡ }|dkr-q|| }|| }	|||d|…f< ||d|	…f  d7  < q|S )zî
    Compute uniform durations: spread audio frames evenly across text characters.
    text_lengths: [B] number of valid text tokens
    audio_lengths: [B] number of valid audio frames
    Returns: durations [B, max_text_len] integer
    r   ©Údtyper7   Nr   )r8   r:   r;   r<   r=   Úlongr7   r   )
Útext_lengthsÚaudio_lengthsr@   Úmax_text_lenr?   rE   ÚtlÚalÚbaseÚ	remainderr   r   r    Úcompute_uniform_durationsO   s   
rT   c                       sN   e Zd ZdZd‡ fdd„	Zdd	d
„Zddd„Zdd„ Zdd„ Zdd„ Z	‡  Z
S )ÚJEPAPredictorz<Causal predictor with frame-level text added to audio input.r   r!   é   r   c                    s˜   t ƒ  ¡  || _t ||¡| _t ||¡| _t|dd| _t 	|¡| _
tj|||d |ddd}tj||d| _t t ||¡t ¡ t ||¡¡| _d S )Ni   ©Úmax_lenr!   TÚgelu©r   ÚnheadÚdim_feedforwardr   Úbatch_firstÚ
activation©r   )r	   r
   r   r   r   Úaudio_input_projÚtext_input_projÚSinusoidalPositionalEncodingÚ	pos_embedr   r   ÚTransformerEncoderLayerÚTransformerEncoderÚtransformerÚ
SequentialÚGELUÚoutput_proj)r   r   r[   r   r   Úencoder_layerr   r   r    r
   j   s    

þ


ýzJEPAPredictor.__init__Nc                 C   sl   |   |¡|  |¡ }|  |¡}|  |¡}|jd }tjtj|||jtj	ddd}| j
|||d}|  |¡S )uœ   
        audio_emb: [B, T, d_model] â€” shifted audio embeddings
        text_emb_expanded: [B, T, d_model] â€” frame-level text (already expanded)
        r   )r7   rK   )Údiagonal)ÚmaskÚsrc_key_padding_mask)r`   ra   rc   r   r8   r<   ÚtriuÚonesr7   Úboolrf   ri   )r   Ú	audio_embÚtext_emb_expandedÚ
audio_maskr*   ÚTÚcausal_maskÚ	predictedr   r   r    r-   ~   s   


 
zJEPAPredictor.forwardc                 C   s(   |   |¡}|  |¡}|  |¡}|  |¡S )zELegacy cross-attention forward (not used in duration-predictor mode).)r`   rc   r   ri   )r   rq   r(   rs   r)   r*   r   r   r    Úforward_cross_attnŒ   s   



z JEPAPredictor.forward_cross_attnc                 C   sr  |j }|j}|| }|jd }	|jjddd\}
}}|jjddd\}}}t ||
|¡}t |||¡}t |||¡}|d urX|jd dkrXtj	||gdd}tj	||gdd}n||}}| 
|	d||¡ dd¡}| 
|	d||¡ dd¡}| 
|	d||¡ dd¡}t || dd¡¡|d  }tj|dd}t ||¡}| dd¡ ¡  
|	d|¡}t ||jj|jj¡}|||fS )	Nr   r   r4   r   r   r"   éþÿÿÿg      à?)Ú	embed_dimÚ	num_headsr8   Úin_proj_weightÚchunkÚin_proj_biasÚFÚlinearr<   ÚcatÚviewr$   ÚmatmulÚsoftmaxÚ
contiguousÚout_projÚweightÚbias)r   ÚmhaÚq_inputÚkv_inputÚcache_kÚcache_vÚdr[   Úhead_dimr@   ÚWqÚWkÚWvÚbqÚbkÚbvÚqÚk_newÚv_newÚkÚvÚk_mhÚv_mhÚattnÚoutr   r   r    Ú_cached_mha“   s,   


zJEPAPredictor._cached_mhac                 C   s   d g| d g| dœS )N)Úself_kÚself_vr   ©r   r   r   r   r    Ú
init_cache¬   s   þzJEPAPredictor.init_cachec           
   
   C   sÖ   |   |¡|  |¡ }|| jjdd…||d …f  }t| jjƒD ]C\}}|  |j|||d | |d | ¡\}|d |< |d |< | 	|| 
|¡ ¡}| | | | |¡¡¡¡}	| || |	¡ ¡}q |  |¡|fS )už   
        new_emb: [B, 1, d_model] â€” audio embedding for this step
        text_emb_frame: [B, 1, d_model] â€” expanded text embedding for this step
        Nr   rŸ   r    )r`   ra   rc   ÚpeÚ	enumeraterf   r   rž   Ú	self_attnÚnorm1Údropout1Úlinear2r   r^   Úlinear1Únorm2Údropout2ri   )
r   Únew_embÚtext_emb_frameÚstep_idxÚcacher*   r+   ÚlayerÚsa_outÚff_outr   r   r    Úinference_step²   s    ÿzJEPAPredictor.inference_step)r   r!   rV   r   r.   ©NN)r/   r0   r1   r2   r
   r-   rw   rž   r¢   r³   r3   r   r   r   r    rU   h   s    

rU   c                       s&   e Zd Zd‡ fdd„	Zdd„ Z‡  ZS )rb   é    c                    s¬   t ƒ  ¡  t ||¡}tjd|tjd d¡}t t d|d¡ ¡ t 	d¡ |  ¡}t 
|| ¡|d d …dd d…f< t || ¡|d d …dd d…f< |  d| d¡¡ d S )Nr   )rK   r   r   g     ˆÃ@r£   )r	   r
   r<   r=   ÚarangeÚfloatÚ	unsqueezeÚexpÚmathÚlogÚsinÚcosÚregister_buffer)r   r   rX   r£   ÚpositionÚdiv_termr   r   r    r
   Æ   s   
(  z%SinusoidalPositionalEncoding.__init__c                 C   s    || j d d …d |jd …f  S )Nr   )r£   r8   )r   r*   r   r   r    r-   Ï   s    z$SinusoidalPositionalEncoding.forward)rµ   ©r/   r0   r1   r
   r-   r3   r   r   r   r    rb   Å   s    	rb   c                       s(   e Zd Zd	‡ fdd„	Zd
dd„Z‡  ZS )ÚTextEncoderr   r!   r   c                    st   t ƒ  ¡  || _t ||¡| _t|dd| _t |¡| _	tj
|||d |ddd}tj||d| _t ||¡| _d S )Ni   rW   r!   TrY   rZ   r_   )r	   r
   r   r   Ú	EmbeddingÚ
char_embedrb   rc   r   r   rd   re   rf   r   r   )r   Ú
vocab_sizer   r[   r   r   rj   r   r   r    r
   Ö   s   

þzTextEncoder.__init__Nc                 C   sB   |   |¡t | j¡ }|  |¡}|  |¡}| j||d}|  |¡S )N)rm   )rÄ   rº   Úsqrtr   rc   r   rf   r   )r   Útext_tokensr)   r*   r   r   r    r-   ã   s
   


zTextEncoder.forward)r   r   r!   r!   r   r.   rÁ   r   r   r   r    rÂ   Õ   s    rÂ   c                       sT   e Zd ZdZ‡ fdd„Ze ¡ dd„ ƒZddd„Zdd	d
„Z	dd„ Z
dd„ Z‡  ZS )Ú	LeWMTTSv5u›  
    JEPA TTS with EnCodec backend + duration predictor.
    Text is expanded to frame level and added to audio input â€” can't be ignored.

    Training: codec_emb â†’ proj_in â†’ predict(audio + aligned_text) â†’ loss vs targets
              Duration predictor trained with uniform alignment as supervision.
    Inference: text â†’ duration predict â†’ expand text â†’ AR predict â†’ proj_out â†’ decode
    c           
         sr  t ƒ  ¡  | dd¡}| dd¡}| dd¡}| dd¡}| dd¡}| d	d
¡}| dd¡}|| _t|||||d| _t|dd|d| _t 	t 
||¡t ¡ t 
||¡¡| _t 	t 
||¡t ¡ t 
||¡¡| _t||||d| _t | j¡| _| j ¡ D ]}	d|	_q~| dd¡| _t t dd|¡d ¡| _| dd¡| _| dd¡| _| dd¡| _| dd¡| _|| _d S )Nr   r   r[   r!   Ú	codec_dimé€   Útext_vocab_sizeÚtext_encoder_layersÚpredictor_layersrV   r   r   )rÅ   r   r[   r   r   r   r   )r   r   r   r   )r   r[   r   r   FÚ	ema_decayçV-²ïï?r   g{®Gáz”?Úpred_weightç      $@Úroundtrip_weightç      ð?Ú
dur_weightÚinput_noiser#   )r	   r
   ÚgetrÉ   rÂ   Útext_encoderr   Úduration_predictorr   rg   r   rh   Úproj_inÚproj_outrU   Ú	predictorÚcopyÚdeepcopyÚema_proj_inÚ
parametersÚrequires_gradrÎ   Ú	Parameterr<   ÚrandnÚ	start_embrÐ   rÒ   rÔ   rÕ   Úconfig)
r   rä   r   r[   rÉ   Ú
text_vocabÚtext_layersrÍ   r   Úpr   r   r    r
   ÷   sP   
þÿ

ý

ýþ
zLeWMTTSv5.__init__c                 C   sD   t | j ¡ | j ¡ ƒD ]\}}|j | j¡j|jd| j d qd S )Nr   )Úalpha)ÚziprÙ   rß   rÞ   ÚdataÚmul_rÎ   Úadd_)r   Úp_onlineÚp_emar   r   r    Ú
update_ema1  s
   
ÿ"þzLeWMTTSv5.update_emaNc           !      C   s‚  |j d }|j d }| dd¡}|  ||¡}|  ||¡}	|dur(| jdd}
ntj|f|j d |jd}
|durA| jdd}n
tj|f||jd}t|
|ƒ}t 	| 
¡ jdd¡}|durw|  
¡ }tj|	| || dd	| ¡ d
  }nt |	|¡}t|||ƒ}|j d |kr•|dd…d|…f }n"|j d |k r·tj|||j d  |j d |jd}tj||gdd}|  |¡}t ¡  | j ¡  |  |¡}| j ¡  W d  ƒ n1 sÚw   Y  | j |dd¡}tj||dd…dd…f gdd}| jr%| jdkr%t |¡}tj||d |j d |jd| j |dd…dd…f< || }|durItj|dtj|jd}tj||dd…dd…f gdd}|}nd}d}|  |||¡}|durw|  d¡}tj|| || dd	| ¡ |j d  d
  }nt ||¡}|  |¡}|dur¥|  d¡ 
¡ }tj || || dd	| ¡ | j! d
  }nt  ||¡}| j"| | j#|  | j$|  } | |||dœS )uÝ   
        Args:
            codec_emb: [B, 128, T] â€” continuous EnCodec embeddings
            text_tokens: [B, T_text]
            codec_mask: [B, T] bool, True = padding
            text_mask: [B, T_text] bool
        r   r   r   Nr4   r6   ©Úminr9   )Ú	reductiong:Œ0âŽyE>r"   rJ   )Ú
total_lossÚprediction_lossÚroundtrip_lossÚdur_loss)%r8   r$   r×   rØ   r9   r<   Úfullr7   rT   r»   r·   Úclampr~   Úmse_lossrI   r=   r€   rÙ   Úno_gradrÞ   ÚevalÚtrainrã   ÚexpandÚtrainingrÕ   Ú
zeros_likerâ   rp   rÛ   r¸   rÚ   Úl1_lossrÉ   rÐ   rÒ   rÔ   )!r   Ú	codec_embrÇ   Ú
codec_maskr)   r@   rt   Ú	codec_seqr(   Úlog_dur_predrM   rN   Úgt_durationsÚ
log_dur_gtÚ
valid_textrö   rr   ÚpadÚzÚ
target_embÚstartÚ	input_embÚnoiseÚ
start_maskÚ	pred_maskÚ	loss_maskrv   Úvalidrô   Úcodec_reconÚvalid_rtrõ   ró   r   r   r    r-   7  sž   



ÿ
þÿ



ý"
ÿÿ
"
ÿþ

ÿþÿþüzLeWMTTSv5.forwardc                 C   s>   |   ||¡}t |¡ ¡  ¡ jdd}|dur| |d¡}|S )z7Predict durations from text embeddings (for inference).r   rð   Nr   )rØ   r<   r¹   ÚroundrL   rø   r'   )r   r(   r)   r,   r?   r   r   r    Úpredict_durations«  s
   zLeWMTTSv5.predict_durationsc                 C   s   t | jjjƒ}| j |¡S r.   )r%   rÛ   rf   r   r¢   r¡   r   r   r    Úinit_ar_cache³  s   zLeWMTTSv5.init_ar_cachec                 C   s   | j  ||||¡S )us   
        new_emb: [B, 1, d_model]
        text_emb_frame: [B, 1, d_model] â€” expanded text for this frame
        )rÛ   r³   )r   r¬   r­   r®   r¯   r   r   r    Úpredict_next_cached·  s   zLeWMTTSv5.predict_next_cachedr´   r.   )r/   r0   r1   r2   r
   r<   rú   rï   r-   r  r  r  r3   r   r   r   r    rÈ   í   s    	:


trÈ   c                 C   s   t dd„ |  ¡ D ƒƒS )Nc                 s   s    | ]
}|j r| ¡ V  qd S r.   )rà   Únumel)Ú.0rç   r   r   r    Ú	<genexpr>À  s   € z#count_parameters.<locals>.<genexpr>)r9   rß   )Úmodelr   r   r    Úcount_parameters¿  s   r  c                 C   sN   | d u rddddddddddd	d
œ} t | ƒ}tdt|ƒd d›dƒ || fS )Nr   r!   rÊ   rV   r   rÑ   rÓ   rÏ   r#   )r   r[   rÉ   rË   rÌ   rÍ   r   rÐ   rÒ   rÎ   rÕ   zLeWM TTS v5: g    €„.Az.2fzM trainable parameters)rÈ   Úprintr  )rä   r  r   r   r    Úbuild_modelÃ  s   ûr  r.   )r2   rº   rÜ   r<   Útorch.nnr   Útorch.nn.functionalÚ
functionalr~   ÚModuler   rI   rT   rU   rb   rÂ   rÈ   r  r  r   r   r   r    Ú<module>   s     
#] S