o
    i$                     @  s   d Z ddlmZ ddlmZ ddlZddlm  mZ	 ddlmZ ddl
mZ ddlmZ ddlmZmZmZmZmZmZmZmZ G d	d
 d
ejZG dd dejZG dd dejZdS )z\
ein notation:
b - batch
n - sequence
nt - text sequence
nw - raw wave length
d - dimension
    )annotations)LiteralN)nn)RMSNorm)RotaryEmbedding)	AttentionAttnProcessorConvNeXtV2BlockConvPositionEmbeddingFeedForwardTimestepEmbeddingget_pos_embed_indicesprecompute_freqs_cisc                      s*   e Zd Zd fdd	Zddd	d
Z  ZS )TextEmbeddingTr      c                   s|   t    t|d | _|| _|dkr9d| _d| _| jdt	| jdd tj
 fdd	t|D  | _d S d| _d S )
N   r   Ti   	freqs_cisF)
persistentc                   s   g | ]	}t   qS  )r	   ).0_	conv_multtext_dimr   P/home/ubuntu/.local/lib/python3.10/site-packages/f5_tts/model/backbones/unett.py
<listcomp>0   s    z*TextEmbedding.__init__.<locals>.<listcomp>)super__init__r   	Embedding
text_embedmask_paddingextra_modelingprecompute_max_posregister_bufferr   
Sequentialrangetext_blocks)selftext_num_embedsr   r    conv_layersr   	__class__r   r   r   %   s   


zTextEmbedding.__init__Ftextint['b nt']c              
   C  s  |d }|d d d |f }|j d |j d }}tj|d|| fdd}| jr,|dk}|r3t|}| |}| jrtj|ftj	d}t
||| jd}| j| }	||	 }| jr||ddd|dd}| jD ]}
|
|}||ddd|dd}ql|S | |}|S )Nr   r   value)dtype)max_posg        )shapeFpadr    torch
zeros_liker   r!   zeroslongr   r"   r   masked_fill	unsqueezeexpandsizer&   )r'   r,   seq_len	drop_textbatchtext_len	text_maskbatch_startpos_idxtext_pos_embedblockr   r   r   forward5   s,   


"
$
zTextEmbedding.forward)Tr   r   F)r,   r-   __name__
__module____qualname__r   rG   __classcell__r   r   r*   r   r   $   s    r   c                      s(   e Zd Z fddZd
ddd	Z  ZS )InputEmbeddingc                   s0   t    t|d | || _t|d| _d S )Nr   dim)r   r   r   Linearprojr
   conv_pos_embed)r'   mel_dimr   out_dimr*   r   r   r   Z   s   
zInputEmbedding.__init__Fxfloat['b n d']condr   c                 C  s:   |rt |}| t j|||fdd}| || }|S )Nr2   rO   )r6   r7   rR   catrS   )r'   rV   rX   r   drop_audio_condr   r   r   rG   _   s
   
zInputEmbedding.forwardrH   )rV   rW   rX   rW   r   rW   rI   r   r   r*   r   rN   Y   s    rN   c                      sp   e Zd Zddddddddddd	dd
dddd& fddZ			d'd(ddZdd Z					d)d*d$d%Z  ZS )+UNetT   @   g?   d      NTr   r6   Fconcat)depthheadsdim_headdropoutff_multrT   r(   r   text_mask_paddingqk_normr)   pe_attn_headattn_backendattn_mask_enabledskip_connect_typerl    Literal['add', 'concat', 'none']c             
     s@  t    |d dksJ dt|| _|	d u r|}	t||	|
|d| _d\| _| _t||	|| _	t
|| _|| _|| _|dk}|| _tg | _t|D ]E}||d k}t|}tt|||d|||||d}t|}t|||d	d
}|r|rtj|d |ddnd }| jt|||||g qLt|| _t||| _d S )Nr   r   z(UNet-Transformer's depth should be even.)r    r)   NNra   )ri   rj   rk   )	processorrP   rc   rd   re   rh   tanh)rP   multre   approximateF)bias)r   r   r   
time_embedr   r   	text_condtext_uncondrN   input_embedr   rotary_embedrP   rl   rb   r   
ModuleListlayersr%   r   r   r   r   rQ   appendnorm_outproj_out)r'   rP   rb   rc   rd   re   rf   rT   r(   r   rg   rh   r)   ri   rj   rk   rl   needs_skip_projidxis_later_half	attn_normattnff_normff	skip_projr*   r   r   r   l   s\   


 
zUNetT.__init__rZ   boolr?   cachec           	      C  s   |j d }|r-|r| jd u r| j||dd| _| j}n| jd u r)| j||dd| _| j}n| j|||d}| j||||d}|S )Nr   T)r?   F)rZ   )r3   rv   r   ru   rw   )	r'   rV   rX   r,   rZ   r?   r   r>   r   r   r   r   get_input_embed   s   
	

zUNetT.get_input_embedc                 C  s   d\| _ | _d S )Nrn   )ru   rv   )r'   r   r   r   clear_cache   s   zUNetT.clear_cacherV   rW   rX   r,   r-   timefloat['b'] | float['']maskbool['b n'] | None	cfg_inferc
                 C  s  |j d |j d }
}|jdkr||
}| |}|rT| j|||dd|	d}| j|||dd|	d}tj||fdd}tj||fdd}|d urQtj||fddnd }n| j||||||	d}tj|d|gdd}|d urwtj	|ddd}| j
|d }| j}g }t| jD ]Q\}\}}}}}|d }|| jd	 k}| }|r|| |r| }|d
krtj||fdd}||}n|dkr|| }|||||d| }|||| }qt|dksJ | |d d dd d d f }| |S )Nr   r   F)rZ   r?   r   TrO   )r   r   r.   r   ra   r2   add)roper   )r3   ndimrepeatrt   r   r6   rY   r;   r4   r5   rx   forward_from_seq_lenrl   	enumeraterz   rb   r{   poplenr|   r}   )r'   rV   rX   r,   r   r   rZ   r?   r   r   r@   r>   tx_condx_uncondr   rl   skipsr   maybe_skip_projr   r   r   r   layeris_first_halfr   skipr   r   r   rG      sF   


 

 
zUNetT.forward)rl   rm   )FFT)rZ   r   r?   r   r   r   )NFFFF)rV   rW   rX   rW   r,   r-   r   r   r   r   rZ   r   r?   r   r   r   r   r   )rJ   rK   rL   r   r   r   rG   rM   r   r   r*   r   r[   k   s8    U	r[   )__doc__
__future__r   typingr   r6   torch.nn.functionalr   
functionalr4   x_transformersr   x_transformers.x_transformersr   f5_tts.model.modulesr   r   r	   r
   r   r   r   r   Moduler   rN   r[   r   r   r   r   <module>   s    
(5