o
    i/                     @  s   d Z ddlmZ ddlZddlm  mZ ddlmZ ddlm	Z	 ddl
mZmZmZmZmZmZ G dd dejZG d	d
 d
ejZG dd dejZdS )z\
ein notation:
b - batch
n - sequence
nt - text sequence
nw - raw wave length
d - dimension
    )annotationsN)nn)RotaryEmbedding)AdaLayerNorm_FinalConvNeXtV2BlockConvPositionEmbeddingDiTBlockTimestepEmbeddingprecompute_freqs_cisc                      s4   e Zd Z	d fdd	Zdd ZddddZ  ZS )TextEmbeddingTFr      c                   s   t    t|d | _|| _|| _|r|sJ d|dkrDd| _d| _| j	dt
| jdd tj fd	d
t|D  | _d S d| _d S )N   zGtext_embedding_average_upsampling requires text_mask_padding to be Truer   Ti    	freqs_cisF)
persistentc                   s   g | ]	}t   qS  )r   .0_	conv_multtext_dimr   N/home/ubuntu/.local/lib/python3.10/site-packages/f5_tts/model/backbones/dit.py
<listcomp>0   s    z*TextEmbedding.__init__.<locals>.<listcomp>)super__init__r   	Embedding
text_embedmask_paddingaverage_upsamplingextra_modelingprecompute_max_posregister_bufferr
   
Sequentialrangetext_blocks)selftext_num_embedsr   r   r   conv_layersr   	__class__r   r   r       s   


zTextEmbedding.__init__c                 C  s  |j \}}}|jdd}t|}t|D ]o}	t||	  }
t||	  }|
dks/|dkr0qt||	 d }||	|d d f }||
 }||
 }g }t|
D ]}|||
| kr[dnd }||g|  qPtj	|d | |j
tjd}|| }|||	d |d d f< q|S )Nr   dimr   devicedtype)shapesumtorch
zeros_liker#   intitemwhereextendtensorr-   long)r%   text	text_masktarget_lensbatchmax_seq_lenr   	text_lensupsampled_textitext_len	audio_len	valid_ind
valid_database_repeat	remainderindicesjrepeat_count	upsampledr   r   r   average_upsample_text_by_mask5   s(   
z+TextEmbedding.average_upsample_text_by_maskr9   int['b nt']c              
   C  s  |d }d }t |r|j|jt jd}t|  }nt|}|d d d |f }tj	|d||j
d  fdd}t |rXt j||jdd}||dk }|| d}| jr_|dk}|rft |}| |}|d ury||d d}| jr| jd |d d f }|d ur|d|d|j }|| }| jr||ddd|dd}| jD ]}	|	|}||ddd|dd}qn| |}| jrt |r|j|jt jd}
nt j|j
d ft||jt jd}
| || |
}|S )Nr   r,   r   )value)r-   g        )r1   	is_tensortor-   r8   r3   maxr4   Fpadr/   arange	unsqueezemasked_fillr   r2   r   r   r   r.   expandsizer$   r   fullrK   )r%   r9   seq_len	drop_textvalid_pos_maskr=   seq_posr:   freqsblockr;   r   r   r   forwardT   sJ   



"
$

"zTextEmbedding.forward)TFr   r   )F)r9   rL   )__name__
__module____qualname__r   rK   r`   __classcell__r   r   r(   r   r      s
    r   c                      s,   e Zd Z fddZ		ddddZ  ZS )InputEmbeddingc                   s0   t    t|d | || _t|d| _d S )Nr   r*   )r   r   r   Linearprojr   conv_pos_embed)r%   mel_dimr   out_dimr(   r   r   r      s   
zInputEmbedding.__init__FNxfloat['b n d']condr   
audio_maskbool['b n'] | Nonec                 C  s>   |rt |}| t j|||fdd}| j||d| }|S )NrN   r*   )mask)r1   r2   rg   catrh   )r%   rk   rm   r   drop_audio_condrn   r   r   r   r`      s
   
zInputEmbedding.forward)FN)rk   rl   rm   rl   r   rl   rn   ro   )ra   rb   rc   r   r`   rd   r   r   r(   r   re      s
    
re   c                      s   e Zd Zdddddddddd	dd
ddd	d	d	d fdd
Zdd Zdd Z						d(d)ddZdd Z									d*d+d&d'Z  Z	S ),DiT   @   g?   d      NTFr   r1   )depthheadsdim_headdropoutff_multri   r&   r   text_mask_padding!text_embedding_average_upsamplingqk_normr'   pe_attn_headattn_backendattn_mask_enabledlong_skip_connectioncheckpoint_activationsc                  s   t    t| _|	d u r|}	t||	|
||d| _d\| _| _t||	| _	t
| _| _|| _t f	ddt|D | _|rUtjd ddnd | _t| _t|| _|| _|   d S )N)r   r   r'   NNc                   s&   g | ]}t  d 	qS ))	r+   rz   r{   r}   r|   r   r   r   r   )r   r   	r   r   r+   r{   r|   r}   rz   r   r   r   r   r      s    z DiT.__init__.<locals>.<listcomp>r   F)bias)r   r   r	   
time_embedr   r   	text_condtext_uncondre   input_embedr   rotary_embedr+   ry   r   
ModuleListr#   transformer_blocksrf   r   r   norm_outproj_outr   initialize_weights)r%   r+   ry   rz   r{   r|   r}   ri   r&   r   r~   r   r   r'   r   r   r   r   r   r(   r   r   r      s4   



zDiT.__init__c                 C  s   | j D ]}tj|jjjd tj|jjjd qtj| jjjd tj| jjjd tj| j	jd tj| j	jd d S )Nr   )
r   r   init	constant_	attn_normlinearweightr   r   r   )r%   r_   r   r   r   r      s   
zDiT.initialize_weightsc                   s    fdd}|S )Nc                    s    |  }|S )Nr   )inputsoutputsmoduler   r   ckpt_forward   s   z&DiT.ckpt_wrapper.<locals>.ckpt_forwardr   )r%   r   r   r   r   r   ckpt_wrapper   s   zDiT.ckpt_wrapperrr   boolr[   cachern   ro   c           
      C  s   | j d u s| jd u s|s/|d u r|jd }n|jdd}| j|||d}	|r/|r,|	| _ n|	| _|r:|r7| j }	n| j}	| j|||	||d}|S )Nr   r*   )rZ   r[   )rr   rn   )r   r   r/   r0   r   r   )
r%   rk   rm   r9   rr   r[   r   rn   rZ   r   r   r   r   get_input_embed   s   
zDiT.get_input_embedc                 C  s   d\| _ | _d S )Nr   )r   r   )r%   r   r   r   clear_cache   s   zDiT.clear_cacherk   rl   rm   r9   rL   timefloat['b'] | float['']rp   	cfg_inferc
              	   C  s^  |j d |j d }
}|jdkr||
}| |}|rV| j|||dd|	|d}| j|||dd|	|d}tj||fdd}tj||fdd}|d urStj||fddnd }n| j||||||	|d}| j|}| j	d uro|}| j
D ]}| jrtjjj| |||||dd}qr|||||d}qr| j	d ur| 	tj||fd	d}| ||}| |}|S )
Nr   r   F)rr   r[   r   rn   Tr*   )use_reentrant)rp   roperN   )r/   ndimrepeatr   r   r1   rq   r   forward_from_seq_lenr   r   r   utils
checkpointr   r   r   )r%   rk   rm   r9   r   rp   rr   r[   r   r   r<   rZ   tx_condx_uncondr   residualr_   outputr   r   r   r`   #  s:   


 

"

zDiT.forward)FFTN)rr   r   r[   r   r   r   rn   ro   )NFFFF)rk   rl   rm   rl   r9   rL   r   r   rp   ro   rr   r   r[   r   r   r   r   r   )
ra   rb   rc   r   r   r   r   r   r`   rd   r   r   r(   r   rs      sB    C 	rs   )__doc__
__future__r   r1   torch.nn.functionalr   
functionalrR   x_transformers.x_transformersr   f5_tts.model.modulesr   r   r   r   r	   r
   Moduler   re   rs   r   r   r   r   <module>   s    
 p