o
    i_                     @  s   d Z ddlmZ ddlZddlmZ ddlmZ ddlmZm	Z	m
Z
mZmZmZ G dd dejZG d	d
 d
ejZG dd dejZdS )z\
ein notation:
b - batch
n - sequence
nt - text sequence
nw - raw wave length
d - dimension
    )annotationsN)nn)RotaryEmbedding)AdaLayerNorm_FinalConvPositionEmbedding
MMDiTBlockTimestepEmbeddingget_pos_embed_indicesprecompute_freqs_cisc                      s*   e Zd Zd fdd	Zddd	d
Z  ZS )TextEmbeddingTc                   sD   t    t|d || _|| _d| _| jdt|| jdd d S )N   i   	freqs_cisF)
persistent)	super__init__r   	Embedding
text_embedmask_paddingprecompute_max_posregister_bufferr
   )selfout_dimtext_num_embedsr   	__class__ P/home/ubuntu/.local/lib/python3.10/site-packages/f5_tts/model/backbones/mmdit.pyr      s
   
zTextEmbedding.__init__Ftextint['b nt']returnint['b nt d']c              	   C  s   |d }| j r|dk}|rt|}| |}tj|jd ftjd}|jd }t||| jd}| j	| }|| }| j rM|
|ddd|dd}|S )Nr   r   )dtype)max_posg        )r   torch
zeros_liker   zerosshapelongr	   r   r   masked_fill	unsqueezeexpandsize)r   r   	drop_text	text_maskbatch_startbatch_text_lenpos_idxtext_pos_embedr   r   r   forward(   s   



"zTextEmbedding.forward)TF)r   r   r   r    __name__
__module____qualname__r   r3   __classcell__r   r   r   r   r      s    	r   c                      s(   e Zd Z fddZd	d
ddZ  ZS )AudioEmbeddingc                   s*   t    td| || _t|| _d S )N   )r   r   r   Linearlinearr   conv_pos_embed)r   in_dimr   r   r   r   r   D   s   
zAudioEmbedding.__init__Fxfloat['b n d']condc                 C  s<   |rt |}t j||fdd}| |}| || }|S )Nr#   dim)r$   r%   catr=   r>   )r   r@   rB   drop_audio_condr   r   r   r3   I   s   

zAudioEmbedding.forwardr4   )r@   rA   rB   rA   r5   r   r   r   r   r:   C   s    r:   c                      sx   e Zd Zdddddddddd	d
d	d fdd
Zdd Zdd Z					d&d'ddZdd Z									d(d)d$d%Z  Z	S )*MMDiT   @   g?   d      TNFr$   )depthheadsdim_headdropoutff_multmel_dimr   text_mask_paddingqk_normcheckpoint_activationsattn_backendattn_mask_enabledc                  s   t    t| _t||	d| _d\| _| _t|| _	t
| _| _| _t f	ddtD | _t| _t|| _|| _|   d S )N)r   NNc                   s.   g | ]}t |d  k d	qS )r   )	rD   rN   rO   rP   rQ   context_pre_onlyrT   rV   rW   )r   ).0i	rV   rW   rM   rD   rO   rP   rQ   rN   rT   r   r   
<listcomp>t   s    
z"MMDiT.__init__.<locals>.<listcomp>)r   r   r   
time_embedr   r   	text_condtext_uncondr:   audio_embedr   rotary_embedrD   rM   r   
ModuleListrangetransformer_blocksr   norm_outr<   proj_outrU   initialize_weights)r   rD   rM   rN   rO   rP   rQ   rR   r   rS   rT   rU   rV   rW   r   r\   r   r   V   s"   



zMMDiT.__init__c                 C  s   | j D ]*}tj|jjjd tj|jjjd tj|jjjd tj|jjjd qtj| j	jjd tj| j	jjd tj| j
jd tj| j
jd d S )Nr   )re   r   init	constant_attn_norm_xr=   weightbiasattn_norm_crf   rg   )r   blockr   r   r   rh      s   
zMMDiT.initialize_weightsc                   s    fdd}|S )Nc                    s    |  }|S )Nr   )inputsoutputsmoduler   r   ckpt_forward   s   z(MMDiT.ckpt_wrapper.<locals>.ckpt_forwardr   )r   rs   rt   r   rr   r   ckpt_wrapper   s   zMMDiT.ckpt_wrapperrF   boolr-   cachec                 C  sr   |r&|r| j d u r| j|dd| _ | j }n| jd u r"| j|dd| _| j}n| j||d}| j|||d}||fS )NT)r-   F)rF   )r`   r   r_   ra   )r   r@   rB   r   rF   r-   rw   cr   r   r   get_input_embed   s   	

zMMDiT.get_input_embedc                 C  s   d\| _ | _d S )NrX   )r_   r`   )r   r   r   r   clear_cache   s   zMMDiT.clear_cacher@   rA   rB   r   r   timefloat['b'] | float['']maskbool['b n'] | None	cfg_inferc
                 C  s  |j d }
|jdkr||
}| |}|d dk}|rj| j|||dd|	d\}}| j|||dd|	d\}}tj||fdd}tj||fdd}tj||fdd}|d ur^tj||fddnd }tj||fdd}n| j||||||	d\}}|j d }|j d }| j|}| j|}| j	D ](}| j
rtjjj| ||||||||dd	\}}q||||||||d\}}q| ||}| |}|S )	Nr   r   F)rF   r-   rw   TrC   )use_reentrant)r}   ropec_ropec_mask)r'   ndimrepeatr^   ry   r$   rE   rb   forward_from_seq_lenre   rU   utils
checkpointru   rf   rg   )r   r@   rB   r   r{   r}   rF   r-   r   rw   batchtr   x_condc_condx_uncondc_uncondrx   seq_lentext_len
rope_audio	rope_textro   outputr   r   r   r3      s:   








zMMDiT.forward)FFT)rF   rv   r-   rv   rw   rv   )NFFFF)r@   rA   rB   rA   r   r   r{   r|   r}   r~   rF   rv   r-   rv   r   rv   rw   rv   )
r6   r7   r8   r   rh   ru   ry   rz   r3   r9   r   r   r   r   rG   U   s6    4	rG   )__doc__
__future__r   r$   r   x_transformers.x_transformersr   f5_tts.model.modulesr   r   r   r   r	   r
   Moduler   r:   rG   r   r   r   r   <module>   s    
 %