o
    i &                     @  s   d Z ddlmZ ddlmZ ddlmZ ddlZddlm  m	Z
 ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZmZmZ G dd dejZdS )z\
ein notation:
b - batch
n - sequence
nt - text sequence
nw - raw wave length
d - dimension
    )annotations)random)CallableN)nn)pad_sequence)odeint)MelSpec)defaultexistsget_epss_timestepslens_to_masklist_str_to_idxlist_str_to_tensormask_from_frac_lengthsc                      s   e Zd Zdedddddde ddf	d2 fddZedd Ze dddddddddddddd3d*d+Z	ddd,d4d0d1Z
  ZS )5CFM        euler)methodg333333?g?N)gffffff?      ?transformer	nn.Moduleodeint_kwargsdictmel_spec_modulenn.Module | Nonemel_spec_kwargsfrac_lengths_masktuple[float, float]vocab_char_mapdict[str:int] | Nonec                   sn   t    |	| _t|tdi || _t|| jj}|| _|| _|| _	|| _
|j}|| _|| _|| _|
| _d S )N )super__init__r   r	   r   mel_specn_mel_channelsnum_channelsaudio_drop_probcond_drop_probr   dimsigmar   r   )selfr   r)   r   r&   r'   r%   r   r   r   r   r(   	__class__r    D/home/ubuntu/.local/lib/python3.10/site-packages/f5_tts/model/cfm.pyr"   #   s   

zCFM.__init__c                 C  s   t |  jS )N)next
parametersdevice)r*   r    r    r-   r0   O   s   z
CFM.device    r   i   TFg?)lensstepscfg_strengthsway_sampling_coefseedmax_durationvocoderuse_epssno_ref_audioduplicate_testt_inter	edit_maskcondfloat['b n d'] | float['b nw']textint['b nt'] | list[str]durationint | int['b']r2   int['b'] | Noner6   
int | Noner8   0Callable[[float['b d n']], float['b nw']] | Nonec             	     sN     |jdkr|}|ddd}|jd jksJ |t j	}g |jd d |j
R \}}}t|sHtj|f||tjd}ttrltjr\tj|nt|jd |kslJ t|}|d urx||@ }t|trtj|f||tjd}ttdkjdd|d |}|j|	d}| }	|rtj|dd||	d|  fdd	}tj|ddd|	| fdd	}|rt|}tj|d|	|jd  fd
d	}|d}t||t||dkrt|nd  fdd}g }|D ]}t|rt| | tj!|jj
j	d qt"|ddd}d}|r?|}d| | ||  }t|d|  }|dkrQ|rQt#|j
j	d}ntj$|d|d j
j	d}|d urv||t%tj&d | d |   }t'|||fi j(}j)*  |d }|}t|||}t|
r|ddd}|
|}||fS )N   r      )r0   dtyper(   )maxr   )valueFc              
     sb    dk rj || dddd}|S j || ddd}tj|ddd\}}|||    S )	Ngh㈵>FT)xr>   r@   timemaskdrop_audio_cond	drop_textcache)rN   r>   r@   rO   rP   	cfg_inferrS   rG   r   rK   )r   torchchunk)trN   predpred_cfg	null_predr4   rP   r*   	step_condr@   r    r-   fn   s.   
	zCFM.sample.<locals>.fnT)padding_valuebatch_first)+evalndimr#   permuteshaper%   tor.   r/   rJ   r0   r
   rU   fulllong
isinstancelistr   r   r   r   intmaximumsumclampamaxFpad
zeros_like	unsqueezewheremanual_seedappendrandnr   r   linspacecospir   r   r   clear_cache)r*   r>   r@   rB   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   batchcond_seq_lenr0   	cond_mask	test_condr]   y0durt_startrW   
trajectorysampledoutr    r[   r-   sampleS   s~   

 


 


"

"
$

z
CFM.sample)r2   noise_schedulerinpr   
str | Nonec             	   C  s  |j dkr| |}|ddd}|jd | jksJ g |jd d |j| j| jR \}}}}}	t|t	rWt
| jrGt|| j|}nt||}|jd |ksWJ t
|sdtj|f||d}t||d}
tj|f| jd j| j }t||}t
|
r||
M }|}t|}tj|f|| jd}|dd}d| | ||  }|| }t|d t||}t | jk }t | jk rd	}d	}nd
}| j|||||||
d}t j!||dd}|| }|" ||fS )NrG   r   rH   rI   )r0   )length)rJ   r0   ).NTF)rN   r>   r@   rO   rQ   rR   rP   none)	reduction)#ra   r#   rb   rc   r%   rJ   r0   r)   rg   rh   r
   r   r   rd   r   rU   re   r   zerosfloatuniform_r   r   
randn_likerandrq   rr   rp   r   r&   r'   r   rn   mse_lossmean)r*   r   r@   r2   r   rz   seq_lenrJ   r0      _σ1rP   frac_lengthsrand_span_maskx1x0rO   rW      φflowr>   rQ   rR   rX   lossr    r    r-   forward   sF   
	
0



zCFM.forward)r   r   r   r   r   r   r   r   r   r   r   r   )r>   r?   r@   rA   rB   rC   r2   rD   r6   rE   r8   rF   )r   r?   r@   rA   r2   rD   r   r   )__name__
__module____qualname__r   r"   propertyr0   rU   no_gradr   r   __classcell__r    r    r+   r-   r   "   sB    ,
 r   )__doc__
__future__r   r   typingr   rU   torch.nn.functionalr   
functionalrn   torch.nn.utils.rnnr   torchdiffeqr   f5_tts.model.modulesr   f5_tts.model.utilsr	   r
   r   r   r   r   r   Moduler   r    r    r    r-   <module>   s    
$