o
    ¡¿¯iP  ã                   @   sš   d Z ddlZddlmZmZmZmZ ddlZddlZ	ddl
Z
ddlZddlm  mZ ddlmZ ddlmZ ddlmZ ddlmZ G dd	„ d	eƒZdS )
z-F0 extractor using DIO + Stonemask algorithm.é    N)ÚAnyÚDictÚTupleÚUnion)Úinterp1d)Úcheck_argument_types)ÚAbsFeatsExtract)Úpad_listc                       s4  e Zd ZdZ									d,d	eeef d
edededededededef‡ fdd„Zdefdd„Z	de
eef fdd„Z				d-dejdejdejdejdejdeejejf fdd„Zdejdejfd d!„Zed"ejd#ejdejfd$d%„ƒZed&ejdejfd'd(„ƒZd"ejd)ejdejfd*d+„Z‡  ZS ).ÚDioa<  F0 estimation with dio + stonemask algorithm.

    This is f0 extractor based on dio + stonmask algorithm introduced in `WORLD:
    a vocoder-based high-quality speech synthesis system for real-time applications`_.

    .. _`WORLD: a vocoder-based high-quality speech synthesis system for real-time
        applications`: https://doi.org/10.1587/transinf.2015EDP7457

    Note:
        This module is based on NumPy implementation. Therefore, the computational graph
        is not connected.

    Todo:
        Replace this module with PyTorch-based implementation.

    é"V  é   é   éP   é  TNÚfsÚn_fftÚ
hop_lengthÚf0minÚf0maxÚuse_token_averaged_f0Úuse_continuous_f0Ú
use_log_f0Úreduction_factorc
           
         s€   t ƒ sJ ‚tƒ  ¡  t|tƒrt |¡}|| _|| _|| _	d| | | _
|| _|| _|| _|| _|| _|r;|	dks;J ‚|	| _d S )Niè  é   )r   ÚsuperÚ__init__Ú
isinstanceÚstrÚhumanfriendlyÚ
parse_sizer   r   r   Úframe_periodr   r   r   r   r   r   )
Úselfr   r   r   r   r   r   r   r   r   ©Ú	__class__© úQ/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/tts/feats_extract/dio.pyr   '   s    




zDio.__init__Úreturnc                 C   s   dS )Nr   r$   ©r!   r$   r$   r%   Úoutput_sizeD   s   zDio.output_sizec                 C   s,   t | j| j| j| j| j| j| j| j| j	d	S )N)	r   r   r   r   r   r   r   r   r   )
Údictr   r   r   r   r   r   r   r   r   r'   r$   r$   r%   Úget_parametersG   s   ÷zDio.get_parametersÚinputÚinput_lengthsÚfeats_lengthsÚ	durationsÚdurations_lengthsc                    sÀ   |d u r|j |jd tjd|jd  }‡ fdd„t||ƒD ƒ}|d ur0‡ fdd„t||ƒD ƒ}ˆ jrG|ˆ j }‡ fdd„t||ƒD ƒ}|}n|jdd„ |D ƒtjd}t|d	ƒ}| 	d
¡|fS )Nr   ©Údtyper   c                    s"   g | ]\}}ˆ   |d |… ¡‘qS )N)Ú_calculate_f0)Ú.0ÚxÚxlr'   r$   r%   Ú
<listcomp>c   s   " zDio.forward.<locals>.<listcomp>c                    ó"   g | ]\}}ˆ   ||¡ d ¡‘qS ©éÿÿÿÿ)Ú_adjust_num_framesÚview)r3   ÚpÚflr'   r$   r%   r6   g   ó    ÿÿc                    r7   r8   )Ú_average_by_durationr;   )r3   r<   Údr'   r$   r%   r6   o   r>   c                 S   s   g | ]}t |ƒ‘qS r$   )Úlen)r3   r<   r$   r$   r%   r6   u   s    ç        r9   )
Únew_onesÚshapeÚtorchÚlongÚzipr   r   Ú
new_tensorr	   Ú	unsqueeze)r!   r+   r,   r-   r.   r/   ÚpitchÚpitch_lengthsr$   r'   r%   ÚforwardT   s"   	ÿ
þ

þ
zDio.forwardc                 C   s–   |  ¡  ¡  tj¡}tj|| j| j| j	| j
d\}}t |||| j¡}| jr+|  |¡}| jr@t |dk¡d }t || ¡||< |j| d¡tjdS )N)Úf0_floorÚf0_ceilr    r   r9   r0   )ÚcpuÚnumpyÚastypeÚnpÚdoubleÚpyworldÚdior   r   r   r    Ú	stonemaskr   Ú_convert_to_continuous_f0r   ÚwhereÚlogrH   ÚreshaperE   Úfloat)r!   r+   r4   Úf0ÚtimeaxisÚnonzero_idxsr$   r$   r%   r2   }   s   
û
zDio._calculate_f0r4   Ú
num_framesc                 C   sD   |t | ƒkrt | d|t | ƒ f¡} | S |t | ƒk r | d |… } | S )Nr   )rA   ÚFÚpad)r4   r_   r$   r$   r%   r:   Ž   s   þzDio._adjust_num_framesr\   c                 C   s¸   | dk  ¡ rt d¡ | S | | dk d }| | dk d }t | |k¡d d }t | |k¡d d }|| d |…< || |d …< t | dk¡d }t|| | ƒ}|t d| jd ¡ƒ} | S )Nr   z All frames seems to be unvoiced.r9   )ÚallÚloggingÚwarnrR   rX   r   ÚarangerD   )r\   Ústart_f0Úend_f0Ú	start_idxÚend_idxr^   Ú	interp_fnr$   r$   r%   rW   –   s   
zDio._convert_to_continuous_f0r@   c                    sp   dt ˆ ƒ| ¡    kr| jk sJ ‚ J ‚t |jddd¡}‡ fdd„t|d d… |dd … ƒD ƒ}t |¡S )Nr   ©Údim)r   r   c                    sj   g | ]1\}}t ˆ ||…  ˆ ||…  d ¡¡ƒdkr.ˆ ||…  ˆ ||…  d ¡¡jddnˆ  d ¡‘qS )rB   r   rk   )rA   Úmasked_selectÚgtÚmeanrH   )r3   ÚstartÚend©r4   r$   r%   r6   °   s    (þ(ÿýz,Dio._average_by_duration.<locals>.<listcomp>r9   r   )	rA   Úsumr   r`   ra   ÚcumsumrG   rE   Ústack)r!   r4   r@   Úd_cumsumÚx_avgr$   rr   r%   r?   ­   s   *
ü
zDio._average_by_duration)	r   r   r   r   r   TTTN)NNNN)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   Úintr   Úboolr   r(   r   r   r*   rE   ÚTensorr   rL   r2   Ústaticmethodr:   rR   ÚarrayrW   r?   Ú__classcell__r$   r$   r"   r%   r
      sn    ö
þýüûúùø	÷
öúþýüûú
ù)$r
   )r{   rc   Útypingr   r   r   r   r   rP   rR   rT   rE   Útorch.nn.functionalÚnnÚ
functionalr`   Úscipy.interpolater   Ú	typeguardr   Ú+espnet2.tts.feats_extract.abs_feats_extractr   Ú&espnet.nets.pytorch_backend.nets_utilsr	   r
   r$   r$   r$   r%   Ú<module>   s   