import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from numba import jit


class AlignmentModule(nn.Module):
    """Alignment Learning Framework proposed for parallel TTS models in:

    https://arxiv.org/abs/2108.10447

    """

    def __init__(self, adim, odim):
        """Initialize AlignmentModule with text (adim) and feature (odim) sizes."""
        super().__init__()
        self.t_conv1 = nn.Conv1d(adim, adim, kernel_size=3, padding=1)
        self.t_conv2 = nn.Conv1d(adim, adim, kernel_size=1, padding=0)
        self.f_conv1 = nn.Conv1d(odim, adim, kernel_size=3, padding=1)
        self.f_conv2 = nn.Conv1d(adim, adim, kernel_size=3, padding=1)
        self.f_conv3 = nn.Conv1d(adim, adim, kernel_size=1, padding=0)

    def forward(self, text, feats, x_masks=None):
        """Calculate alignment loss.

        Args:
            text (Tensor): Batched text embedding (B, T_text, adim).
            feats (Tensor): Batched acoustic feature (B, T_feats, odim).
            x_masks (Tensor): Mask tensor (B, T_text).

        Returns:
            Tensor: Log probability of attention matrix (B, T_feats, T_text).

        """
        # Project text and acoustic features into a shared space.
        text = text.transpose(1, 2)
        text = F.relu(self.t_conv1(text))
        text = self.t_conv2(text)
        text = text.transpose(1, 2)

        feats = feats.transpose(1, 2)
        feats = F.relu(self.f_conv1(feats))
        feats = F.relu(self.f_conv2(feats))
        feats = self.f_conv3(feats)
        feats = feats.transpose(1, 2)

        # Pairwise negative L2 distance between every feature frame and text token.
        dist = feats.unsqueeze(2) - text.unsqueeze(1)
        dist = torch.norm(dist, p=2, dim=3)
        score = -dist

        if x_masks is not None:
            x_masks = x_masks.unsqueeze(-2)
            score = score.masked_fill(x_masks, -np.inf)

        log_p_attn = F.log_softmax(score, dim=-1)
        return log_p_attn


@jit(nopython=True)
def _monotonic_alignment_search(log_p_attn):
    """Find the most probable monotonic alignment path (hard alignment)."""
    T_mel = log_p_attn.shape[0]
    T_inp = log_p_attn.shape[1]
    Q = np.full((T_inp, T_mel), fill_value=-np.inf)

    log_prob = log_p_attn.transpose(1, 0)  # -> (T_inp, T_mel)
    # 1. Initialize the first input token: it must cover frames 0..j.
    for j in range(T_mel):
        Q[0, j] = log_prob[0, : j + 1].sum()

    # 2. Dynamic programming: either stay on the same token or advance by one.
    for j in range(1, T_mel):
        for i in range(1, min(j + 1, T_inp)):
            Q[i, j] = max(Q[i - 1, j - 1], Q[i, j - 1]) + log_prob[i, j]

    # 3. Backtrack the best path; A[j] is the token index aligned to frame j.
    A = np.full((T_mel,), fill_value=T_inp - 1)
    for j in range(T_mel - 2, -1, -1):  # T_mel-2, ..., 0
        # 'i' in {A[j+1]-1, A[j+1]}
        i_a = A[j + 1] - 1
        i_b = A[j + 1]
        if i_b == 0:
            argmax_i = 0
        elif Q[i_a, j] >= Q[i_b, j]:
            argmax_i = i_a
        else:
            argmax_i = i_b
        A[j] = argmax_i
    return A


def viterbi_decode(log_p_attn, text_lengths, feats_lengths):
    """Extract duration from an attention probability matrix.

    Args:
        log_p_attn (Tensor): Batched log probability of attention
            matrix (B, T_feats, T_text).
        text_lengths (Tensor): Text length tensor (B,).
        feats_lengths (Tensor): Feature length tensor (B,).

    Returns:
        Tensor: Batched token duration extracted from `log_p_attn` (B, T_text).
        Tensor: Binarization loss tensor ().

    """
    B = log_p_attn.size(0)
    T_text = log_p_attn.size(2)
    device = log_p_attn.device

    bin_loss = 0
    ds = torch.zeros((B, T_text), device=device)
    for b in range(B):
        cur_log_p_attn = log_p_attn[b, : feats_lengths[b], : text_lengths[b]]
        viterbi = _monotonic_alignment_search(cur_log_p_attn.detach().cpu().numpy())
        _ds = np.bincount(viterbi)
        ds[b, : len(_ds)] = torch.from_numpy(_ds).to(device)

        t_idx = torch.arange(feats_lengths[b])
        bin_loss = bin_loss - cur_log_p_attn[t_idx, viterbi].mean()
    bin_loss = bin_loss / B
    return ds, bin_loss


@jit(nopython=True)
def _average_by_duration(ds, xs, text_lengths, feats_lengths):
    """Numba helper: average each frame segment into its corresponding token."""
    B = ds.shape[0]
    xs_avg = np.zeros_like(ds)
    ds = ds.astype(np.int32)
    for b in range(B):
        t_text = text_lengths[b]
        t_feats = feats_lengths[b]
        d = ds[b, :t_text]
        d_cumsum = d.cumsum()
        d_cumsum = [0] + list(d_cumsum)
        x = xs[b, :t_feats]
        for n, (start, end) in enumerate(zip(d_cumsum[:-1], d_cumsum[1:])):
            if len(x[start:end]) != 0:
                xs_avg[b, n] = x[start:end].mean()
            else:
                xs_avg[b, n] = 0
    return xs_avg


def average_by_duration(ds, xs, text_lengths, feats_lengths):
    """Average frame-level features into token-level according to durations.

    Args:
        ds (Tensor): Batched token duration (B, T_text).
        xs (Tensor): Batched feature sequences to be averaged (B, T_feats).
        text_lengths (Tensor): Text length tensor (B,).
        feats_lengths (Tensor): Feature length tensor (B,).

    Returns:
        Tensor: Batched feature averaged according to the token duration (B, T_text).

    """
    device = ds.device
    args = [ds, xs, text_lengths, feats_lengths]
    args = [arg.detach().cpu().numpy() for arg in args]
    xs_avg = _average_by_duration(*args)
    xs_avg = torch.from_numpy(xs_avg).to(device)
    return xs_avg
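

# A sketch of turning soft attention into hard per-token durations with
# viterbi_decode. The random log-probability matrix stands in for real
# AlignmentModule output; batch and length values are illustrative assumptions.
def _example_viterbi_decode():
    """Extract durations from a random attention matrix and sanity-check them."""
    B, T_feats, T_text = 2, 50, 10
    log_p_attn = torch.randn(B, T_feats, T_text).log_softmax(dim=-1)
    text_lengths = torch.tensor([10, 7])
    feats_lengths = torch.tensor([50, 33])
    ds, bin_loss = viterbi_decode(log_p_attn, text_lengths, feats_lengths)
    # Every frame is assigned to exactly one token, so the durations of each
    # utterance sum to its feature length.
    assert ds.shape == (B, T_text)
    assert torch.equal(ds.sum(dim=1).long(), feats_lengths)
    return ds, bin_loss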
  

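

# A minimal end-to-end smoke test tying the pieces together: soft alignment
# from AlignmentModule, hard durations from viterbi_decode, and token-level
# averaging of a frame-level track via average_by_duration. Every size here
# (batch, lengths, adim, odim) is an illustrative assumption.
if __name__ == "__main__":
    torch.manual_seed(0)
    B, T_text, T_feats, adim, odim = 2, 10, 50, 256, 80
    text = torch.randn(B, T_text, adim)
    feats = torch.randn(B, T_feats, odim)
    text_lengths = torch.tensor([10, 7])
    feats_lengths = torch.tensor([50, 33])

    alignment = AlignmentModule(adim, odim)
    log_p_attn = alignment(text, feats)  # (B, T_feats, T_text)
    ds, bin_loss = viterbi_decode(log_p_attn, text_lengths, feats_lengths)
    # Average a frame-level scalar track (e.g. pitch) onto the token level.
    pitch = torch.randn(B, T_feats)
    pitch_avg = average_by_duration(ds, pitch, text_lengths, feats_lengths)
    print("log_p_attn:", tuple(log_p_attn.shape))   # (2, 50, 10)
    print("durations:", tuple(ds.shape), "bin_loss:", float(bin_loss))
    print("token-level pitch:", tuple(pitch_avg.shape))  # (2, 10)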