o
    wi%                     @   sT   d dl Z d dlmZ d dl mZ d dlmZmZ d dlmZ G dd de jj	Z
dS )    N)	rearrange)nn)ConditionalInputConvNorm)binarize_attention_parallelc                       s   e Zd ZdZddddg df fdd	Zedd	 ZdddZedd Zedd Z	edd Z
edddZedd ZdddZ  ZS )AlignmentEncodera;  
    Module for alignment text and mel spectrogram.

    Args:
        n_mel_channels: Dimension of mel spectrogram.
        n_text_channels: Dimension of text embeddings.
        n_att_channels: Dimension of model
        temperature: Temperature to scale distance by.
            Suggested to be 0.0005 when using dist_type "l2" and 15.0 when using "cosine".
        condition_types: List of types for nemo.collections.tts.modules.submodules.ConditionalInput.
        dist_type: Distance type to use for similarity measurement. Supports "l2" and "cosine" distance.
    P   i   gMb@?l2c                    s   t    || _t|||| _tjjdd| _tjj	dd| _
tt||d ddddtj t|d |ddd| _tt||d ddddtj t|d |dddtj t||ddd| _|d	krl| j| _d S |d
krv| j| _d S td| d)N   dim   Trelu)kernel_sizebiasw_init_gain   )r   r   r	   cosinezUnknown distance type '')super__init__temperaturer   
cond_inputtorchr   Softmaxsoftmax
LogSoftmaxlog_softmax
Sequentialr   ReLUkey_proj
query_projget_euclidean_distdist_fnget_cosine_dist
ValueError)selfn_mel_channelsn_text_channelsn_att_channelsr   condition_types	dist_type	__class__ a/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/tts/modules/aligner.pyr   &   s,   
	zAlignmentEncoder.__init__c                 C   s(   |d u rd S t |d}| j|| d S )NzB T2 1 -> B 1 1 T2)r   datamasked_fill_)inputsmask
mask_valuer.   r.   r/   _apply_maskI   s   
zAlignmentEncoder._apply_maskNc                 C   s>   |  |}| |}| j||d}| ||td |dS )a  Calculation of distance matrix.

        Args:
            queries (torch.tensor): B x C1 x T1 tensor (probably going to be mel data).
            keys (torch.tensor): B x C2 x T2 tensor (text data).
            mask (torch.tensor): B x T2 x 1 tensor, binary mask for variable length entries and also can be used
                for ignoring unnecessary elements from keys in the resulting distance matrix (True = mask element, False = leave unchanged).
        Output:
            dist (torch.tensor): B x T1 x T2 tensor.
        queries_enckeys_encinfr   )r!   r    r#   r5   floatsqueeze)r&   keysqueriesr3   r7   r8   distr.   r.   r/   get_distQ   s
   


zAlignmentEncoder.get_distc                 C   s2   t | d} t |d}| | d }|jddd}|S )NB C T1 -> B C T1 1B C T2 -> B C 1 T2r   r   T)axiskeepdim)r   sum)r7   r8   distancel2_distr.   r.   r/   r"   g   s
   

z#AlignmentEncoder.get_euclidean_distc                 C   s8   t | d} t |d}tjjj| |dd }t |d}|S )Nr@   rA   r   r   zB T1 T2 -> B 1 T1 T2)r   r   r   
functionalcosine_similarity)r7   r8   cosine_distr.   r.   r/   r$   q   s
   


z AlignmentEncoder.get_cosine_distc                 C   sJ   t | ||}|ddddddf }tt|jdd|s#J |S )zCalculation of durations.

        Args:
            attn_soft (torch.tensor): B x 1 x T1 x T2 tensor.
            text_len (torch.tensor): B tensor, lengths of text.
            spect_len (torch.tensor): B tensor, lengths of mel spectrogram.
        r   Nr   r   r   )r   rD   r   alleq)	attn_softtext_len	spect_len	attn_hard	durationsr.   r.   r/   get_durationsy   s   	zAlignmentEncoder.get_durationsc                 C   s   |   \}}}tt|jdd|sJ t| |d g }t|D ]}|t	| |t
|tjt
||| df  q#tj|| j| jdS )a  Select elements from the distance matrix for the given durations and mask and return mean distance.

        Args:
            dist (torch.tensor): B x T1 x T2 tensor.
            durations (torch.tensor): B x T2 tensor. Dim T2 should sum to T1.
            mask (torch.tensor): B x T2 x 1 binary mask for variable length entries and also can be used
                for ignoring unnecessary elements in dist by T2 dim (True = mask element, False = leave unchanged).
        Output:
            mean_dist (torch.tensor): B x 1 tensor.
        r   r   r   )repeats)dtypedevice)sizer   rJ   rK   rD   r   r5   rangeappendmeanarangerepeat_interleavetensorrS   rT   )r>   rP   r3   
batch_sizet1_sizet2_sizemean_dist_by_durationsdist_idxr.   r.   r/   get_mean_dist_by_durations   s    
z+AlignmentEncoder.get_mean_dist_by_durationsc           	      C   sx   t |d| j}d}d}t||| D ]"}t||||  D ]
}|| ||f 7 }q ||| 7 }||| 7 }q|| S )ap  Calculates the mean distance between text and audio embeddings given a range of text tokens.

        Args:
            l2_dists (torch.tensor): L2 distance matrix from Aligner inference. T1 x T2 tensor.
            durs (torch.tensor): List of durations corresponding to each text token. T2 tensor. Should sum to T1.
            start_token (int): Index of the starting token for the word of interest.
            num_tokens (int): Length (in tokens) of the word of interest.
        Output:
            mean_dist_for_word (float): Mean embedding distance between the word indicated and its predicted audio frames.
        Nr   )r   rD   r0   rV   )	l2_distsdursstart_token
num_tokensstart_frametotal_framesdist_sum	token_ind	frame_indr.   r.   r/   get_mean_distance_for_word   s   z+AlignmentEncoder.get_mean_distance_for_wordc                 C   s   |  |dd|dd}| |}| |}| j||d}| j | }	|dur;| |	t|dddf d  }	|		 }
| 
|	|td  | |	}	|	|
fS )a  Forward pass of the aligner encoder.

        Args:
            queries (torch.tensor): B x C1 x T1 tensor (probably going to be mel data).
            keys (torch.tensor): B x C2 x T2 tensor (text data).
            mask (torch.tensor): B x T2 x 1 tensor, binary mask for variable length entries (True = mask element, False = leave unchanged).
            attn_prior (torch.tensor): prior for attention matrix.
            conditioning (torch.tensor): B x 1 x C2 conditioning embedding
        Output:
            attn (torch.tensor): B x 1 x T1 x T2 attention mask. Final dim T2 should sum to 1.
            attn_logprob (torch.tensor): B x 1 x T1 x T2 log-prob attention mask.
        r   r   r6   Ng:0yE>r9   )r   	transposer!   r    r#   r   r   r   logcloner5   r:   r   )r&   r=   r<   r3   
attn_priorconditioningr7   r8   rE   attnattn_logprobr.   r.   r/   forward   s   

$
zAlignmentEncoder.forward)N)NNN)__name__
__module____qualname____doc__r   staticmethodr5   r?   r"   r$   rQ   ra   rk   rs   __classcell__r.   r.   r,   r/   r      s.    #


	


r   )r   einopsr   r   'nemo.collections.tts.modules.submodulesr   r   (nemo.collections.tts.parts.utils.helpersr   Moduler   r.   r.   r.   r/   <module>   s   