o
    }oin                     @   s  d dl mZ d dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
mZ d dlmZmZ d dlmZmZmZmZ d d	lmZ d d
lmZ g dZdd Z	d<dededededededefddZG dd dejjZG dd dejjZ G dd dejj!Z"G dd dejj!ej#Z$G d d! d!ejj!Z%G d"d# d#ejj!Z&G d$d% d%ejj!Z'd&d' Z(G d(d) d)ejj!Z)G d*d+ d+ejj!Z*G d,d- d-ejj+Z,G d.d/ d/ejj!Z-G d0d1 d1eZ.G d2d3 d3ejj!Z/G d4d5 d5eZ0G d6d7 d7eZ1G d8d9 d9ejj!Z2G d:d; d;eZ3dS )=    )TupleN)Tensor)Variable)
functional)pack_padded_sequencepad_packed_sequence)NeuralModuleadapter_mixins)EncodedRepresentationIndexLengthsTypeMelSpectrogramType)
NeuralType)logging)addconcat	layernormc                 C   s$   | D ]}|t vrtd| qd S )NzUnknown conditioning type )SUPPORTED_CONDITION_TYPES
ValueError)condition_typestp r   [/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/tts/modules/submodules.pycheck_support_condition_types    s
   r   h㈵>inputmaskweightbiasmomentumepsreturnc           
      C   s   | d}| |  d| }| |d  | d  d| }| |d  t|d |  }	|	|dddf d  |dddf d  }	|	S )zApplies Masked Instance Normalization for each channel in each data sample in a batch.

    See :class:`~MaskedInstanceNorm1d` for details.
    ).N   N)sumtorchsqrt)
r   r   r   r   r   r    lengthsmeanvaroutr   r   r   masked_instance_norm&   s   
,r,   c                       sZ   e Zd ZdZ				ddedededed	ed
df fddZdeded
efddZ	  Z
S )MaskedInstanceNorm1daR  Applies Instance Normalization over a masked 3D input
    (a mini-batch of 1D inputs with additional channel dimension)..

    See documentation of :class:`~torch.nn.InstanceNorm1d` for details.

    Shape:
        - Input: :math:`(N, C, L)`
        - Mask: :math:`(N, 1, L)`
        - Output: :math:`(N, C, L)` (same shape as input)
    r   皙?Fnum_featuresr    r   affinetrack_running_statsr!   Nc                    s   t t| ||||| d S N)superr-   __init__)selfr/   r    r   r0   r1   	__class__r   r   r4   B   s   zMaskedInstanceNorm1d.__init__r   r   c                 C   s   t ||| j| j| j| jS r2   )r,   r   r   r   r    )r5   r   r   r   r   r   forwardL   s   zMaskedInstanceNorm1d.forward)r   r.   FF)__name__
__module____qualname____doc__intfloatboolr4   r   r8   __classcell__r   r   r6   r   r-   6   s(    
r-   c                       s8   e Zd ZU dZdgZeed<  fddZdd Z  Z	S )PartialConv1da  
    Zero padding creates a unique identifier for where the edge of the data is, such that the model can almost always identify
    exactly where it is relative to either edge given a sufficient receptive field. Partial padding goes to some lengths to remove 
    this affect.
    slide_winsizec                    sX   t t| j|i | tdd| jd }| jd|dd | jjd | jjd  | _	d S )N   r   weight_maskUpdaterF)
persistentr$   )
r3   rA   r4   r&   oneskernel_sizeregister_bufferrD   shaperB   )r5   argskwargsrD   r6   r   r   r4   Z   s   zPartialConv1d.__init__c           
   
   C   s  |d u rt jdd|jd |j|jd}n|}t ||}t  4 tj|| j	d | j
| j| jdd}t ||dk| j}| j| }t |dd}t ||}W d    n1 sWw   Y  | || j| j}| jd ur| jd| jd}t || || }	t |	|}	|	S t ||}	|	S )NrC   r$   )dtypedevice)r   stridepaddingdilationgroupsr   )r&   rF   rI   rL   rM   mulno_gradFconv1drD   rN   rO   rP   masked_fillrB   clamp_conv_forwardr   r   viewout_channels)
r5   r   mask_inr   update_maskupdate_mask_filled
mask_ratioraw_out	bias_viewoutputr   r   r   r8   `   s6    
	

zPartialConv1d.forward)
r9   r:   r;   r<   __constants__r>   __annotations__r4   r8   r@   r   r   r6   r   rA   P   s   
 rA   c                       s&   e Zd Zd fdd	Zdd Z  ZS )
LinearNormTlinearc                    sB   t    tjj|||d| _tjjj| jjtjj	|d d S )Nr   gain)
r3   r4   r&   nnLinearlinear_layerinitxavier_uniform_r   calculate_gain)r5   in_dimout_dimr   w_init_gainr6   r   r   r4      s   
$zLinearNorm.__init__c                 C   
   |  |S r2   )rk   )r5   xr   r   r   r8         
zLinearNorm.forward)Tre   r9   r:   r;   r4   r8   r@   r   r   r6   r   rd      s    rd   c                       sJ   e Zd ZU dgZeed< 									d fdd	Zdd	d
Z  ZS )ConvNormuse_partial_paddingrC   NTre   Fc              	      s   t t|   |d u r|d dksJ t||d  d }|	| _tjj}|	r(t}||||||||d| _	tjj
j| j	jtjj
|d |
rOtjj| j	| _	|d ur\||dd| _d S d | _d S )Nr$   rC   )rG   rN   rO   rP   r   rg   T)r0   )r3   rv   r4   r=   rw   r&   ri   Conv1drA   convrl   rm   r   rn   utilsweight_normnorm)r5   in_channelsrZ   rG   rN   rO   rP   r   rq   rw   use_weight_normnorm_fnconv_fnr6   r   r   r4      s.    	
zConvNorm.__init__c                 C   s   | j r| ||}| jd ur| ||}n|d ur||}| |}| jd ur-| |}|  r>| |dddd}|S NrC   r$   )rw   ry   r|   rR   is_adapter_availableforward_enabled_adapters	transpose)r5   signalr   retr   r   r   r8      s   




zConvNorm.forward)	rC   rC   NrC   Tre   FFNr2   )	r9   r:   r;   rb   r?   rc   r4   r8   r@   r   r   r6   r   rv      s   
 'rv   c                       s$   e Zd Z fddZdd Z  ZS )LocationLayerc              	      sH   t    t|d d }td|||dddd| _t||ddd| _d S )NrC   r$   F)rG   rO   r   rN   rP   tanhr   rq   )r3   r4   r=   rv   location_convrd   location_dense)r5   attention_n_filtersattention_kernel_sizeattention_dimrO   r6   r   r   r4      s   
	zLocationLayer.__init__c                 C   s$   |  |}|dd}| |}|S r   )r   r   r   )r5   attention_weights_catprocessed_attentionr   r   r   r8      s   

zLocationLayer.forwardru   r   r   r6   r   r      s    r   c                       s,   e Zd Z fddZdd Zdd Z  ZS )	Attentionc                    s\   t    t||ddd| _t||ddd| _t|ddd| _t|||| _td | _	d S )NFr   r   rC   rf   inf)
r3   r4   rd   query_layermemory_layervr   location_layerr>   score_mask_value)r5   attention_rnn_dimembedding_dimr   attention_location_n_filtersattention_location_kernel_sizer6   r   r   r4      s   
zAttention.__init__c                 C   s@   |  |d}| |}| t|| | }|d}|S )aS  
        PARAMS
        ------
        query: decoder output (batch, n_mel_channels * n_frames_per_step)
        processed_memory: processed encoder outputs (B, T_in, attention_dim)
        attention_weights_cat: cumulative and prev. att weights (B, 2, max_time)
        RETURNS
        -------
        alignment (batch, max_time)
        rC   r"   )r   	unsqueezer   r   r&   r   squeeze)r5   queryprocessed_memoryr   processed_queryprocessed_attention_weightsenergiesr   r   r   get_alignment_energies   s
   

z Attention.get_alignment_energiesc           	      C   sX   |  |||}|dur|j|| j tj|dd}t|d|}|	d}||fS )a)  
        PARAMS
        ------
        attention_hidden_state: attention rnn last output
        memory: encoder outputs
        processed_memory: processed encoder outputs
        attention_weights_cat: previous and cummulative attention weights
        mask: binary mask for padded data
        NrC   dim)
r   datamasked_fill_r   rT   softmaxr&   bmmr   r   )	r5   attention_hidden_statememoryr   r   r   	alignmentattention_weightsattention_contextr   r   r   r8     s   
zAttention.forward)r9   r:   r;   r4   r   r8   r@   r   r   r6   r   r      s    r   c                       s(   e Zd Zd fdd	ZdddZ  ZS )	Prenet      ?c                    sD   t    |g|d d  }|| _tjdd t||D | _d S )Nr"   c                 S   s   g | ]\}}t ||d dqS )Frf   )rd   ).0in_sizeout_sizer   r   r   
<listcomp>"  s    z#Prenet.__init__.<locals>.<listcomp>)r3   r4   	p_dropoutr&   ri   
ModuleListziplayers)r5   ro   sizesr   in_sizesr6   r   r   r4     s   

zPrenet.__init__Fc              	   C   s   |rE| j D ]=}t||}|d d}tjt|j	|j
 d| j }||
d|
d}|| d d| j  }q|S | j D ]}tjt||| jdd}qH|S )Nr   rC   T)ptraining)r   rT   relur   r&   autogradr   	bernoullir   newsizefill_r   expanddropout)r5   rs   	inferencere   x0r   r   r   r   r8   %  s   
,
zPrenet.forward)r   Fru   r   r   r6   r   r         r   c                 C   sT   | | }t |d d d |d d f }t |d d |d d d f }|| }|S r2   )r&   r   sigmoid)input_ainput_bn_channels_intin_actt_acts_actactsr   r   r   fused_add_tanh_sigmoid_multiply3  s
     r   c                       s0   e Zd ZdZ fddZddefddZ  ZS )	Invertible1x1Convz
    The layer outputs both the convolution, and the log determinant
    of its weight matrix.  If reverse=True it does convolution with
    inverse
    c                    s   t    tjj||ddddd| _tjt||	 d }t
|dk r7d|d d df  |d d df< |||d}|| jj_d | _d S )NrC   r   FrG   rN   rO   r   r"   )r3   r4   r&   ri   rx   ry   linalgqrFloatTensornormal_detrY   r   r   inv_conv)r5   cWr6   r   r   r4   B  s   
 

zInvertible1x1Conv.__init__Freversec           	      C   s   |rA| j d u r<tjj| jj| jjddddd| _ | jj j	
  }t|d }|| j j_	| j j| jjj| jjjd |  |S | jj }| \}}}|| t|
  }| |}||fS )NrC   r   Fr   r#   )rM   rL   )r   r&   ri   rx   ry   r}   rZ   r   r   r   r>   inverser   torM   rL   r   logdet)	r5   zr   	W_inverser   
batch_size
group_sizen_of_groups	log_det_Wr   r   r   r8   P  s"   



zInvertible1x1Conv.forwardr   )r9   r:   r;   r<   r4   r?   r8   r@   r   r   r6   r   r   ;  s    r   c                       s:   e Zd ZdZ fddZdeejejf fddZ  Z	S )WaveNetz
    This is the WaveNet like layer for the affine coupling.  The primary
    difference from WaveNet is the convolutions need not be causal.  There is
    also no dilation size reset.  The dilation only doubles on each layer
    c                    s  t    |d dksJ |d dksJ || _|| _tj | _tj | _tj	||d}tjj
j|dd}|| _tj	|d| d}|jj  |jj  || _tj	|d| | d}tjj
j|dd| _t|D ]P}	d|	 }
t||
 |
 d }tjj	|d| ||
|d}tjj
j|dd}| j| |	|d k rd| }n|}tj	||d}tjj
j|dd}| j| qnd S )Nr$   rC   r   r   )name)rP   rO   )r3   r4   n_layers
n_channelsr&   ri   r   	in_layersres_skip_layersrx   rz   r{   startr   r   zero_r   end
cond_layerranger=   append)r5   n_in_channelsn_mel_channelsr   r   rG   r   r   r   irP   rO   in_layerres_skip_channelsres_skip_layerr6   r   r   r4   r  s:   

zWaveNet.__init__forward_inputc           	   	   C   s   |d |d }}|  |}t|}| |}t| jD ]X}|d | j }t| j| ||d d ||d| j  d d f | j}| j	| |}|| jd k rq||d d d | jd d f  }||d d | jd d d f  }q|| }q| 
|S )Nr   rC   r$   )r   r&   
zeros_liker   r   r   r   r   r   r   r   )	r5   r   audiospectra   r   spect_offsetr   res_skip_actsr   r   r   r8     s"   


" "

zWaveNet.forward)
r9   r:   r;   r<   r4   r   r&   r   r8   r@   r   r   r6   r   r   k  s    "'r   c                       s<   e Zd ZdZdg f fdd	Zdd Zd	 fdd	Z  ZS )
ConditionalLayerNormz
    This module is used to condition torch.nn.LayerNorm.
    If we don't have any conditions, this will be a normal LayerNorm.
    Nc                    s\   t | d|v | _t j|| j d | jr,tj||| _tj||| _| 	  d S d S )Nr   )elementwise_affine)
r   	conditionr3   r4   r&   ri   rj   cond_weight	cond_biasinit_parametersr5   
hidden_dimcondition_dimr   r6   r   r   r4     s   
zConditionalLayerNorm.__init__c                 C   sT   t jj| jjd t jj| jjd t jj| jjd t jj| jjd d S )N        g      ?)r&   ri   rl   	constant_r  r   r   r  r5   r   r   r   r    s   z$ConditionalLayerNorm.init_parametersc                    sB   t  |}| jr|d u rtd|| | }|| | }|S )NYou should add additional data types as conditions (e.g. speaker id or reference audio) 
                                 and define speaker_encoder in your config.)r3   r8   r   r   r  r  r5   inputsconditioningr6   r   r   r8     s   zConditionalLayerNorm.forwardr2   )r9   r:   r;   r<   r4   r  r8   r@   r   r   r6   r   r     s
    
r   c                       s.   e Zd ZdZg f fdd	ZdddZ  ZS )ConditionalInputz}
    This module is used to condition any model inputs.
    If we don't have any conditions, this will be a normal pass.
    c                    s   t | t   ddg _ fdd|D  _| _| _d jv r/||kr/tj	|| _
d jv r@tj	|| | _d S d S )Nr   r   c                    s   g | ]	}| j v r|qS r   )support_types)r   r   r	  r   r   r     s    z-ConditionalInput.__init__.<locals>.<listcomp>)r   r3   r4   r  r   r  r  r&   ri   rj   add_projconcat_projr  r6   r	  r   r4     s   


zConditionalInput.__init__Nc                 C   s   t | jdkr@|du rtdd| jv r#| j| jkr| |}|| }d| jv r@|d|jd d}tj	||gdd}| 
|}|S )	z
        Args:
            inputs (torch.tensor): B x T x H tensor.
            conditioning (torch.tensor): B x 1 x C conditioning embedding.
        r   Nr
  r   r   rC   r"   r   )lenr   r   r  r  r  repeatrI   r&   catr  r  r   r   r   r8     s   



zConditionalInput.forwardr2   r9   r:   r;   r<   r4   r8   r@   r   r   r6   r   r    s    r  c                       s>   e Zd Zd fdd	Zedd Zedd	 Zd
d Z  ZS )StyleAttention   
      c              	      s\   t t|   || }tjt||| _tjj||dd||dd| _	tjj
| j d S )Nr  T)	embed_dim	num_headsr   r   kdimvdimbatch_first)r3   r  r4   r&   ri   	Parameterr   tokensMultiheadAttentionmharl   r   )r5   gst_sizen_style_tokenn_style_attn_head
token_sizer6   r   r   r4     s   	zStyleAttention.__init__c                 C   s   t dt t dt dddS )NBDr(  Toptional)r  token_id)r   r
   r   r	  r   r   r   input_types  s   
zStyleAttention.input_typesc                 C      dt dt iS )N	style_embr'  r   r
   r	  r   r   r   output_types     zStyleAttention.output_typesc                 C   sR   | d}|d}t| jd|dd}| j|||d\}}|d}|S )Nr   rC   r"   )r   keyvalue)r   r   rT   r   r   r   r"  r   )r5   r  r   r   r   r/  _r   r   r   r8   %  s   


zStyleAttention.forwardr  r  r  )	r9   r:   r;   r4   propertyr-  r1  r8   r@   r   r   r6   r   r    s    

r  c                       s(   e Zd Zd fdd	Zdd	d
Z  ZS )Conv2DReLUNorm   r$   rC   Tr  c                    sH   t t|   tjj||||||d| _tj|| _tj	|| _
d S )Nr   )r3   r8  r4   r&   ri   Conv2dry   	LayerNormr|   Dropoutr   )r5   r}   rZ   rG   rN   rO   r   r   r6   r   r   r4   0  s   zConv2DReLUNorm.__init__Nc                 C   s`   |d ur|| }|  dddd}t| |}|  dddd}| |}| |}|S )Nr   r9  rC   r$   )
contiguouspermuterT   r   ry   r|   r   )r5   rs   x_maskr   r   r   r8   8  s   

zConv2DReLUNorm.forward)r9  r$   rC   Tr  r2   ru   r   r   r6   r   r8  /  r   r8  c                       sZ   e Zd ZdZ fddZedd Zedd Zdd	 Ze	dddZ
e	dd Z  ZS )ReferenceEncoderz?
    Encode mel-spectrograms to an utterance level feature
    c	           
         s~   t t  dgt| _tj fddtt	|D _
j|t	|d}	tjj|d |	 |dd_d S )NrC   c                    s:   g | ]}t tj| tj|d    dqS )rC   )r}   rZ   rG   rN   rO   r   r   )r8  r=   filter_size)r   r   r   r   rG   rO   r5   rN   r   r   r   O  s    
z-ReferenceEncoder.__init__.<locals>.<listcomp>)n_convsr"   T)
input_sizehidden_sizer  )r3   r@  r4   listrA  r&   ri   r   r   r  r   calculate_post_conv_lengthsGRUgru)
r5   n_melscnn_filtersr   
gru_hiddenrG   rN   rO   r   post_conv_heightr6   rB  r   r4   K  s   

zReferenceEncoder.__init__c                 C      t dt t dt dS )Nr(  r)  T_specr(  )r  inputs_lengthsr   r   r   r	  r   r   r   r-  a     

zReferenceEncoder.input_typesc                 C   r.  )Nr+   r'  r0  r	  r   r   r   r1  h  r2  zReferenceEncoder.output_typesc           	      C   s   | ddd}|}| |dd}| jD ]}|||}| |}| |dd}q| |jd |jd d}| j	  t
|| ddd}| |\}}t|dd	\}}|tt||d d d f }|S )
NrC   r$   r9  r   r"   TF)r  enforce_sorted)r  )r   r   lengths_to_masksr   rG  r=  rY   rI   rI  flatten_parametersr   cpur   r&   aranger  )	r5   r  rQ  rs   x_lensx_maskslayerpacked_xr5  r   r   r   r8   n  s   



 zReferenceEncoder.forwardrC   r9  r$   c                 C   s*   t |D ]}| | d|  | d } q| S )z?Batch lengths after n convolution with fixed kernel/stride/pad.r$   rC   )r   )r(   rC  rG   rN   padr5  r   r   r   rG    s   z,ReferenceEncoder.calculate_post_conv_lengthsc                 C   s6   t |  | j| jd |  | dk }|S )z"Batch of lengths to batch of masksr   rC   )r&   rX  maxr   rM   r   rI   r   )r(   masksr   r   r   rU    s   z!ReferenceEncoder.lengths_to_masks)rC   r9  r$   rC   )r9   r:   r;   r<   r4   r7  r-  r1  r8   staticmethodrG  rU  r@   r   r   r6   r   r@  F  s    

r@  c                       sD   e Zd ZdZ	d fdd	Zedd Zed	d
 Zdd Z  Z	S )GlobalStyleTokenz4
    Global Style Token based Speaker Embedding
    r  r  r  c                    s(   t t|   || _t|||d| _d S )N)r#  r$  r%  )r3   ra  r4   reference_encoderr  style_attention)r5   rb  r#  r$  r%  r6   r   r   r4     s
   zGlobalStyleToken.__init__c                 C   rN  )NrO  r(  )inpinp_lengthsrR  r	  r   r   r   r-    rS  zGlobalStyleToken.input_typesc                 C   r.  )Ngstr'  r0  r	  r   r   r   r1    r2  zGlobalStyleToken.output_typesc                 C   s   |  ||}| |}|S r2   )rb  rc  )r5   rd  re  style_embeddingrf  r   r   r   r8     s   
zGlobalStyleToken.forwardr6  )
r9   r:   r;   r<   r4   r7  r-  r1  r8   r@   r   r   r6   r   ra    s    	

ra  c                       s(   e Zd ZdZ fddZdd Z  ZS )SpeakerLookupTablez-
    LookupTable based Speaker Embedding
    c                    s"   t t|   tj||| _d S r2   )r3   rh  r4   r&   ri   	Embeddingtable)r5   
n_speakersr   r6   r   r   r4     s   zSpeakerLookupTable.__init__c                 C   rr   r2   )rj  )r5   speakerr   r   r   r8     rt   zSpeakerLookupTable.forwardr  r   r   r6   r   rh    s    rh  c                       sL   e Zd ZdZd fdd	Zedd Zedd Zd	d
 ZdddZ	  Z
S )SpeakerEncoderz
    class SpeakerEncoder represents speakers representation. 
    This module can combine GST (global style token) based speaker embeddings and lookup table speaker embeddings.
    Nc                    sD   t t|   || _|| _|durtjt|| _	dS d| _	dS )a  
        lookup_module: Torch module to get lookup based speaker embedding
        gst_module: Neural module to get GST based speaker embedding
        precomputed_embedding_dim: Give precomputed speaker embedding dimension to use precompute speaker embedding
        N)
r3   rm  r4   lookup_module
gst_moduler&   ri   r  emptyprecomputed_emb)r5   rn  ro  precomputed_embedding_dimr6   r   r   r4     s   
zSpeakerEncoder.__init__c                 C   s8   t ddt dt ddt dt ddt dt dddS )NTr*  r(  rO  )r   rl  reference_specreference_spec_lens)r   r   r   r   r	  r   r   r   r-    s
   zSpeakerEncoder.input_typesc                 C   r.  )Nembsr'  r0  r	  r   r   r   r1    r2  zSpeakerEncoder.output_typesc                 C   s   t j|| _d S r2   )r&   ri   r  rq  )r5   embr   r   r   overwrite_precomputed_emb  s   z(SpeakerEncoder.overwrite_precomputed_embc                 C   s   d }| j d ur| j d|dS | jd ur|d ur| |}|d urE|d urE| jd ur@| ||}|d u r:|}|S || }|S td |S )Nr   rC   zCYou may add `gst_module` in speaker_encoder to use reference_audio.)rq  r   r  rn  ro  r   warning)r5   r   rl  rs  rt  ru  r+   r   r   r   r8     s   



zSpeakerEncoder.forward)NNN)NNNN)r9   r:   r;   r<   r4   r7  r-  r1  rw  r8   r@   r   r   r6   r   rm    s    

rm  )r   )4typingr   r&   r   torch.autogradr   torch.nnr   rT   torch.nn.utils.rnnr   r   nemo.core.classesr   r	   nemo.core.neural_types.elementsr
   r   r   r   "nemo.core.neural_types.neural_typer   
nemo.utilsr   r   r   r>   r,   ri   InstanceNorm1dr-   rx   rA   Modulerd   AdapterModuleMixinrv   r   r   r   r   r   r   r;  r   r  r  r8  r@  ra  rh  rm  r   r   r   r   <module>   s\   
1==0G'.(N!