o
    }o™içn  ã                   @   s  d dl mZ d dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
mZ d dlmZmZ d dlmZmZmZmZ d d	lmZ d d
lmZ g d¢Zdd„ Z	d<dededededededefdd„ZG dd„ dejjƒZG dd„ dejjƒZ G dd„ dejj!ƒZ"G dd„ dejj!ej#ƒZ$G d d!„ d!ejj!ƒZ%G d"d#„ d#ejj!ƒZ&G d$d%„ d%ejj!ƒZ'd&d'„ Z(G d(d)„ d)ejj!ƒZ)G d*d+„ d+ejj!ƒZ*G d,d-„ d-ejj+ƒZ,G d.d/„ d/ejj!ƒZ-G d0d1„ d1eƒZ.G d2d3„ d3ejj!ƒZ/G d4d5„ d5eƒZ0G d6d7„ d7eƒZ1G d8d9„ d9ejj!ƒZ2G d:d;„ d;eƒZ3dS )=é    )ÚTupleN)ÚTensor)ÚVariable)Ú
functional)Úpack_padded_sequenceÚpad_packed_sequence)ÚNeuralModuleÚadapter_mixins)ÚEncodedRepresentationÚIndexÚLengthsTypeÚMelSpectrogramType)Ú
NeuralType)Úlogging)ÚaddÚconcatÚ	layernormc                 C   s$   | D ]}|t vrtd|› ƒ‚qd S )NzUnknown conditioning type )ÚSUPPORTED_CONDITION_TYPESÚ
ValueError)Úcondition_typesÚtp© r   ú[/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/tts/modules/submodules.pyÚcheck_support_condition_types    s
   ÿÿr   çñhãˆµøä>ÚinputÚmaskÚweightÚbiasÚmomentumÚepsÚreturnc           
      C   sˆ   |  d¡}| |   d¡| }| |d  | d   d¡| }| |d  t |d | ¡ }	|	|ddd…f d  |ddd…f d  }	|	S )zApplies Masked Instance Normalization for each channel in each data sample in a batch.

    See :class:`~MaskedInstanceNorm1d` for details.
    )éÿÿÿÿ©.Né   N)ÚsumÚtorchÚsqrt)
r   r   r   r   r   r    ÚlengthsÚmeanÚvarÚoutr   r   r   Úmasked_instance_norm&   s   
,r,   c                       sZ   e Zd ZdZ				ddedededed	ed
df‡ fdd„Zdeded
efdd„Z	‡  Z
S )ÚMaskedInstanceNorm1daR  Applies Instance Normalization over a masked 3D input
    (a mini-batch of 1D inputs with additional channel dimension)..

    See documentation of :class:`~torch.nn.InstanceNorm1d` for details.

    Shape:
        - Input: :math:`(N, C, L)`
        - Mask: :math:`(N, 1, L)`
        - Output: :math:`(N, C, L)` (same shape as input)
    r   çš™™™™™¹?FÚnum_featuresr    r   ÚaffineÚtrack_running_statsr!   Nc                    s   t t| ƒ |||||¡ d S ©N)Úsuperr-   Ú__init__)Úselfr/   r    r   r0   r1   ©Ú	__class__r   r   r4   B   s   zMaskedInstanceNorm1d.__init__r   r   c                 C   s   t ||| j| j| j| jƒS r2   )r,   r   r   r   r    )r5   r   r   r   r   r   ÚforwardL   s   zMaskedInstanceNorm1d.forward)r   r.   FF)Ú__name__Ú
__module__Ú__qualname__Ú__doc__ÚintÚfloatÚboolr4   r   r8   Ú__classcell__r   r   r6   r   r-   6   s(    úþýüûúù
r-   c                       s8   e Zd ZU dZdgZeed< ‡ fdd„Zdd„ Z‡  Z	S )ÚPartialConv1da  
    Zero padding creates a unique identifier for where the edge of the data is, such that the model can almost always identify
    exactly where it is relative to either edge given a sufficient receptive field. Partial padding goes to some lengths to remove 
    this affect.
    Úslide_winsizec                    sX   t t| ƒj|i |¤Ž t dd| jd ¡}| jd|dd | jjd | jjd  | _	d S )Né   r   Úweight_maskUpdaterF)Ú
persistentr$   )
r3   rA   r4   r&   ÚonesÚkernel_sizeÚregister_bufferrD   ÚshaperB   )r5   ÚargsÚkwargsrD   r6   r   r   r4   Z   s   zPartialConv1d.__init__c           
   
   C   s  |d u rt jdd|jd |j|jd}n|}t  ||¡}t  ¡ 4 tj|| j	d | j
| j| jdd}t  ||dk| j¡}| j| }t  |dd¡}t  ||¡}W d   ƒ n1 sWw   Y  |  || j| j¡}| jd ur…| j d| jd¡}t  || |¡| }	t  |	|¡}	|	S t  ||¡}	|	S )NrC   r$   )ÚdtypeÚdevice)r   ÚstrideÚpaddingÚdilationÚgroupsr   )r&   rF   rI   rL   rM   ÚmulÚno_gradÚFÚconv1drD   rN   rO   rP   Úmasked_fillrB   ÚclampÚ_conv_forwardr   r   ÚviewÚout_channels)
r5   r   Úmask_inr   Úupdate_maskÚupdate_mask_filledÚ
mask_ratioÚraw_outÚ	bias_viewÚoutputr   r   r   r8   `   s6    
ù	
ó
þzPartialConv1d.forward)
r9   r:   r;   r<   Ú__constants__r>   Ú__annotations__r4   r8   r@   r   r   r6   r   rA   P   s   
 rA   c                       s&   e Zd Zd‡ fdd„	Zdd„ Z‡  ZS )Ú
LinearNormTÚlinearc                    sB   t ƒ  ¡  tjj|||d| _tjjj| jjtjj 	|¡d d S )N©r   ©Úgain)
r3   r4   r&   ÚnnÚLinearÚlinear_layerÚinitÚxavier_uniform_r   Úcalculate_gain)r5   Úin_dimÚout_dimr   Úw_init_gainr6   r   r   r4   ‚   s   
$zLinearNorm.__init__c                 C   ó
   |   |¡S r2   )rk   )r5   Úxr   r   r   r8   ˆ   ó   
zLinearNorm.forward)Tre   ©r9   r:   r;   r4   r8   r@   r   r   r6   r   rd      s    rd   c                       sJ   e Zd ZU dgZeed< 									d‡ fdd„	Zdd	d
„Z‡  ZS )ÚConvNormÚuse_partial_paddingrC   NTre   Fc              	      sÂ   t t| ƒ ¡  |d u r|d dksJ ‚t||d  d ƒ}|	| _tjj}|	r(t}||||||||d| _	tjj
j| j	jtjj
 |¡d |
rOtjj | j	¡| _	|d ur\||dd| _d S d | _d S )Nr$   rC   )rG   rN   rO   rP   r   rg   T)r0   )r3   rv   r4   r=   rw   r&   ri   ÚConv1drA   Úconvrl   rm   r   rn   ÚutilsÚweight_normÚnorm)r5   Úin_channelsrZ   rG   rN   rO   rP   r   rq   rw   Úuse_weight_normÚnorm_fnÚconv_fnr6   r   r   r4      s.   ù 	
zConvNorm.__init__c                 C   s€   | j r|  ||¡}| jd ur|  ||¡}n|d ur| |¡}|  |¡}| jd ur-|  |¡}|  ¡ r>|  | dd¡¡ dd¡}|S ©NrC   r$   )rw   ry   r|   rR   Úis_adapter_availableÚforward_enabled_adaptersÚ	transpose)r5   Úsignalr   Úretr   r   r   r8   ·   s   
€


zConvNorm.forward)	rC   rC   NrC   Tre   FFNr2   )	r9   r:   r;   rb   r?   rc   r4   r8   r@   r   r   r6   r   rv   Œ   s   
 ô'rv   c                       s$   e Zd Z‡ fdd„Zdd„ Z‡  ZS )ÚLocationLayerc              	      sH   t ƒ  ¡  t|d d ƒ}td|||dddd| _t||ddd| _d S )NrC   r$   F)rG   rO   r   rN   rP   Útanh©r   rq   )r3   r4   r=   rv   Úlocation_convrd   Úlocation_dense)r5   Úattention_n_filtersÚattention_kernel_sizeÚattention_dimrO   r6   r   r   r4   Ê   s   
ù	zLocationLayer.__init__c                 C   s$   |   |¡}| dd¡}|  |¡}|S r   )rŠ   r„   r‹   )r5   Úattention_weights_catÚprocessed_attentionr   r   r   r8   Ø   s   

zLocationLayer.forwardru   r   r   r6   r   r‡   É   s    r‡   c                       s,   e Zd Z‡ fdd„Zdd„ Zdd„ Z‡  ZS )Ú	Attentionc                    s\   t ƒ  ¡  t||ddd| _t||ddd| _t|ddd| _t|||ƒ| _tdƒ | _	d S )NFrˆ   r‰   rC   rf   Úinf)
r3   r4   rd   Úquery_layerÚmemory_layerÚvr‡   Úlocation_layerr>   Úscore_mask_value)r5   Úattention_rnn_dimÚembedding_dimrŽ   Úattention_location_n_filtersÚattention_location_kernel_sizer6   r   r   r4   à   s   
ÿzAttention.__init__c                 C   s@   |   | d¡¡}|  |¡}|  t || | ¡¡}| d¡}|S )aS  
        PARAMS
        ------
        query: decoder output (batch, n_mel_channels * n_frames_per_step)
        processed_memory: processed encoder outputs (B, T_in, attention_dim)
        attention_weights_cat: cumulative and prev. att weights (B, 2, max_time)
        RETURNS
        -------
        alignment (batch, max_time)
        rC   r"   )r“   Ú	unsqueezer–   r•   r&   rˆ   Úsqueeze)r5   ÚqueryÚprocessed_memoryr   Úprocessed_queryÚprocessed_attention_weightsÚenergiesr   r   r   Úget_alignment_energiesñ   s
   

z Attention.get_alignment_energiesc           	      C   sX   |   |||¡}|dur|j || j¡ tj|dd}t | d¡|¡}| 	d¡}||fS )a)  
        PARAMS
        ------
        attention_hidden_state: attention rnn last output
        memory: encoder outputs
        processed_memory: processed encoder outputs
        attention_weights_cat: previous and cummulative attention weights
        mask: binary mask for padded data
        NrC   ©Údim)
r£   ÚdataÚmasked_fill_r—   rT   Úsoftmaxr&   Úbmmrœ   r   )	r5   Úattention_hidden_stateÚmemoryrŸ   r   r   Ú	alignmentÚattention_weightsÚattention_contextr   r   r   r8     s   
zAttention.forward)r9   r:   r;   r4   r£   r8   r@   r   r   r6   r   r‘   ß   s    r‘   c                       s(   e Zd Zd‡ fdd„	Zddd„Z‡  ZS )	ÚPrenetç      à?c                    sD   t ƒ  ¡  |g|d d…  }|| _tj dd„ t||ƒD ƒ¡| _d S )Nr"   c                 S   s   g | ]\}}t ||d d‘qS )Frf   )rd   )Ú.0Úin_sizeÚout_sizer   r   r   Ú
<listcomp>"  s    z#Prenet.__init__.<locals>.<listcomp>)r3   r4   Ú	p_dropoutr&   ri   Ú
ModuleListÚzipÚlayers)r5   ro   Úsizesrµ   Úin_sizesr6   r   r   r4     s   

ÿzPrenet.__init__Fc              	   C   s¶   |rE| j D ]=}t ||ƒ¡}|d  d¡}tj t |j 	|j 
¡ ¡ d| j ¡¡¡}| | 
d¡| 
d¡¡}|| d d| j  }q|S | j D ]}tjt ||ƒ¡| jdd}qH|S )Nr   rC   T)ÚpÚtraining)r¸   rT   Úrelurœ   r&   Úautogradr   Ú	bernoullir¦   ÚnewÚsizeÚfill_rµ   ÚexpandÚdropout)r5   rs   Ú	inferencere   Úx0r   r   r   r   r8   %  s   
,
þzPrenet.forward)r°   ©Fru   r   r   r6   r   r¯     ó    r¯   c                 C   sT   | | }t  |d d …d |…d d …f ¡}t  |d d …|d …d d …f ¡}|| }|S r2   )r&   rˆ   Úsigmoid)Úinput_aÚinput_bÚn_channels_intÚin_actÚt_actÚs_actÚactsr   r   r   Úfused_add_tanh_sigmoid_multiply3  s
     rÑ   c                       s0   e Zd ZdZ‡ fdd„Zddefdd„Z‡  ZS )	ÚInvertible1x1Convz™
    The layer outputs both the convolution, and the log determinant
    of its weight matrix.  If reverse=True it does convolution with
    inverse
    c                    s   t ƒ  ¡  tjj||ddddd| _tj t ||¡ 	¡ ¡d }t 
|¡dk r7d|d d …df  |d d …df< | ||d¡}|| jj_d | _d S )NrC   r   F©rG   rN   rO   r   r"   )r3   r4   r&   ri   rx   ry   ÚlinalgÚqrÚFloatTensorÚnormal_ÚdetrY   r   r¦   Úinv_conv)r5   ÚcÚWr6   r   r   r4   B  s   
 

zInvertible1x1Conv.__init__FÚreversec           	      C   sÄ   |rA| j d u r<tjj| jj| jjddddd| _ | jj ¡ j	 
¡  ¡ }t|d ƒ}|| j j_	| j j| jjj| jjjd |   |¡S | jj ¡ }| ¡ \}}}|| t | 
¡ ¡ }|  |¡}||fS )NrC   r   FrÓ   r#   )rM   rL   )rÙ   r&   ri   rx   ry   r}   rZ   r   r   r¦   r>   Úinverser   ÚtorM   rL   rÁ   Úlogdet)	r5   ÚzrÜ   Ú	W_inverserÛ   Ú
batch_sizeÚ
group_sizeÚn_of_groupsÚ	log_det_Wr   r   r   r8   P  s"   
ÿ


þzInvertible1x1Conv.forwardrÇ   )r9   r:   r;   r<   r4   r?   r8   r@   r   r   r6   r   rÒ   ;  s    rÒ   c                       s:   e Zd ZdZ‡ fdd„Zdeejejf fdd„Z‡  Z	S )ÚWaveNetzæ
    This is the WaveNet like layer for the affine coupling.  The primary
    difference from WaveNet is the convolutions need not be causal.  There is
    also no dilation size reset.  The dilation only doubles on each layer
    c                    s‚  t ƒ  ¡  |d dksJ ‚|d dksJ ‚|| _|| _tj ¡ | _tj ¡ | _tj 	||d¡}tjj
j|dd}|| _tj 	|d| d¡}|jj ¡  |jj ¡  || _tj 	|d| | d¡}tjj
j|dd| _t|ƒD ]P}	d|	 }
t||
 |
 d ƒ}tjj	|d| ||
|d}tjj
j|dd}| j |¡ |	|d k r¥d| }n|}tj 	||d¡}tjj
j|dd}| j |¡ qnd S )Nr$   rC   r   r   )Úname)rP   rO   )r3   r4   Ún_layersÚ
n_channelsr&   ri   r¶   Ú	in_layersÚres_skip_layersrx   rz   r{   Ústartr   r¦   Úzero_r   ÚendÚ
cond_layerÚranger=   Úappend)r5   Ún_in_channelsÚn_mel_channelsrè   ré   rG   rì   rî   rï   ÚirP   rO   Úin_layerÚres_skip_channelsÚres_skip_layerr6   r   r   r4   r  s:   

òzWaveNet.__init__Úforward_inputc           	   	   C   sö   |d |d }}|   |¡}t |¡}|  |¡}t| jƒD ]X}|d | j }t| j| |ƒ|d d …||d| j  …d d …f | jƒ}| j	| |ƒ}|| jd k rq||d d …d | j…d d …f  }||d d …| jd …d d …f  }q|| }q|  
|¡S )Nr   rC   r$   )rì   r&   Ú
zeros_likerï   rð   rè   ré   rÑ   rê   rë   rî   )	r5   rø   ÚaudioÚspectra   rô   Úspect_offsetrÐ   Úres_skip_actsr   r   r   r8   ™  s"   


"ý "

zWaveNet.forward)
r9   r:   r;   r<   r4   r   r&   r   r8   r@   r   r   r6   r   ræ   k  s    "'ræ   c                       s<   e Zd ZdZdg f‡ fdd„	Zdd„ Zd	‡ fdd„	Z‡  ZS )
ÚConditionalLayerNormz„
    This module is used to condition torch.nn.LayerNorm.
    If we don't have any conditions, this will be a normal LayerNorm.
    Nc                    s\   t |ƒ d|v | _tƒ j|| j d | jr,tj ||¡| _tj ||¡| _|  	¡  d S d S )Nr   )Úelementwise_affine)
r   Ú	conditionr3   r4   r&   ri   rj   Úcond_weightÚ	cond_biasÚinit_parameters©r5   Ú
hidden_dimÚcondition_dimr   r6   r   r   r4   ¸  s   
ýzConditionalLayerNorm.__init__c                 C   sT   t jj | jjd¡ t jj | jjd¡ t jj | jjd¡ t jj | jjd¡ d S )Nç        g      ð?)r&   ri   rl   Ú	constant_r  r   r   r  ©r5   r   r   r   r  Â  s   z$ConditionalLayerNorm.init_parametersc                    sB   t ƒ  |¡}| jr|d u rtdƒ‚||  |¡ }||  |¡ }|S )Nú¤You should add additional data types as conditions (e.g. speaker id or reference audio) 
                                 and define speaker_encoder in your config.)r3   r8   r   r   r  r  ©r5   ÚinputsÚconditioningr6   r   r   r8   È  s   ÿzConditionalLayerNorm.forwardr2   )r9   r:   r;   r<   r4   r  r8   r@   r   r   r6   r   rþ   ²  s
    
rþ   c                       s.   e Zd ZdZg f‡ fdd„	Zddd„Z‡  ZS )ÚConditionalInputz}
    This module is used to condition any model inputs.
    If we don't have any conditions, this will be a normal pass.
    c                    s„   t |ƒ tƒ  ¡  ddgˆ _‡ fdd„|D ƒˆ _|ˆ _|ˆ _dˆ jv r/||kr/tj 	||¡ˆ _
dˆ jv r@tj 	|| |¡ˆ _d S d S )Nr   r   c                    s   g | ]	}|ˆ j v r|‘qS r   )Úsupport_types)r±   r   r	  r   r   r´   ã  s    z-ConditionalInput.__init__.<locals>.<listcomp>)r   r3   r4   r  r   r  r  r&   ri   rj   Úadd_projÚconcat_projr  r6   r	  r   r4   ß  s   


ÿzConditionalInput.__init__Nc                 C   s„   t | jƒdkr@|du rtdƒ‚d| jv r#| j| jkr|  |¡}|| }d| jv r@| d|jd d¡}tj	||gdd}|  
|¡}|S )	z—
        Args:
            inputs (torch.tensor): B x T x H tensor.
            conditioning (torch.tensor): B x 1 x C conditioning embedding.
        r   Nr
  r   r   rC   r"   r¤   )Úlenr   r   r  r  r  ÚrepeatrI   r&   Úcatr  r  r   r   r   r8   í  s   ÿ



zConditionalInput.forwardr2   ©r9   r:   r;   r<   r4   r8   r@   r   r   r6   r   r  Ù  s    r  c                       s>   e Zd Zd‡ fdd„	Zedd„ ƒZedd	„ ƒZd
d„ Z‡  ZS )ÚStyleAttentioné€   é
   é   c              	      s\   t t| ƒ ¡  || }tj t ||¡¡| _tjj||dd||dd| _	tjj
 | j¡ d S )Nr  T)Ú	embed_dimÚ	num_headsrÄ   r   ÚkdimÚvdimÚbatch_first)r3   r  r4   r&   ri   Ú	ParameterrÖ   ÚtokensÚMultiheadAttentionÚmharl   r×   )r5   Úgst_sizeÚn_style_tokenÚn_style_attn_headÚ
token_sizer6   r   r   r4     s   ù	zStyleAttention.__init__c                 C   s   t dtƒ ƒt dtƒ dddœS )N©ÚBÚDr(  T©Úoptional)r  Útoken_id)r   r
   r   r	  r   r   r   Úinput_types  s   
þzStyleAttention.input_typesc                 C   ó   dt dtƒ ƒiS )NÚ	style_embr'  ©r   r
   r	  r   r   r   Úoutput_types  ó   ÿzStyleAttention.output_typesc                 C   sR   |  d¡}| d¡}t | j¡ d¡ |dd¡}| j|||d\}}| d¡}|S )Nr   rC   r"   )rž   ÚkeyÚvalue)rÁ   rœ   rT   rˆ   r   rÃ   r"  r   )r5   r  râ   rž   r   r/  Ú_r   r   r   r8   %  s   


zStyleAttention.forward©r  r  r  )	r9   r:   r;   r4   Úpropertyr-  r1  r8   r@   r   r   r6   r   r    s    

r  c                       s(   e Zd Zd‡ fdd„	Zdd	d
„Z‡  ZS )ÚConv2DReLUNormé   r$   rC   Tr  c                    sH   t t| ƒ ¡  tjj||||||d| _tj |¡| _tj 	|¡| _
d S )NrÓ   )r3   r8  r4   r&   ri   ÚConv2dry   Ú	LayerNormr|   ÚDropoutrÄ   )r5   r}   rZ   rG   rN   rO   r   rÄ   r6   r   r   r4   0  s   ÿzConv2DReLUNorm.__init__Nc                 C   s`   |d ur|| }|  ¡  dddd¡}t |  |¡¡}|  ¡  dddd¡}|  |¡}|  |¡}|S )Nr   r9  rC   r$   )Ú
contiguousÚpermuterT   r½   ry   r|   rÄ   )r5   rs   Úx_maskr   r   r   r8   8  s   

zConv2DReLUNorm.forward)r9  r$   rC   Tr  r2   ru   r   r   r6   r   r8  /  rÈ   r8  c                       sZ   e Zd ZdZ‡ fdd„Zedd„ ƒZedd„ ƒZdd	„ Ze	ddd„ƒZ
e	dd„ ƒZ‡  ZS )ÚReferenceEncoderz?
    Encode mel-spectrograms to an utterance level feature
    c	           
         s~   t tˆƒ ¡  dgt|ƒ ˆ_tj ‡ ‡‡‡‡‡fdd„tt	|ƒƒD ƒ¡ˆ_
ˆj|t	|ƒd}	tjj|d |	 |ddˆ_d S )NrC   c                    s:   g | ]}t tˆj| ƒtˆj|d   ƒˆˆˆˆ ˆd‘qS )rC   )r}   rZ   rG   rN   rO   r   rÄ   )r8  r=   Úfilter_size)r±   rô   ©r   rÄ   rG   rO   r5   rN   r   r   r´   O  s    
÷ùÿz-ReferenceEncoder.__init__.<locals>.<listcomp>)Ún_convsr"   T)Ú
input_sizeÚhidden_sizer  )r3   r@  r4   ÚlistrA  r&   ri   r¶   rð   r  r¸   Úcalculate_post_conv_lengthsÚGRUÚgru)
r5   Ún_melsÚcnn_filtersrÄ   Ú
gru_hiddenrG   rN   rO   r   Úpost_conv_heightr6   rB  r   r4   K  s   

öÿÿzReferenceEncoder.__init__c                 C   ó   t dtƒ ƒt dtƒ ƒdœS )N©r(  r)  ÚT_specr(  )r  Úinputs_lengths©r   r   r   r	  r   r   r   r-  a  ó   

þzReferenceEncoder.input_typesc                 C   r.  )Nr+   r'  r0  r	  r   r   r   r1  h  r2  zReferenceEncoder.output_typesc           	      C   sà   |  dd¡ d¡}|}|  |¡ d¡ d¡}| jD ]}|||ƒ}|  |¡}|  |¡ d¡ d¡}q| ¡  |jd |jd d¡}| j 	¡  t
|| ¡ ddd}|  |¡\}}t|dd	\}}|t t|ƒ¡|d d d …f }|S )
NrC   r$   r9  r   r"   TF)r  Úenforce_sorted)r  )r„   rœ   Úlengths_to_masksr¸   rG  r=  rY   rI   rI  Úflatten_parametersr   Úcpur   r&   Úaranger  )	r5   r  rQ  rs   Úx_lensÚx_masksÚlayerÚpacked_xr5  r   r   r   r8   n  s   



 zReferenceEncoder.forwardrC   r9  r$   c                 C   s*   t |ƒD ]}| | d|  | d } q| S )z?Batch lengths after n convolution with fixed kernel/stride/pad.r$   rC   )rð   )r(   rC  rG   rN   Úpadr5  r   r   r   rG  ƒ  s   z,ReferenceEncoder.calculate_post_conv_lengthsc                 C   s6   t  |  ¡ ¡ | j¡ | jd |  ¡ ¡|  d¡k }|S )z"Batch of lengths to batch of masksr   rC   )r&   rX  ÚmaxrÞ   rM   rÃ   rI   rœ   )r(   Úmasksr   r   r   rU  Š  s   ÿþz!ReferenceEncoder.lengths_to_masks)rC   r9  r$   rC   )r9   r:   r;   r<   r4   r7  r-  r1  r8   ÚstaticmethodrG  rU  r@   r   r   r6   r   r@  F  s    

r@  c                       sD   e Zd ZdZ	d‡ fdd„	Zedd„ ƒZed	d
„ ƒZdd„ Z‡  Z	S )ÚGlobalStyleTokenz4
    Global Style Token based Speaker Embedding
    r  r  r  c                    s(   t t| ƒ ¡  || _t|||d| _d S )N)r#  r$  r%  )r3   ra  r4   Úreference_encoderr  Ústyle_attention)r5   rb  r#  r$  r%  r6   r   r   r4   ™  s
   ÿzGlobalStyleToken.__init__c                 C   rN  )NrO  r(  )ÚinpÚinp_lengthsrR  r	  r   r   r   r-  ¢  rS  zGlobalStyleToken.input_typesc                 C   r.  )NÚgstr'  r0  r	  r   r   r   r1  ©  r2  zGlobalStyleToken.output_typesc                 C   s   |   ||¡}|  |¡}|S r2   )rb  rc  )r5   rd  re  Ústyle_embeddingrf  r   r   r   r8   ¯  s   
zGlobalStyleToken.forwardr6  )
r9   r:   r;   r<   r4   r7  r-  r1  r8   r@   r   r   r6   r   ra  ”  s    ÿ	

ra  c                       s(   e Zd ZdZ‡ fdd„Zdd„ Z‡  ZS )ÚSpeakerLookupTablez-
    LookupTable based Speaker Embedding
    c                    s"   t t| ƒ ¡  tj ||¡| _d S r2   )r3   rh  r4   r&   ri   Ú	EmbeddingÚtable)r5   Ú
n_speakersr™   r6   r   r   r4   º  s   zSpeakerLookupTable.__init__c                 C   rr   r2   )rj  )r5   Úspeakerr   r   r   r8   ¾  rt   zSpeakerLookupTable.forwardr  r   r   r6   r   rh  µ  s    rh  c                       sL   e Zd ZdZd‡ fdd„	Zedd„ ƒZedd„ ƒZd	d
„ Zddd„Z	‡  Z
S )ÚSpeakerEncoderz¶
    class SpeakerEncoder represents speakers representation. 
    This module can combine GST (global style token) based speaker embeddings and lookup table speaker embeddings.
    Nc                    sD   t t| ƒ ¡  || _|| _|durtj t |¡¡| _	dS d| _	dS )a  
        lookup_module: Torch module to get lookup based speaker embedding
        gst_module: Neural module to get GST based speaker embedding
        precomputed_embedding_dim: Give precomputed speaker embedding dimension to use precompute speaker embedding
        N)
r3   rm  r4   Úlookup_moduleÚ
gst_moduler&   ri   r  ÚemptyÚprecomputed_emb)r5   rn  ro  Úprecomputed_embedding_dimr6   r   r   r4   È  s   
zSpeakerEncoder.__init__c                 C   s8   t ddt dtƒ ddt dtƒ ddt dtƒ dddœS )NTr*  r(  rO  )râ   rl  Úreference_specÚreference_spec_lens)r   r   r   r   r	  r   r   r   r-  Û  s
   üzSpeakerEncoder.input_typesc                 C   r.  )NÚembsr'  r0  r	  r   r   r   r1  ä  r2  zSpeakerEncoder.output_typesc                 C   s   t j |¡| _d S r2   )r&   ri   r  rq  )r5   Úembr   r   r   Úoverwrite_precomputed_embê  s   z(SpeakerEncoder.overwrite_precomputed_embc                 C   sŽ   d }| j d ur| j  d¡ |d¡S | jd ur|d ur|  |¡}|d urE|d urE| jd ur@|  ||¡}|d u r:|}|S || }|S t d¡ |S )Nr   rC   zCYou may add `gst_module` in speaker_encoder to use reference_audio.)rq  rœ   r  rn  ro  r   Úwarning)r5   râ   rl  rs  rt  ru  r+   r   r   r   r8   í  s   


ü
þzSpeakerEncoder.forward)NNN)NNNN)r9   r:   r;   r<   r4   r7  r-  r1  rw  r8   r@   r   r   r6   r   rm  Â  s    

rm  )r   )4Útypingr   r&   r   Útorch.autogradr   Útorch.nnr   rT   Útorch.nn.utils.rnnr   r   Únemo.core.classesr   r	   Únemo.core.neural_types.elementsr
   r   r   r   Ú"nemo.core.neural_types.neural_typer   Ú
nemo.utilsr   r   r   r>   r,   ri   ÚInstanceNorm1dr-   rx   rA   ÚModulerd   ÚAdapterModuleMixinrv   r‡   r‘   r¯   rÑ   rÒ   ræ   r;  rþ   r  r  r8  r@  ra  rh  rm  r   r   r   r   Ú<module>   s\   ÿÿÿÿÿÿÿ
þ1==0G'.(N!