o
    }o™i´  ã                   @   s2  d dl Z d dlZd dlmZ d dlmZ d dlmZmZm	Z	 d dl
mZmZmZmZ d dlmZ d dlmZmZmZmZ d dlmZ dZejjd	d
„ ƒZG dd„ dejƒZG dd„ dejƒZG dd„ dejƒZG dd„ dejjƒZ G dd„ dejƒZ!G dd„ dejƒZ"G dd„ dejƒZ#G dd„ dejƒZ$G dd„ dejƒZ%G dd„ dejƒZ&G dd „ d ejƒZ'G d!d"„ d"ejƒZ(G d#d$„ d$ejƒZ)G d%d&„ d&ejƒZ*G d'd(„ d(ejjƒZ+G d)d*„ d*ejjƒZ,G d+d,„ d,ejjƒZ-G d-d.„ d.ejjƒZ.G d/d0„ d0ejƒZ/G d1d2„ d2ejƒZ0G d3d4„ d4ejƒZ1G d5d6„ d6ejƒZ2dS )7é    N)Ú
functional)Úremove_weight_normÚspectral_normÚweight_norm)Ú	ResBlock1Ú	ResBlock2Úget_paddingÚinit_weights)Úmaximum_path)Úconvert_pad_shapeÚgenerate_pathÚget_mask_from_lengthsÚrand_slice_segments)Ú&piecewise_rational_quadratic_transformgš™™™™™¹?c                 C   s\   |d }| | }t  |d d …d |…d d …f ¡}t  |d d …|d …d d …f ¡}|| }|S ©Nr   )ÚtorchÚtanhÚsigmoid)Úinput_aÚinput_bÚ
n_channelsÚn_channels_intÚin_actÚt_actÚs_actÚacts© r   ú]/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/tts/modules/vits_modules.pyÚfused_add_tanh_sigmoid_multiply9   s     r   c                       ó&   e Zd Zd‡ fdd„	Zdd„ Z‡  ZS )Ú	LayerNormçñhãˆµøä>c                    s>   t ƒ  ¡  || _|| _t t |¡¡| _t t 	|¡¡| _
d S ©N)ÚsuperÚ__init__ÚchannelsÚepsÚnnÚ	Parameterr   ÚonesÚgammaÚzerosÚbeta)Úselfr%   r&   ©Ú	__class__r   r   r$   D   s
   
zLayerNorm.__init__c                 C   s4   |  dd¡}t || jf| j| j| j¡}|  dd¡S ©Né   éÿÿÿÿ)Ú	transposeÚFÚ
layer_normr%   r*   r,   r&   )r-   Úxr   r   r   ÚforwardL   s   zLayerNorm.forward)r!   ©Ú__name__Ú
__module__Ú__qualname__r$   r7   Ú__classcell__r   r   r.   r   r    C   ó    r    c                       ó$   e Zd Z‡ fdd„Zdd„ Z‡  ZS )ÚConvReluNormc              	      s  t ƒ  ¡  || _|| _|| _|| _|| _|| _|dksJ dƒ‚t 	¡ | _
t 	¡ | _| j
 tj||||d d¡ | j t|ƒ¡ t t ¡ t |¡¡| _t|d ƒD ]}| j
 tj||||d d¡ | j t|ƒ¡ qRt ||d¡| _| jjj ¡  | jjj ¡  d S )Nr1   z)Number of layers should be larger than 0.é   ©Úpadding)r#   r$   Úin_channelsÚhidden_channelsÚout_channelsÚkernel_sizeÚn_layersÚ	p_dropoutr'   Ú
ModuleListÚconv_layersÚnorm_layersÚappendÚConv1dr    Ú
SequentialÚReLUÚDropoutÚ	relu_dropÚrangeÚprojÚweightÚdataÚzero_Úbias)r-   rC   rD   rE   rF   rG   rH   Ú_r.   r   r   r$   S   s&   


zConvReluNorm.__init__c                 C   sT   |}t | jƒD ]}| j| || ƒ}| j| |ƒ}|  |¡}q||  |¡ }|| S r"   )rR   rG   rJ   rK   rQ   rS   )r-   r6   Úx_maskÚx_orgÚir   r   r   r7   i   s   zConvReluNorm.forwardr8   r   r   r.   r   r?   R   s    r?   c                       s,   e Zd ZdZd‡ fdd„	Zd	dd„Z‡  ZS )
ÚDDSConvz1
    Dilated and Depth-Separable Convolution
    ç        c                    sÔ   t ƒ  ¡  || _|| _|| _|| _t |¡| _t 	¡ | _
t 	¡ | _t 	¡ | _t 	¡ | _t|ƒD ]8}|| }|| | d }| j
 tj||||||d¡ | j t ||d¡¡ | j t|ƒ¡ | j t|ƒ¡ q/d S )Nr@   )ÚgroupsÚdilationrB   r1   )r#   r$   r%   rF   rG   rH   r'   rP   ÚdroprI   Ú	convs_sepÚ	convs_1x1Únorms_1Únorms_2rR   rL   rM   r    )r-   r%   rF   rG   rH   r[   r_   rB   r.   r   r   r$   x   s(   




ÿøzDDSConv.__init__Nc                 C   sŠ   |d ur|| }t | jƒD ]3}| j| || ƒ}| j| |ƒ}t |¡}| j| |ƒ}| j| |ƒ}t |¡}|  |¡}|| }q|| S r"   )	rR   rG   ra   rc   r4   Úgelurb   rd   r`   )r-   r6   rY   Úgr[   Úyr   r   r   r7   Ž   s   



zDDSConv.forward)r]   r"   )r9   r:   r;   Ú__doc__r$   r7   r<   r   r   r.   r   r\   s   s    r\   c                       s0   e Zd Zd	‡ fdd„	Zd
dd„Zdd„ Z‡  ZS )ÚWNr   c                    sJ  t t| ƒ ¡  |d dksJ ‚|| _|f| _|| _|| _|| _|| _t	j
 ¡ | _t	j
 ¡ | _t
 |¡| _|dkrNt	j
 |d| | d¡}t	j
jj|dd| _t|ƒD ]P}|| }	t||	 |	 d ƒ}
t	j
j|d| ||	|
d}t	j
jj|dd}| j |¡ ||d k r‰d| }n|}t	j
 ||d¡}t	j
jj|dd}| j |¡ qRd S )Nr@   r1   r   rT   )Úname)r_   rB   )r#   ri   r$   rD   rF   Údilation_raterG   Úgin_channelsrH   r   r'   rI   Ú	in_layersÚres_skip_layersrP   r`   rM   Úutilsr   Ú
cond_layerrR   ÚintrL   )r-   rD   rF   rk   rG   rl   rH   rp   r[   r_   rB   Úin_layerÚres_skip_channelsÚres_skip_layerr.   r   r   r$   ž   s:   ÿ
ïzWN.__init__Nc                 K   s  t  |¡}t  | jg¡}|d ur|  |¡}t| jƒD ]n}| j| |ƒ}|d urA|d | j }	|d d …|	|	d| j  …d d …f }
nt  |¡}
t||
|ƒ}|  	|¡}| j
| |ƒ}|| jd k r„|d d …d | j…d d …f }|| | }||d d …| jd …d d …f  }q|| }q|| S )Nr@   r1   )r   Ú
zeros_likeÚ	IntTensorrD   rp   rR   rG   rm   r   r`   rn   )r-   r6   rY   rf   ÚkwargsÚoutputÚn_channels_tensorr[   Úx_inÚcond_offsetÚg_lr   Úres_skip_actsÚres_actsr   r   r   r7   Ã   s&   

&

"
z
WN.forwardc                 C   sR   | j dkrtjj | j¡ | jD ]	}tjj |¡ q| jD ]	}tjj |¡ qd S r   )rl   r   r'   ro   r   rp   rm   rn   ©r-   Úlr   r   r   r   Þ   s   


ÿzWN.remove_weight_norm)r   r   r"   ©r9   r:   r;   r$   r7   r   r<   r   r   r.   r   ri      s    
%ri   c                   @   s   e Zd Zddd„ZdS )ÚLogFc                 K   sF   |st  t  |d¡¡| }t  | ddg¡}||fS t  |¡| }|S )Nr!   r1   r@   )r   ÚlogÚ	clamp_minÚsumÚexp©r-   r6   rY   Úreverserw   rg   Úlogdetr   r   r   r7   è   s   zLog.forwardN©F©r9   r:   r;   r7   r   r   r   r   r‚   ç   s    r‚   c                   @   s   e Zd Zddœdd„ZdS )ÚFlipF)rˆ   c                O   s<   t  |dg¡}|st  | d¡¡j|j|jd}||fS |S )Nr1   r   ©ÚdtypeÚdevice)r   Úflipr+   ÚsizeÚtorŽ   r   )r-   r6   rˆ   Úargsrw   r‰   r   r   r   r7   ó   s
   zFlip.forwardNr‹   r   r   r   r   rŒ   ò   s    rŒ   c                       s&   e Zd Z‡ fdd„Zddd„Z‡  ZS )ÚElementwiseAffinec                    s<   t ƒ  ¡  || _t t |d¡¡| _t t |d¡¡| _d S )Nr1   )	r#   r$   r%   r'   r(   r   r+   ÚmÚlogs)r-   r%   r.   r   r   r$   ý   s   
zElementwiseAffine.__init__Fc                 K   s`   |s | j t | j¡|  }|| }t | j| ddg¡}||fS || j  t | j ¡ | }|S )Nr1   r@   )r•   r   r†   r–   r…   r‡   r   r   r   r7     s   zElementwiseAffine.forwardrŠ   r8   r   r   r.   r   r”   ü   s    r”   c                       s.   e Zd Z			d‡ fdd„	Zd	dd„Z‡  ZS )
ÚResidualCouplingLayerr   Fc	           	         s®   |d dks
J dƒ‚t ƒ  ¡  || _|| _|| _|| _|| _|d | _|| _t	 
| j|d¡| _t||||||d| _t	 
|| jd|  d¡| _| jjj ¡  | jjj ¡  d S )Nr@   r   z!channels should be divisible by 2r1   )rH   rl   )r#   r$   r%   rD   rF   rk   rG   Úhalf_channelsÚ	mean_onlyr'   rM   Úpreri   ÚencÚpostrT   rU   rV   rW   )	r-   r%   rD   rF   rk   rG   rH   rl   r™   r.   r   r   r$     s    

ÿzResidualCouplingLayer.__init__Nc                 C   sä   t  || jgd d¡\}}|  |¡| }| j|||d}|  |¡| }| js4t  || jgd d¡\}	}
n|}	t  |	¡}
|s\|	|t  |
¡ |  }t  	||gd¡}t  
|
ddg¡}||fS ||	 t  |
 ¡ | }t  	||gd¡}|S )Nr@   r1   ©rf   )r   Úsplitr˜   rš   r›   rœ   r™   ru   r†   Úcatr…   )r-   r6   rY   rf   rˆ   Úx0Úx1ÚhÚstatsr•   r–   r‰   r   r   r   r7   ,  s    
zResidualCouplingLayer.forward)r   r   F©NFr8   r   r   r.   r   r—     s    ÷r—   c                       ó(   e Zd Zd	‡ fdd„	Zd
dd„Z‡  ZS )ÚConvFlowé
   ç      @c                    sš   t ƒ  ¡  || _|| _|| _|| _|| _|| _|d | _t	 
| j|d¡| _t|||dd| _t	 
|| j|d d  d¡| _| jjj ¡  | jjj ¡  d S )Nr@   r1   r]   ©rH   é   )r#   r$   rC   Úfilter_channelsrF   rG   Únum_binsÚ
tail_boundr˜   r'   rM   rš   r\   ÚconvsrS   rT   rU   rV   rW   )r-   rC   r«   rF   rG   r¬   r­   r.   r   r   r$   C  s   

zConvFlow.__init__NFc              	   C   s  t  || jgd d¡\}}|  |¡}| j|||d}|  |¡| }|j\}}	}
| ||	d|
¡ dddd¡}|dd | j	…f t
 | j¡ }|d| j	d| j	 …f t
 | j¡ }|dd| j	 d …f }t|||||d| jd	\}}t  ||gd¡| }t  || ddg¡}|s‰||fS |S )
Nr@   r1   r   r2   r   rª   .Úlinear)ÚinverseÚtailsr­   )r   rž   r˜   rš   r®   rS   ÚshapeÚreshapeÚpermuter¬   ÚmathÚsqrtr«   r   r­   rŸ   r…   )r-   r6   rY   rf   rˆ   r    r¡   r¢   ÚbÚcÚtÚunnormalized_widthsÚunnormalized_heightsÚunnormalized_derivativesÚ	logabsdetr‰   r   r   r   r7   S  s.   
$
ù
zConvFlow.forward)r§   r¨   r¤   r8   r   r   r.   r   r¦   B  s    r¦   c                       s(   e Zd Zd
‡ fdd„	Zddd	„Z‡  ZS )ÚStochasticDurationPredictoré   r   c              	      sZ  t ƒ  ¡  |}|| _|| _|| _|| _|| _|| _tƒ | _	t
 ¡ | _| j tdƒ¡ t|ƒD ]}| j td||dd¡ | j tƒ ¡ q.t
 d|d¡| _t
 ||d¡| _t||d|d| _t
 ¡ | _| j tdƒ¡ tdƒD ]}| j td||dd¡ | j tƒ ¡ qnt
 ||d¡| _t
 ||d¡| _t||d|d| _|dkr«t
 ||d¡| _d S d S )Nr@   rª   )rG   r1   )rG   rH   r¿   r   )r#   r$   rC   r«   rF   rH   Ún_flowsrl   r‚   Úlog_flowr'   rI   ÚflowsrL   r”   rR   r¦   rŒ   rM   Úpost_preÚ	post_projr\   Ú
post_convsÚ
post_flowsrš   rS   r®   Úcond)r-   rC   r«   rF   rH   rÀ   rl   r[   r.   r   r   r$   s  s8   


ÿz$StochasticDurationPredictor.__init__NFç      ð?c                 C   s„  t  |¡}|  |¡}|d urt  |¡}||  |¡ }|  ||¡}|  |¡| }|sý| j}|d us2J ‚d}|  |¡}	|  |	|¡}	|  	|	¡| }	t  
| d¡d| d¡¡j|j|jd| }
|
}| jD ]}|||||	 d\}}||7 }qat  |ddgd¡\}}t  |¡| }|| | }|t  t |¡t | ¡ | ddg¡7 }t  dt dtj ¡|
d   | ddg¡| }d}|  ||¡\}}||7 }t  ||gd¡}|D ]}|||||d\}}|| }qÐt  dt dtj ¡|d   | ddg¡| }|| S tt| jƒƒ}|d d	… |d
 g }t  
| d¡d| d¡¡j|j|jd| }|D ]}|||||d}q't  |ddgd¡\}}|}|S )Nr   r@   ©r   rŽ   r   r1   ç      à¿©rf   rˆ   ç      à?éþÿÿÿr2   )r   Údetachrš   rÇ   r®   rS   rÂ   rÃ   rÅ   rÄ   Úrandnr‘   r’   r   rŽ   rÆ   rž   r   r…   r4   Ú
logsigmoidrµ   rƒ   ÚpirÁ   rŸ   ÚlistÚreversed)r-   r6   rY   Úwrf   rˆ   Únoise_scalerÂ   Úlogdet_tot_qÚh_wÚe_qÚz_qÚflowÚlogdet_qÚz_uÚz1ÚuÚz0ÚlogqÚ
logdet_totr‰   ÚzÚnllÚlogwr   r   r   r7   “  sR   



,

*0
0,z#StochasticDurationPredictor.forward©r¿   r   )NNFrÈ   r8   r   r   r.   r   r¾   r  s     r¾   c                       s(   e Zd Zd‡ fdd„	Zddd„Z‡  ZS )	ÚDurationPredictorr   c                    s¨   t ƒ  ¡  || _|| _|| _|| _|| _t |¡| _	tj
||||d d| _t|ƒ| _tj
||||d d| _t|ƒ| _t 
|dd¡| _|dkrRt 
||d¡| _d S d S )Nr@   rA   r1   r   )r#   r$   rC   r«   rF   rH   rl   r'   rP   r`   rM   Úconv_1r    Únorm_1Úconv_2Únorm_2rS   rÇ   )r-   rC   r«   rF   rH   rl   r.   r   r   r$   Æ  s   


ÿzDurationPredictor.__init__Nc                 C   s˜   t  |¡}|d urt  |¡}||  |¡ }|  || ¡}t  |¡}|  |¡}|  |¡}|  || ¡}t  |¡}|  |¡}|  |¡}|  	|| ¡}|| S r"   )
r   rÎ   rÇ   rç   Úrelurè   r`   ré   rê   rS   )r-   r6   rY   rf   r   r   r   r7   Ù  s   







zDurationPredictor.forward©r   r"   r8   r   r   r.   r   ræ   Å  s    ræ   c                       r>   )ÚTextEncoderc
           
         s   t ƒ  ¡  || _|| _|| _|| _|| _|| _|| _|| _	t
j|||	d| _t
j | jjd|d ¡ t||||||ƒ| _t
 ||d d¡| _d S )N)Úpadding_idxr]   rÊ   r@   r1   )r#   r$   Ún_vocabrE   rD   r«   Ún_headsrG   rF   rH   r'   Ú	EmbeddingÚembÚinitÚnormal_rT   ÚAttentionEncoderÚencoderrM   rS   )
r-   rï   rE   rD   r«   rð   rG   rF   rH   rî   r.   r   r   r$   ë  s   
zTextEncoder.__init__c                 C   s~   |   |¡t | j¡ }t |dd¡}t t||ƒd¡ |j	¡}|  
|| |¡}|  |¡| }tj|| jdd\}}||||fS )Nr1   r2   ©Údim)rò   rµ   r¶   rD   r   r3   Ú	unsqueezer   r’   rŽ   rö   rS   rž   rE   )r-   r6   Ú	x_lengthsrY   r£   r•   r–   r   r   r   r7     s   zTextEncoder.forwardr8   r   r   r.   r   rí   ê  s    rí   c                       r¥   )ÚResidualCouplingBlockr¿   r   c           	         s|   t ƒ  ¡  || _|| _|| _|| _|| _|| _|| _t	 
¡ | _t|ƒD ]}| j t||||||dd¡ | j tƒ ¡ q#d S )NT)rl   r™   )r#   r$   r%   rD   rF   rk   rG   rÀ   rl   r'   rI   rÂ   rR   rL   r—   rŒ   )	r-   r%   rD   rF   rk   rG   rÀ   rl   r[   r.   r   r   r$     s.   

ùÿôzResidualCouplingBlock.__init__NFc                 C   sL   |s| j D ]}|||||d\}}q|S t| j ƒD ]
}|||||d}q|S )NrË   )rÂ   rÓ   )r-   r6   rY   rf   rˆ   rÚ   rX   r   r   r   r7   -  s   
þzResidualCouplingBlock.forwardrå   r¤   r8   r   r   r.   r   rû     s    rû   c                       s*   e Zd Z	d‡ fdd„	Zddd„Z‡  ZS )	ÚPosteriorEncoderr   c                    sp   t ƒ  ¡  || _|| _|| _|| _|| _|| _|| _t	 
||d¡| _t|||||d| _t	 
||d d¡| _d S )Nr1   ©rl   r@   )r#   r$   rC   rE   rD   rF   rk   rG   rl   r'   rM   rš   ri   r›   rS   )r-   rC   rE   rD   rF   rk   rG   rl   r.   r   r   r$   8  s   
zPosteriorEncoder.__init__Nc           	      C   sŽ   t  t||ƒd¡ |j¡j|jd}|  |¡| }| j|||d}|  |¡| }t j	|| j
dd\}}|t  |¡t  |¡  | }||||fS )Nr1   )r   r   r÷   )r   rù   r   r’   rŽ   r   rš   r›   rS   rž   rE   Ú
randn_liker†   )	r-   r6   rú   rf   rY   r£   r•   r–   râ   r   r   r   r7   H  s   $zPosteriorEncoder.forwardrì   r"   r8   r   r   r.   r   rü   7  s    ÿrü   c                       s2   e Zd Z	d	‡ fdd„	Zd
dd„Zdd„ Z‡  ZS )Ú	Generatorr   c	                    sL  t t| ƒ ¡  t|ƒ| _t|ƒ| _tj||dddd| _|dkr"t	nt
}t ¡ | _tt||ƒƒD ]$\}	\}
}| j ttj|d|	  |d|	d   ||
||
 d dƒ¡ q0t ¡ | _tt| jƒƒD ]"}	|d|	d   }tt||ƒƒD ]\}\}}| j ||||ƒ¡ qrqatj|dddddd| _| j t¡ |d	kr¤t ||d¡| _d S d S )
Né   r1   rª   rA   Ú1r@   F)rB   rW   r   )r#   rÿ   r$   ÚlenÚnum_kernelsÚnum_upsamplesr'   rM   Úconv_prer   r   rI   ÚupsÚ	enumerateÚziprL   r   ÚConvTranspose1dÚ	resblocksrR   Ú	conv_postÚapplyr	   rÇ   )r-   Úinitial_channelÚresblockÚresblock_kernel_sizesÚresblock_dilation_sizesÚupsample_ratesÚupsample_initial_channelÚupsample_kernel_sizesrl   r[   rÞ   ÚkÚchÚjÚdr.   r   r   r$   S  s:   




ûÿÿ
ÿÿzGenerator.__init__Nc                 C   s¸   |   |¡}|d ur||  |¡ }t| jƒD ]5}t |t¡}| j| |ƒ}tj	|j
|j|jd}t| jƒD ]}|| j|| j |  |ƒ7 }q4|| j }qt |¡}|  |¡}t |¡}|S )Nr   )r  rÇ   rR   r  r4   Ú
leaky_reluÚLRELU_SLOPEr  r   r+   r²   rŽ   r   r  r
  r  r   )r-   r6   rf   r[   Úxsr  r   r   r   r7   ~  s   



zGenerator.forwardc                 C   s4   t dƒ | jD ]}t|ƒ q| jD ]}| ¡  qd S )NzRemoving weight norm...)Úprintr  r   r
  r   r   r   r   r     s   



ÿzGenerator.remove_weight_normrì   r"   r   r   r   r.   r   rÿ   R  s
    
÷
+rÿ   c                       s&   e Zd Zd‡ fdd„	Zdd„ Z‡  ZS )	ÚDiscriminatorPé   rª   Fc                    s$  t t| ƒ ¡  || _|| _|dkrtnt}t |tj	dd|df|dft
|dƒdfdƒ|tj	dd|df|dft
|dƒdfdƒ|tj	dd|df|dft
|dƒdfdƒ|tj	dd|df|dft
|dƒdfdƒ|tj	dd|dfdt
|dƒdfdƒg¡| _t d	¡| _|tj	ddd
dddƒ| _d S )NFr1   é    r   rA   é€   i   é   ç333333Ó?)rª   r1   )r1   r   )r#   r  r$   ÚperiodÚuse_spectral_normr   r   r'   rI   ÚConv2dr   r®   rP   Údropoutr  )r-   r"  rF   Ústrider#  Únorm_fr.   r   r   r$   ™  s   (((($ûÿ	zDiscriminatorP.__init__c                 C   s¾   g }|j \}}}|| j dkr$| j|| j  }t |d|fd¡}|| }| |||| j | j¡}| jD ]}||ƒ}|  |¡}t |t¡}| 	|¡ q3|  
|¡}| 	|¡ t |dd¡}||fS )Nr   Úreflectr1   r2   )r²   r"  r4   ÚpadÚviewr®   r%  r  r  rL   r  r   Úflatten)r-   r6   Úfmapr·   r¸   r¹   Ún_padr€   r   r   r   r7   ª  s    



zDiscriminatorP.forward)r  rª   Fr8   r   r   r.   r   r  ˜  s    r  c                       r   )ÚDiscriminatorSFc                    sà   t t| ƒ ¡  |dkrtnt}t |tjddddddƒ|tjdddd	d	d
dƒ|tjdddd	dd
dƒ|tjdddd	dd
dƒ|tjdddd	dd
dƒ|tjddddddƒg¡| _t 	d¡| _
|tjddddddƒ| _d S )NFr1   é   é   r   rA   é@   é)   r¿   é   )r^   rB   é   r   r  r@   r!  rª   )r#   r.  r$   r   r   r'   rI   rM   r®   rP   r%  r  )r-   r#  r'  r.   r   r   r$   Â  s   úÿ
zDiscriminatorS.__init__c                 C   sX   g }| j D ]}||ƒ}t |t¡}| |¡ q|  |¡}| |¡ t |dd¡}||fS r0   )r®   r4   r  r  rL   r  r   r+  )r-   r6   r,  r€   r   r   r   r7   Ò  s   


zDiscriminatorS.forwardrŠ   r8   r   r   r.   r   r.  Á  s    r.  c                       r   )ÚMultiPeriodDiscriminatorFc                    sH   t t| ƒ ¡  g d¢}tˆ dg}|‡ fdd„|D ƒ }t |¡| _d S )N)r@   rª   r  r   é   ©r#  c                    s   g | ]}t |ˆ d ‘qS )r7  )r  )Ú.0r[   r7  r   r   Ú
<listcomp>æ  s    z5MultiPeriodDiscriminator.__init__.<locals>.<listcomp>)r#   r5  r$   r.  r'   rI   Údiscriminators)r-   r#  ÚperiodsÚdiscsr.   r7  r   r$   á  s
   z!MultiPeriodDiscriminator.__init__c                 C   sp   g }g }g }g }t | jƒD ]$\}}||ƒ\}	}
||ƒ\}}| |	¡ | |¡ | |
¡ | |¡ q||||fS r"   )r  r:  rL   )r-   rg   Úy_hatÚy_d_rsÚy_d_gsÚfmap_rsÚfmap_gsr[   r  Úy_d_rÚfmap_rÚy_d_gÚfmap_gr   r   r   r7   é  s   


z MultiPeriodDiscriminator.forwardrŠ   r8   r   r   r.   r   r5  à  r=   r5  c                       sD   e Zd ZdZ			d‡ fdd„	Zddd„Zddd„Zdd„ Z‡  ZS )ÚSynthesizerTrnz"
    Synthesizer for Training
    r   Tc              
      s2  t ƒ  ¡  || _|| _|| _|| _|| _|| _|| _|	| _	|
| _
|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _t|||||||	|
|ƒ	| _t||||||||d| _t|||ddd|d| _t||ddd|d| _|r€t|dddd|d| _n
t |d	dd|d| _|dkr—t! "||¡| _#d S d S )
Nrý   r  r1   r/  r¿   éÀ   rª   rÌ   r4  )$r#   r$   rï   Úspec_channelsÚinter_channelsrD   r«   rð   rG   rF   rH   rî   r  r  r  r  r  r  Úsegment_sizeÚ
n_speakersrl   Úuse_sdprí   Úenc_prÿ   Údecrü   Úenc_qrû   rÚ   r¾   Údpræ   r'   rñ   Úemb_g)r-   rï   rH  rJ  rI  rD   r«   rð   rG   rF   rH   rî   r  r  r  r  r  r  rK  rl   rL  rw   r.   r   r   r$   þ  sh   
÷ø
ÿÿzSynthesizerTrn.__init__Nc                 C   sD  |   ||¡\}}}}	| jdkr|  |¡ d¡}
nd }
| j|||
d\}}}}| j|||
d}t ¡ l t d| ¡}tj	dt
 dt
j ¡ | dgdd}t d|d  dd¡ |¡}t | dd¡|| ¡}tj	d|d  | dgdd}|| | | }t |	d¡t |d¡ }t|| d¡ƒ d¡ ¡ }W d   ƒ n1 s¡w   Y  | 	d¡}| jr¿| j||	||
d}|t 	|	¡ }n"t |d	 ¡|	 }| j||	|
d}t 	|| d ddg¡t 	|	¡ }t | d¡| dd¡¡ dd¡}t | d¡| dd¡¡ dd¡}t||| jƒ\}}| j||
d}|||||	|||||||ffS )
Nr1   r2   r   rÍ   rÊ   r@   T)Úkeepdimgíµ ÷Æ°>)rM  rK  rQ  rù   rO  rÚ   r   Úno_gradr†   r…   rµ   rƒ   rÑ   Úmatmulr3   r
   ÚsqueezerÎ   rL  rP  r   rJ  rN  )r-   ÚtextÚtext_lenÚspecÚspec_lenÚspeakersr6   Ú
mean_priorÚlogscale_priorÚ	text_maskrf   râ   Úmean_posteriorÚlogscale_posteriorÚ	spec_maskÚz_pÚs_p_sq_rÚ	neg_cent1Ú	neg_cent2Ú	neg_cent3Ú	neg_cent4Úneg_centÚ	attn_maskÚattnrÔ   Úl_lengthÚlogw_rä   Úz_sliceÚ	ids_sliceÚaudior   r   r   r7   P  sV   

&ÿÿò
"ÿÿùzSynthesizerTrn.forwardr1   rÈ   c                 C   sŠ  |   ||¡\}}	}
}| jdkr|d ur|  |¡ d¡}nd }| jr,| j|||d|d}n| j|||d}t |¡| | }t |¡}t 	t 
|ddg¡d¡ ¡ }t t|d ƒd¡ |j¡}t |d¡t |d¡ }t||ƒ}t | d¡|	 dd¡¡ dd¡}	t | d¡|
 dd¡¡ dd¡}
|	t |	¡t |
¡ |  }| j|||dd}| j|| d d …d d …d |…f |d}||||||	|
ffS )Nr1   r2   T)rf   rˆ   rÕ   r   r@   rË   )rM  rK  rQ  rù   rL  rP  r   r†   Úceilr„   r…   Úlongr   r’   rŽ   r   rT  rU  r3   rþ   rÚ   rN  )r-   rV  rW  rZ  rÕ   Úlength_scaleÚnoise_scale_wÚmax_lenr6   r[  r\  r]  rf   rä   rÔ   Úw_ceilÚaudio_lengthsÚ
audio_maskrh  ri  ra  râ   rn  r   r   r   Úinfer‡  s.   

ÿÿ(zSynthesizerTrn.inferc                 C   sŽ   | j dks	J dƒ‚|  |¡ d¡}|  |¡ d¡}| j|||d\}}}	}
| j||
|d}| j||
|dd}| j||
 |d}||
|||ffS )Nr1   z$n_speakers have to be larger than 1.r2   r   TrË   )rK  rQ  rù   rO  rÚ   rN  )r-   rg   Ú	y_lengthsÚspeaker_srcÚspeaker_tgtÚg_srcÚg_tgtrâ   Úm_qÚlogs_qÚy_maskra  Úz_hatÚo_hatr   r   r   Úvoice_conversion¦  s   zSynthesizerTrn.voice_conversion)r   r   Tr"   )Nr1   r1   rÈ   N)	r9   r:   r;   rh   r$   r7   rw  r‚  r<   r   r   r.   r   rF  ù  s    ë
R
7rF  c                       s,   e Zd Z			d‡ fdd„	Zdd„ Z‡  ZS )	rõ   r1   r]   r¿   c           
   
      sÐ   t ƒ  ¡  || _|| _|| _|| _|| _|| _|| _t	 
|¡| _t	 ¡ | _t	 ¡ | _t	 ¡ | _t	 ¡ | _t| jƒD ],}	| j t|||||d¡ | j t|ƒ¡ | j t|||||d¡ | j t|ƒ¡ q9d S )N)rH   Úwindow_sizer©   )r#   r$   rD   r«   rð   rG   rF   rH   rƒ  r'   rP   r`   rI   Úattn_layersÚnorm_layers_1Ú
ffn_layersÚnorm_layers_2rR   rL   ÚMultiHeadAttentionr    ÚFFN)
r-   rD   r«   rð   rG   rF   rH   rƒ  rw   rX   r.   r   r   r$   µ  s2   





ÿÿÿözAttentionEncoder.__init__c                 C   s’   |  d¡|  d¡ }|| }t| jƒD ]/}| j| |||ƒ}|  |¡}| j| || ƒ}| j| ||ƒ}|  |¡}| j| || ƒ}q|| }|S )Nr@   r2   )rù   rR   rG   r„  r`   r…  r†  r‡  )r-   r6   rY   rh  r[   rg   r   r   r   r7   Ú  s   

zAttentionEncoder.forward)r1   r]   r¿   r8   r   r   r.   r   rõ   ´  s    ø%rõ   c                       sn   e Zd Z						d‡ fdd„	Zddd„Zdd	d
„Zdd„ Zdd„ Zdd„ Zdd„ Z	dd„ Z
dd„ Z‡  ZS )rˆ  r]   NTFc
                    sœ  t ƒ  ¡  || dksJ ‚|| _|| _|| _|| _|| _|| _|| _|| _	|	| _
d | _|| | _t ||d¡| _t ||d¡| _t ||d¡| _t ||d¡| _t |¡| _|d ur‰|r^dn|}
| jd }t t |
|d d | j¡| ¡| _t t |
|d d | j¡| ¡| _tj | jj¡ tj | jj¡ tj | jj¡ |	rÌt ¡  | jj | jj¡ | jj | jj¡ W d   ƒ d S 1 sÅw   Y  d S d S )Nr   r1   rÊ   r@   ) r#   r$   r%   rE   rð   rH   rƒ  Úheads_shareÚblock_lengthÚproximal_biasÚproximal_initri  Ú
k_channelsr'   rM   Úconv_qÚconv_kÚconv_vÚconv_orP   r`   r(   r   rÏ   Ú	emb_rel_kÚ	emb_rel_vró   Úxavier_uniform_rT   rS  Úcopy_rW   )r-   r%   rE   rð   rH   rƒ  rŠ  r‹  rŒ  r  Ún_heads_relÚ
rel_stddevr.   r   r   r$   é  s@   


$$
"þÿzMultiHeadAttention.__init__c                 C   sD   |   |¡}|  |¡}|  |¡}| j||||d\}| _|  |¡}|S )N)Úmask)r  r  r‘  Ú	attentionri  r’  )r-   r6   r¸   rh  Úqr  Úvr   r   r   r7     s   



zMultiHeadAttention.forwardc                 C   sú  |  d¡|  d¡|  d¡|  d¡f\}}}}| || j| j|¡ dd¡}| || j| j|¡ dd¡}| || j| j|¡ dd¡}t |t | j¡ | dd¡¡}	| j	d ury||ks]J dƒ‚|  
| j|¡}
|  |t | j¡ |
¡}|  |¡}|	| }	| jr’||ks„J dƒ‚|	|  |¡j|	j|	jd	 }	|d urÁ|	 |dkd
¡}	| jd urÁ||ks«J dƒ‚t |	¡ | j ¡ | j¡}|	 |dkd
¡}	tj|	dd}|  |¡}t ||¡}| j	d urì|  |¡}|  
| j|¡}||  ||¡ }| dd¡ ¡  |||¡}||fS )Nr   r1   r@   rª   rÍ   r2   z8Relative attention is only available for self-attention.z3Proximal bias is only available for self-attention.rÉ   g     ˆÃÀz5Local attention is only available for self-attention.r÷   )r‘   r*  rð   rŽ  r3   r   rT  rµ   r¶   rƒ  Ú_get_relative_embeddingsr“  Ú_matmul_with_relative_keysÚ'_relative_position_to_absolute_positionrŒ  Ú_attention_bias_proximalr’   r   rŽ   Úmasked_fillr‹  Ú	ones_likeÚtriuÚtrilr4   Úsoftmaxr`   Ú'_absolute_position_to_relative_positionr”  Ú_matmul_with_relative_valuesÚ
contiguous)r-   ÚqueryÚkeyÚvaluer™  r·   r  Út_sÚt_tÚscoresÚkey_relative_embeddingsÚ
rel_logitsÚscores_localÚ
block_maskÚp_attnrx   Úrelative_weightsÚvalue_relative_embeddingsr   r   r   rš  "  s:   , 





zMultiHeadAttention.attentionc                 C   s   t  || d¡¡}|S )zU
        x: [b, h, l, m]
        y: [h or 1, m, d]
        ret: [b, h, l, d]
        r   )r   rT  rù   ©r-   r6   rg   Úretr   r   r   r§  C  s   z/MultiHeadAttention._matmul_with_relative_valuesc                 C   s   t  || d¡ dd¡¡}|S )zU
        x: [b, h, l, d]
        y: [h or 1, m, d]
        ret: [b, h, l, m]
        r   rÍ   r2   )r   rT  rù   r3   r¶  r   r   r   rž  L  s   z-MultiHeadAttention._matmul_with_relative_keysc                 C   s€   t || jd  dƒ}t | jd | dƒ}|d|  d }|dkr2t |tddg||gddggƒ¡}n|}|d d …||…f }|S )Nr1   r   r@   )Úmaxrƒ  r4   r)  r   )r-   Úrelative_embeddingsÚlengthÚ
pad_lengthÚslice_start_positionÚslice_end_positionÚpadded_relative_embeddingsÚused_relative_embeddingsr   r   r   r  U  s   ÿz+MultiHeadAttention._get_relative_embeddingsc              	   C   s¸   |  ¡ \}}}}t |tddgddgddgddggƒ¡}| |||d | g¡}t |tddgddgd|d ggƒ¡}| |||d d| d g¡dd…dd…d|…|d d…f }|S )z?
        x: [b, h, l, 2*l-1]
        ret: [b, h, l, l]
        r   r1   r@   N©r‘   r4   r)  r   r*  ©r-   r6   ÚbatchÚheadsrº  rX   Úx_flatÚx_finalr   r   r   rŸ  c  s   (&>z:MultiHeadAttention._relative_position_to_absolute_positionc              
   C   s´   |  ¡ \}}}}t |tddgddgddgd|d ggƒ¡}| |||d ||d   g¡}t |tddgddg|dggƒ¡}| |||d| g¡dd…dd…dd…dd…f }|S )z?
        x: [b, h, l, l]
        ret: [b, h, l, 2*l-1]
        r   r1   r@   NrÀ  rÁ  r   r   r   r¦  t  s   , "2z:MultiHeadAttention._absolute_position_to_relative_positionc              	   C   sJ   t j|t jd}t  |d¡t  |d¡ }t  t  t  t  |¡¡ d¡d¡S )zÈBias for self-attention to encourage attention to close positions.
        Args:
            length: an integer scalar.
        Returns:
            a Tensor with shape [1, 1, length, length]
        )rŽ   r   r1   )r   ÚarangeÚfloat32rù   Úlog1pÚabs)r-   rº  ÚrÚdiffr   r   r   r   ‚  s   "z+MultiHeadAttention._attention_bias_proximal)r]   NTNFFr"   )r9   r:   r;   r$   r7   rš  r§  rž  r  rŸ  r¦  r   r<   r   r   r.   r   rˆ  è  s     ö
/

!		rˆ  c                       s8   e Zd Z	d‡ fdd„	Zdd„ Zdd	„ Zd
d„ Z‡  ZS )r‰  r]   NFc                    sz   t ƒ  ¡  || _|| _|| _|| _|| _|| _|| _|r!| j	| _
n| j| _
t |||¡| _t |||¡| _t |¡| _d S r"   )r#   r$   rC   rE   r«   rF   rH   Ú
activationÚcausalÚ_causal_paddingrB   Ú_same_paddingr'   rM   rç   ré   rP   r`   )r-   rC   rE   r«   rF   rH   rÌ  rÍ  r.   r   r   r$     s   

zFFN.__init__c                 C   sb   |   |  || ¡¡}| jdkr|t d| ¡ }nt |¡}|  |¡}|  |  || ¡¡}|| S )Nre   g¬Zd;û?)rç   rB   rÌ  r   r   rë   r`   ré   )r-   r6   rY   r   r   r   r7   ¤  s   


zFFN.forwardc                 C   sF   | j dkr|S | j d }d}ddgddg||gg}t |t|ƒ¡}|S )Nr1   r   ©rF   r4   r)  r   ©r-   r6   Úpad_lÚpad_rrB   r   r   r   rÎ  ®  s   

zFFN._causal_paddingc                 C   sP   | j dkr|S | j d d }| j d }ddgddg||gg}t |t|ƒ¡}|S )Nr1   r@   r   rÐ  rÑ  r   r   r   rÏ  ·  s   

zFFN._same_padding)r]   NF)r9   r:   r;   r$   r7   rÎ  rÏ  r<   r   r   r.   r   r‰  Ž  s    ÿ
	r‰  )3rµ   r   Útorch.nnr'   r   r4   Útorch.nn.utilsr   r   r   Ú,nemo.collections.tts.modules.hifigan_modulesr   r   r   r	   Ú,nemo.collections.tts.modules.monotonic_alignr
   Ú(nemo.collections.tts.parts.utils.helpersr   r   r   r   Ú(nemo.collections.tts.parts.utils.splinesr   r  ÚjitÚscriptr   ÚModuler    r?   r\   ri   r‚   rŒ   r”   r—   r¦   r¾   ræ   rí   rû   rü   rÿ   r  r.  r5  rF  rõ   rˆ  r‰  r   r   r   r   Ú<module>   sH   $
	!*J
40S%)$F) <4 '