o
    }oiS                    @   s  d dl Z d dlZd dlmZmZ d dlmZmZmZm	Z	 d dl
Zd dlZd dlmZ d dlm  mZ d dlmZ d dlmZ d dlmZ d dlmZmZmZmZ d dlmZ d d	lm Z  d d
l!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' d dl(m)Z) d dl*m+Z+ zd dl,Z,dZ-W n e.y   dZ-Y nw zd dl/Z/dZ0W n e.y   dZ0Y nw dbde1de1de1fddZ2de	e1e1f de	e1e1f de	e1e1f fddZ3de1de1de1fddZ4de1de1de	e1e1f fddZ5G dd de Z6G dd de Z7d d! Z8dcd"e9d#e9fd$d%Z:G d&d' d'e Z;G d(d) d)e Z<G d*d+ d+e Z=G d,d- d-e Z>G d.d/ d/ej?Z@G d0d1 d1e ZAG d2d3 d3e ZBG d4d5 d5e ZCG d6d7 d7e ZDG d8d9 d9e ZEG d:d; d;e ZFG d<d= d=e ZGG d>d? d?e ZHG d@dA dAe ZIG dBdC dCe ZJG dDdE dEe ZKG dFdG dGe eZLG dHdI dIeLZMG dJdK dKeLZNG dLdM dMe ZOG dNdO dOe ZPG dPdQ dQe ZQG dRdS dSe ZRG dTdU dUe ZSG dVdW dWe ZTG dXdY dYe ZUG dZd[ d[e ZVG d\d] d]e ZWG d^d_ d_e ZXG d`da dae ZYdS )d    N)ABCabstractmethod)IterableListOptionalTuple)	rearrange)	AutoModel)!AudioToMelSpectrogramPreprocessor)ClampActivation	HalfSnakeSnakemask_sequence_tensor)	typecheck)NeuralModule)AudioSignalEncodedRepresentationIndexLengthsTypeMelSpectrogramTypeVoidType)
NeuralType)loggingTF   kernel_sizedilationreturnc                 C   s   | | | d S )N    r   r   r   r   d/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/tts/modules/audio_codec_modules.pyget_padding9      r!   c                 C   s(   t | d |d t | d |d f}|S )Nr   r   )r!   )r   r   paddingsr   r   r    get_padding_2d=   s   $r$   stridec                 C   s   | | d d S )Nr   r   r   r   r%   r   r   r    get_down_sample_paddingB   r"   r'   c                 C   s$   | | d }| | d d }||fS )Nr   r   r   )r   r%   output_paddingpaddingr   r   r    get_up_sample_paddingF   s   r*   c                       s$   e Zd Z fddZdd Z  ZS )SSLModelc                       t    t|| _d S N)super__init__r	   from_pretrained	ssl_model)selfslm_model_name	__class__r   r    r/   M      
zSSLModel.__init__c                 O   s   | j |i |S r-   )r1   )r2   argskwargsr   r   r    forwardQ   r"   zSSLModel.forward__name__
__module____qualname__r/   r9   __classcell__r   r   r4   r    r+   L   s    r+   c                       s^   e Zd ZdZ							d fd	d
	Zdd Zedd Zedd Ze	 dd Z
  ZS )SLMDiscriminatora'  SLM Discriminator, as described in both the StyleTTS2 and Low Frame-Rate Speech Codec papers.

    Args:
        slm_model_name: Hugging Face Speech Language Models name.
        slm_sr: Speech Language Models input sampling rate.
        input_sr: Audio input sampling rate.
        slm_hidden: Speech Language Model hidden dim.
        slm_layers: Speech Language Model number of layers.
        initial_channel: discriminative head number of channels.
        use_spectral_norm: If True uses spectral normalization otherwise uses weight norm.

    microsoft/wavlm-base-plus>  "V        @   Fc           	         s   t    trtj||| _nd | _t|| _| j	  |dkr't
jjjjnt
jjj}|tj|| |dddd| _t|tj||d ddd|tj|d |d ddd|tj|d |d ddddg| _|tj|d dd	ddd| _d S )
NFr   r   )r)   r      )r   r)         )r.   r/   HAVE_TORCHAUDIO
torchaudio
transformsResampleresampler+   	slm_modelfreezetorchnnutilsparametrizationsweight_normspectral_normConv1dpre
ModuleListconvs	conv_post)	r2   r3   slm_srinput_sr
slm_hidden
slm_layersinitial_channeluse_spectral_normnorm_fr4   r   r    r/   c   s    



"zSLMDiscriminator.__init__c                 C   s   | j | |ddj}tj|ddddjddd}| |}g }| jD ]}||}t	
|d	}||d q%| |}t|dd}||fS )
NT)input_valuesoutput_hidden_statesr   dimr   )	start_dimend_dim皙?)rN   rM   hidden_statesrP   stack	transposeflattenrW   rY   F
leaky_reluappend	unsqueezerZ   )r2   xfmaplayerr   r   r    _forward   s    


zSLMDiscriminator._forwardc                 C      t dt t dt dS NBT_audio
audio_real	audio_genr   r   r2   r   r   r    input_types      

zSLMDiscriminator.input_typesc                 C   :   t dt gt dt gt dt ggt dt ggdS Nrz   CT_outrz   DT_layerr   scores_real
scores_gen
fmaps_real	fmaps_genr   r   r   r   r   r    output_types   
   zSLMDiscriminator.output_typesc                 C   s<   |  |\}}|  |\}}|dg|dg|g|gfS Nr   )rv   rr   )r2   r}   r~   y_d_rfmap_ry_d_gfmap_gr   r   r    r9      s    zSLMDiscriminator.forward)r@   rA   rB   rC   rD   rE   F)r;   r<   r=   __doc__r/   rv   propertyr   r   r   r9   r>   r   r   r4   r    r?   U   s"    %

r?   c                 C   s6   | | j ddd t| jdddd  }|S )z8
    Normalized to have zero mean and unit variance
    r   rd   rf   gHz>)meanrr   rP   sqrtvar)rb   normed_input_valuesr   r   r    zero_mean_unit_var_norm   s   r   pathmap_locationc                 K   s   t j| pt j| }|rtj| fd|i|S tr>t| d}tj|fd|i|W  d   S 1 s7w   Y  dS t	
d td)a  Like torch.load but can load from other locations (e.g. s3:// , gs://).

    Args:
        path: Any path or url supported by fsspec.
        map_location: torch.device or str.
        cache: If True, cache a remote file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to True.
        **kwargs: Keyword arguments forwarded to torch.load.

    Returns:
        Object stored in path.
    r   rbNzDCould not import fsspec. Loading a checkpoint link is not supported!zJfsspec is not installed but is necessary to download remote checkpoints !!)osr   isdirisfilerP   loadHAVE_FSSPECfsspecopenr   errorModuleNotFoundError)r   r   r8   is_localfr   r   r    load_fsspec   s   $
r   c                       &   e Zd Zd fdd	Zdd Z  ZS )PreEmphasis
ףp=
?c                    s:   t    || _| dt| j dgdd d S )Nfilter      ?r   )r.   r/   coefficientregister_bufferrP   FloatTensorrr   )r2   r   r4   r   r    r/      s   
*zPreEmphasis.__init__c                 C   sD   t | dks
J tjj|ddd}tjj|| j	dS )Nr   r   r   r   reflect)
lensizerP   rQ   
functionalpadrr   conv1dr   squeezer2   rs   r   r   r    r9      s   zPreEmphasis.forward)r   r:   r   r   r4   r    r      s    r   c                       r   )SELayer   c                    sT   t t|   td| _tt||| tjddt|| |t	 | _
d S )Nr   Tinplace)r.   r   r/   rQ   AdaptiveAvgPool2davg_pool
SequentialLinearReLUSigmoidfc)r2   channel	reductionr4   r   r    r/      s   

zSELayer.__init__c                 C   s@   |  \}}}}| |||}| |||dd}|| S r   )r   r   viewr   )r2   rs   bc_yr   r   r    r9      s   zSELayer.forward)r   r:   r   r   r4   r    r      s    
r   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )	SEBasicBlockr   Nr   c                    s~   t t|   tj||d|ddd| _t|| _tj||dddd| _t|| _	tj
dd| _t||| _|| _|| _d S )NrH   r   F)r   r%   r)   bias)r   r)   r   Tr   )r.   r   r/   rQ   Conv2dconv1BatchNorm2dbn1conv2bn2r   relur   se
downsampler%   )r2   inplanesplanesr%   r   r   r4   r   r    r/      s   
zSEBasicBlock.__init__c                 C   sj   |}|  |}| |}| |}| |}| |}| |}| jd ur*| |}||7 }| |}|S r-   )r   r   r   r   r   r   r   )r2   rs   residualoutr   r   r    r9     s   








zSEBasicBlock.forward)r   Nr   )r;   r<   r=   	expansionr/   r9   r>   r   r   r4   r    r      s    r   c                       s   e Zd ZdZddg dg ddddi ddd	d
ddddddddddddddddddddddddd dd!dd"d#d$ddd%dd&dddd'd(
f fd)d*	Zd+d, Zd9d.d/Zd0d1 Zd:d2d3Zd4d5 Z	d;d6e
fd7d8Z  ZS )<ResNetSpeakerEncoderzImplementation of the model H/ASP without batch normalization in speaker embedding. This model was proposed in: https://arxiv.org/abs/2009.14153
    Adapted from: https://github.com/clovaai/voxceleb_trainer
    rE      )rH   rG      rH   )    rE         ASPTfft_size
win_lengthi  
hop_length   frame_shift_msNframe_length_msstft_pad_moder   sample_raterA   rM   Fpreemphasisr   ref_level_db   do_sound_normdo_trim_silencetrim_db<   powerg      ?griffin_lim_itersnum_melsmel_fmin        g     @@ig      @g      ;)
mel_fmax	spec_gainsignal_normmin_level_dbsymmetric_normmax_norm	clip_norm
stats_pathdo_rms_normdb_levelc	              
      s  t t|   || _|| _|| _|| _|| _|| _t	j
d|d dddd| _t	jdd| _t	|d | _|d | _| t|d |d | _| jt|d |d dd| _| jt|d	 |d	 dd| _| jt|d |d dd| _t	|| _| jrtr| || _nd | _t| jd
 }	t	t	j|d |	 dddt	 t	dt	jd|d |	 ddt	j d	d| _!| jdkr|d |	 }
n| jdkr|d |	 d	 }
nt"dt	#|
|| _$| %  d S )Nr   r   rH   )r   r%   r)   Tr   )r   r   )r%   r   r   r   r   rd   SAPr   zUndefined encoder)&r.   r   r/   encoder_type	input_dim	log_inputuse_torch_specaudio_configproj_dimrQ   r   r   r   r   r   r   r   create_layerr   layer1layer2layer3layer4InstanceNorm1dinstancenormrI   get_torch_mel_spectrogram_class
torch_specintr   rV   BatchNorm1dSoftmax	attention
ValueErrorr   r   _init_layers)r2   r	  r  layersnum_filtersr  r
  r  r  outmap_sizeout_dimr4   r   r    r/     sD   '




zResNetSpeakerEncoder.__init__c                 C   s`   |   D ])}t|tjrtjj|jddd qt|tjr-tj|jd tj|j	d qd S )Nfan_outr   )modenonlinearityr   r   )
modules
isinstancerQ   r   initkaiming_normal_weightr   	constant_r   )r2   mr   r   r    r  r  s   z!ResNetSpeakerEncoder._init_layersr   c              	   C   s   d }|dks| j ||j kr&ttj| j ||j d|ddt||j }g }||| j ||| ||j | _ td|D ]}||| j | q>tj| S )Nr   F)r   r%   r   )r   r   rQ   r   r   r   rq   range)r2   blockr   blocksr%   r   r  r   r   r   r    r  z  s   
z!ResNetSpeakerEncoder.create_layerc                 G   s    t tj| }t j| |S r-   )rQ   	ParameterrP   r   r&  xavier_normal_)r2   r   r   r   r   r    new_parameter  s   z"ResNetSpeakerEncoder.new_parameterc                 C   sZ  | d | jr| |}| jr|d  }| |d}| |}| |}| 	|}| 
|}| |}| |}| |}|| d d| d }| |}| jdkrdtj|| dd}n,| jdkrtj|| dd}ttj|d | dd|d  jd	d
}t||fd}|| d d}| |}|rtjjj|ddd}|S )a{  Forward pass of the model.

        Args:
            x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True`
                to compute the spectrogram on-the-fly.
            l2_norm (bool): Whether to L2-normalize the outputs.

        Shapes:
            - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})`
        r   gư>r   rf   r  r   rd   r   gh㈵>)min)pre   )squeeze_r  r  r
  logr  rr   r   r   r   r  r  r  r  reshaper   r  r  rP   sumr   clampcatr   r   rQ   r   	normalize)r2   rs   l2_normwmusgr   r   r    r9     s4   











,
zResNetSpeakerEncoder.forwardc                 C   s@   t jt|d tjj|d |d |d |d t j|d dS )Nr   r   r   r   r   r   )r   n_fftr   r   	window_fnn_mels)rP   rQ   r   r   rJ   rK   MelSpectrogramhamming_window)r2   r  r   r   r    r    s   
z4ResNetSpeakerEncoder.get_torch_mel_spectrogram_classcheckpoint_pathc                 C   s(   t |tdd}| j|d |d d S )Ncpu)r   model)strict)r   rP   deviceload_state_dict)r2   rC  rF  stater   r   r    load_checkpoint  s   z$ResNetSpeakerEncoder.load_checkpointr   )F)T)r;   r<   r=   r   r/   r  r  r0  r9   r  strrJ  r>   r   r   r4   r    r     s|    	
V

/r   c                       s4   e Zd ZdZd
dedef fddZdd	 Z  ZS )CodecActivationz
    Choose between activation based on the input parameter.

    Args:
        activation: Name of activation to use. Valid options are "elu" (default), "lrelu", and "snake".
        channels: Input dimension.
    elur   
activationchannelsc                    sz   t    | }|dkrt | _d S |dkr tj | _d S |dkr+t|| _d S |dkr6t	|| _d S t
d| )NrN  lrelusnake
half_snakezUnknown activation )r.   r/   lowerrQ   ELUrO  rP   	LeakyReLUr   r   r  )r2   rO  rP  r4   r   r    r/     s   
zCodecActivation.__init__c                 C   
   |  |S r-   )rO  r   r   r   r    r9     s   
zCodecActivation.forward)rN  r   )	r;   r<   r=   r   rL  r  r/   r9   r>   r   r   r4   r    rM    s    rM  c                       s\   e Zd ZdZ				ddedededed	ed
ef fddZdd Zdd Zdd Z  Z	S )CausalConvTranspose1dNormz1ConvTranspose1d causal padding and normalization.r   NTin_channelsout_channelsr   r%   groupstrim_right_ratioc           	         s   t    || _|d u r|n|}tj||||||d| _| jjd }| jjd }|| }t	|| j | _
|| j
 | _tjj| j| _d S )N)r[  r   r   )r.   r/   r\  rQ   ConvTranspose1dconvr   r%   mathceilpadding_rightpadding_leftrR   rS   rT   )	r2   rY  rZ  r   r%   r[  r\  r   padding_totalr4   r   r    r/     s   

z"CausalConvTranspose1dNorm.__init__c                 C   s0   t jjj}tt jjdrt jjj}|| j d S )NrT   )rQ   rR   rS   rT   hasattrr^  )r2   rT   r   r   r    apply_weight_norm  s   

z+CausalConvTranspose1dNorm.apply_weight_normc                 C      t j| j d S r-   rQ   rR   remove_weight_normr^  r   r   r   r    rh       z,CausalConvTranspose1dNorm.remove_weight_normc                 C   s:   |  |}|jd | j }|d| j|f }t||}|S )Nrf   .)r^  shapera  rb  r   )r2   inputs	input_lenrk   endr   r   r    r9     s
   

z!CausalConvTranspose1dNorm.forward)r   Nr   T)
r;   r<   r=   r   r  r/   re  rh  r9   r>   r   r   r4   r    rX    s,    rX  c                       s   e Zd ZdZ						d dededed	ed
ededededef fddZdd Zde	j
de	j
fddZed!de	j
deeef dedefddZdd Z  ZS )"CausalConv1dNormz-Conv1d with causal padding and normalization.r   zerosconstantTrY  rZ  r   r%   r   r[  pad_modeextra_pad_moder   c
           
   
      s   t    || _|dkr|dkrtd| d| d| d tj|||||||	|d| _| jjd }tj	| jj
d tjd}| jjd }tj	|d | d tjd}| jd	|d
d | jd|d
d | jdtj	|| tjdd
d tjj| j| _d S )Nr   zTCausalConv1dNorm has been initialized with stride > 1 and dilation > 1 (kernel_size=z stride=z, dilation=z).)r   r[  r   padding_moder   dtyper%   F)
persistentr   rc  )r.   r/   rr  printrQ   rV   r^  r   rP   tensorr%   int64r   r   rR   rS   rT   )
r2   rY  rZ  r   r%   r   r[  rq  rr  r   r4   r   r    r/   %  s<   
 zCausalConv1dNorm.__init__c                 C   rf  r-   rg  r   r   r   r    rh  T  ri  z#CausalConv1dNorm.remove_weight_normrk   r   c                 C   sX   |j d }|| j | j | j d }t|tjd }|| j | j | j }|| S )zSee `pad_for_conv1d`.rf   r   )rj  r   rc  r%   rP   r`  tory  )r2   rk   lengthn_framesideal_lengthr   r   r    _get_extra_padding_for_conv1dX  s
   
z.CausalConv1dNorm._get_extra_padding_for_conv1dzeror   r#   r"  valuec                 C   s   | j d }|\}}|dkstj| |||S t||}d}||kr0|| d }tj| d|f} tj| |||}	|	j d | }
|	dd|
f S )zTiny wrapper around torch.nn.functional.pad, just to allow for reflect padding on small input.
        If this is the case, we insert extra 0 padding to the right before the reflection happens.
        rf   r   r   r   .N)rj  rQ   r   r   max)rk   r#   r"  r  r{  rb  ra  max_pad	extra_padpaddedrm  r   r   r    _pad1dd  s   

zCausalConv1dNorm._pad1dc                 C   s:   |  |}| j|| j|f| jd}| |}t||}|S )N)r"  )r~  r  rc  rr  r^  r   )r2   rk  rl  extra_paddingrk   r   r   r    r9   x  s
   


zCausalConv1dNorm.forward)r   r   r   ro  rp  T)r  r   )r;   r<   r=   r   r  rL  boolr/   rh  rP   Tensorr~  staticmethodr   floatr  r9   r>   r   r   r4   r    rn  "  sH    	
/
(rn  c                       sv   e Zd Z				ddededededed	ee d
ef fddZedd Zedd Z	dd Z
e dd Z  ZS )
Conv1dNormr   Nr   rY  rZ  r   r%   r   r)   rq  c           	   	      sF   t    |st||d}tj|||||||d}tjj|| _d S )Nr   )rY  rZ  r   r%   r)   r   rs  )	r.   r/   r!   rQ   rV   rR   rS   rT   r^  )	r2   rY  rZ  r   r%   r   r)   rq  r^  r4   r   r    r/     s   

	zConv1dNorm.__init__c                 C      t dt t tdt dS Nrz   r   Trz   rk  rl  r   r   tupler   r   r   r   r    r        
zConv1dNorm.input_typesc                 C      dt dt iS Nr   r  r   r   r   r   r    r        zConv1dNorm.output_typesc                 C   rf  r-   rg  r   r   r   r    rh    ri  zConv1dNorm.remove_weight_normc                 C      |  |}t||}|S r-   r^  r   r2   rk  rl  r   r   r   r    r9        

zConv1dNorm.forward)r   r   Nr   )r;   r<   r=   r  r   rL  r/   r   r   r   rh  r   r9   r>   r   r   r4   r    r    s6    

r  c                       sb   e Zd Zddededededef
 fddZed	d
 Zedd Zdd Ze	 dd Z
  ZS )ConvTranspose1dNormr   rY  rZ  r   r%   r[  c           	   
      sF   t    t||\}}tj||||||d|d}tjj|| _d S )Nro  )rY  rZ  r   r%   r)   r(   rs  r[  )	r.   r/   r*   rQ   r]  rR   rS   rT   r^  )	r2   rY  rZ  r   r%   r[  r)   r(   r^  r4   r   r    r/     s   

zConvTranspose1dNorm.__init__c                 C   r  r  r  r   r   r   r    r     r  zConvTranspose1dNorm.input_typesc                 C   r  r  r   r   r   r   r    r     r  z ConvTranspose1dNorm.output_typesc                 C   rf  r-   rg  r   r   r   r    rh    ri  z&ConvTranspose1dNorm.remove_weight_normc                 C   r  r-   r  r  r   r   r    r9     r  zConvTranspose1dNorm.forwardr   r   )r;   r<   r=   r  r/   r   r   r   rh  r   r9   r>   r   r   r4   r    r    s    $

r  c                       s~   e Zd Z		ddededeeef deeef deeef f
 fddZed	d
 Zedd Zdd Z	e
 dd Z  ZS )
Conv2dNormr  rY  rZ  r   r%   r   c              	      sT   t    t|t|ksJ t||}tj||||||dd}tjj|| _	d S )Nr   )rY  rZ  r   r%   r   r)   rs  )
r.   r/   r   r$   rQ   r   rR   rS   rT   r^  )r2   rY  rZ  r   r%   r   r)   r^  r4   r   r    r/     s   

	zConv2dNorm.__init__c                 C   r  )Nrk  rz   r   Hr  r   r   r   r   r    r     r  zConv2dNorm.input_typesc                 C   r  )Nr   r  r   r   r   r   r    r     r  zConv2dNorm.output_typesc                 C   rf  r-   rg  r   r   r   r    rh    ri  zConv2dNorm.remove_weight_normc                 C   rW  r-   )r^  )r2   rk  r   r   r    r9     s   
zConv2dNorm.forward)r  r  )r;   r<   r=   r  r   r/   r   r   r   rh  r   r9   r>   r   r   r4   r    r    s*    




r  c                       sH   e Zd ZdZd fdd	Zedd Zedd Ze d	d
 Z	  Z
S )PeriodDiscriminatora  
    Period discriminator introduced in HiFi-GAN https://arxiv.org/abs/2010.05646 which attempts to
    discriminate phase information by looking at equally spaced audio samples.

    Args:
        period: Spacing between audio sample inputs.
        lrelu_slope: Slope to use for activation. Leaky relu with slope of 0.1 or 0.2 is recommended for the
           stability of the feature matching loss.
    rj   c                    s   t    || _t|| _ttdddddtdddddtdddddtdddddtdddd	dg| _tdddd
| _	d S )Nr   r   )rF   r   )rH   r   r&   r   r   i   r  r  )
r.   r/   periodrQ   rV  rO  rX   r  conv_layersrZ   )r2   r  lrelu_sloper4   r   r    r/     s   
	zPeriodDiscriminator.__init__c                 C   r  Naudiory   r   r   r   r   r    r   !  r  zPeriodDiscriminator.input_typesc                 C      t dt t dt gdS )Nr   r   )scorert   r   r   r   r   r    r   '     
z PeriodDiscriminator.output_typesc           	      C   s   |j \}}t|d}|| j dkr&| j|| j  }t|d|fd}|| }||d|| j | j}g }| jD ]}||d}| |}|| q7| j	|d}|| t|d}||fS )NB T -> B 1 Tr   r   r   rk  B 1 T C -> B C T)
rj  r   r  ro   r   r   r  rO  rq   rZ   )	r2   r  
batch_sizetimer   n_padrt   r^  r  r   r   r    r9   .  s    






zPeriodDiscriminator.forward)rj   )r;   r<   r=   r   r/   r   r   r   r   r9   r>   r   r   r4   r    r    s    


r  c                       sR   e Zd ZdZddee f fddZedd Zed	d
 Z	e
 dd Z  ZS )MultiPeriodDiscriminatorz
    Wrapper class to aggregate results of multiple period discriminators.

    The periods are expected to be increasing prime numbers in order to maximize coverage and minimize overlap
    r   rH   rF         rj   periodsc                    (   t    t fdd|D | _d S )Nc                       g | ]}t | d qS ))r  r  )r  ).0r  r  r   r    
<listcomp>S      z5MultiPeriodDiscriminator.__init__.<locals>.<listcomp>r.   r/   rQ   rX   discriminators)r2   r  r  r4   r  r    r/   P     

z!MultiPeriodDiscriminator.__init__c                 C   rw   rx   r   r   r   r   r    r   V  r   z$MultiPeriodDiscriminator.input_typesc                 C   r   r   r   r   r   r   r    r   ]  r   z%MultiPeriodDiscriminator.output_typesc                 C   sl   g }g }g }g }| j D ]$}||d\}}	||d\}
}|| ||	 ||
 || q||||fS N)r  )r  rq   )r2   r}   r~   r   r   r   r   discriminator
score_real	fmap_real	score_genfmap_genr   r   r    r9   f  s   



z MultiPeriodDiscriminator.forward)r  rj   )r;   r<   r=   r   r   r  r/   r   r   r   r   r9   r>   r   r   r4   r    r  I  s    

r  c                       sR   e Zd ZdZddedef fddZedd	 Zed
d Z	e
 dd Z  ZS )DiscriminatorSTFTaA  
    Discriminator network from EnCodec for Complex STFT input, but without dilations.

    Args:
        filters: number of filters to use in Conv2d layers
        lrelu_slope: Slope to use for activations. Leaky relu with slope of 0.1 or 0.2 is recommended for the
           stability of the feature matching loss
    r   rj   filtersr  c                    sx   t    t|| _ttd|ddt||dddt||dddt||dddt||ddg| _t|ddd| _d S )Nr   )rH   	   r  )r   r   r&   )rH   rH   r   )	r.   r/   rQ   rV  rO  rX   r  r  rZ   )r2   r  r  r4   r   r    r/     s   
	zDiscriminatorSTFT.__init__c                 C   r  )Nspec)rz   r   T_specr   r   r   r   r   r    r     r  zDiscriminatorSTFT.input_typesc                 C   r  )Nrz   r   r  rz   r   r  r   )scoresrt   r   r   r   r   r    r     r  zDiscriminatorSTFT.output_typesc                 C   sZ   g }|}| j D ]}||d}| |}|| q| j|d}|| t|d}||fS )Nr  r  )r  rO  rq   rZ   r   )r2   r  rt   r   r^  r  r   r   r    r9     s   




zDiscriminatorSTFT.forward)r   rj   r;   r<   r=   r   r  r  r/   r   r   r   r   r9   r>   r   r   r4   r    r  w  s    	

r  c                       sd   e Zd ZdZdee deee  f fddZdd Ze	dd	 Z
e	d
d Ze dd Z  ZS )MultiBandDiscriminatorSTFTa  
    Multi-band STFT discriminator proposed in DAC (https://arxiv.org/abs/2306.06546).

    Computes the complex STFT for a given resolution and splits it into sub-bands,
    which are given to separate discriminator networks.

    Args:
        resolution: STFT resolution, provided as a tuple of 3 integers ordered (num_fft, hop_length, window_length)
        stft_bands: List of tuples, with each tuple having 2 float values (band_start, band_end).
            The floats are in the range [0, 1] representing the fraction of all stft bands.
            For example for n_fft=1024, the stft output has 513 dimensions.
            For band input [(0, 0.25), (0.25, 1.0)] it would use stft dimensions [0 through 127] and [128 through 512].
    
resolution
stft_bandsc                    sn   t    |\| _| _| _| dtj| jdd t	dd |D | _
| jd d   fdd|D | _d S )	NwindowF)periodicc                 S   s   g | ]}t  qS r   )r  r  r   r   r   r    r    s    z7MultiBandDiscriminatorSTFT.__init__.<locals>.<listcomp>r   r   c                    s,   g | ]}t |d    t |d   fqS )r   r   )r  r  bandn_stftr   r    r    s   , )r.   r/   r>  r   r   r   rP   hann_windowrQ   rX   r  r  )r2   r  r  r4   r  r    r/     s   
z#MultiBandDiscriminatorSTFT.__init__c              
   C   sF   t j|| j| j| j| jdddd}t|d}t j|j|j	gdd}|S )NT)r>  r   r   r  
normalizedcenterreturn_complexzB fft T -> B T fftr   rd   )
rP   stftr>  r   r   r  r   rl   realimag)r2   r  fftr   r   r   r    compute_stft  s   

z'MultiBandDiscriminatorSTFT.compute_stftc                 C   r  r  r   r   r   r   r    r     r  z&MultiBandDiscriminatorSTFT.input_typesc                 C   s    t dt gt dt ggdS )Nr  r  )scores_list
fmaps_listr   r   r   r   r    r     s   z'MultiBandDiscriminatorSTFT.output_typesc           
      C   s|   g }g }|  |}t| j| jD ])\}}|d d d d d d |d |d f }||d\}}	|| ||	 q||fS )Nr   r   )r  )r  zipr  r  rq   )
r2   r  r  	fmap_listr  r  disc	spec_bandr  rt   r   r   r    r9     s   
(
z"MultiBandDiscriminatorSTFT.forward)r;   r<   r=   r   r   r  r   r/   r  r   r   r   r   r9   r>   r   r   r4   r    r    s    "	

r  c                       s`   e Zd ZdZdeee  deee  f fddZedd Z	edd	 Z
e d
d Z  ZS ) MultiResolutionDiscriminatorSTFTa  
    Multi-resolution discriminator which creates a multi-band discriminator for each input resolution.

    Args:
        resolutions: List of STFT resolutions, each resolution provided as a tuple of 3 integers ordered
            (num_fft, hop_length, window_length)
        stft_bands: List of tuples, with each tuple having 2 float values (band_start, band_end).
            The floats are in the range [0, 1] representing the fraction of all stft bands.
            For example for n_fft=1024, the stft output has 513 dimensions.
            For band input [(0, 0.25), (0.25, 1.0)] it would use stft dimensions [0 through 127] and [128 through 512].
    resolutionsr  c                    r  )Nc                    r  ))r  r  )r  )r  r  r  r   r    r    r  z=MultiResolutionDiscriminatorSTFT.__init__.<locals>.<listcomp>r  )r2   r  r  r4   r  r    r/     r  z)MultiResolutionDiscriminatorSTFT.__init__c                 C   rw   rx   r   r   r   r   r    r     r   z,MultiResolutionDiscriminatorSTFT.input_typesc                 C   r   )Nr  r  r   r   r   r   r   r    r     r   z-MultiResolutionDiscriminatorSTFT.output_typesc                 C   sd   g }g }g }g }| j D ] }||d\}}	|| }||	 }||d\}
}||
 }|| }q||||fS r  r  )r2   r}   r~   r   r   r   r   r  score_real_ifmap_real_iscore_gen_i
fmap_gen_ir   r   r    r9     s   

z(MultiResolutionDiscriminatorSTFT.forward)r;   r<   r=   r   r   r   r  r/   r   r   r   r   r9   r>   r   r   r4   r    r    s    &

r  c                       sP   e Zd ZdZdee f fddZedd Zedd Z	e
 d	d
 Z  ZS )Discriminatorzd
    Wrapper class which takes a list of discriminators and aggregates the results across them.
    r  c                    r,   r-   r  )r2   r  r4   r   r    r/   /  r6   zDiscriminator.__init__c                 C   rw   rx   r   r   r   r   r    r   3  r   zDiscriminator.input_typesc                 C   r   r   r   r   r   r   r    r   :  r   zDiscriminator.output_typesc                 C   s\   g }g }g }g }| j D ]}|||d\}}	}
}||7 }||
7 }||	7 }||7 }q||||fS )Nr|   r  )r2   r}   r~   r   r   r   r   r  r  r  r  r  r   r   r    r9   C  s   

zDiscriminator.forward)r;   r<   r=   r   r   r   r/   r   r   r   r   r9   r>   r   r   r4   r    r  *  s    

r  c                
   @   s   e Zd Zedd Zedd Ze edej	dej	de
ej	ej	f fdd	Zeed
e eede ddede idedej	dej	dej	fddZeede eede dded
e idedej	dej	dej	fddZdS )VectorQuantizerBasec                 C   r  Nrz   r   r  rz   r  r   r   r  r   r   r   r   r    r   T  r  zVectorQuantizerBase.input_typesc                 C   s   t dt t dt dS )Nr  r   rz   r  )dequantizedindices)r   r   r   r   r   r   r    r   [  r   z VectorQuantizerBase.output_typesrk  rl  r   c                 C      d S r-   r   r2   rk  rl  r   r   r    r9   b  s   zVectorQuantizerBase.forwardr  rz   r  r  r  r   r   c                 C   r  r-   r   r  r   r   r    encodeg  s   	zVectorQuantizerBase.encoder  rl  r  c                 C   r  r-   r   )r2   r  rl  r   r   r    decoder  s   zVectorQuantizerBase.decodeN)r;   r<   r=   r   r   r   r   r   rP   r  r   r9   r   r   r  r   r   r  r  r   r   r   r    r  S  s2    

*
 
	$r  c                
       s  e Zd ZdZd3dee def fddZedd Z	ed	d
 Z
edd Zedd Zedd ZedejdejdejfddZdejdejdejfddZeede eede ddede iddejdejdejfddZdejdejfdd Zd!ejdejfd"d#Zdejdejfd$d%Ze 	&d4dejdeej deejejf fd'd(Zeede eede d)d*dd+ed,e idd4dejdeej dejfd-d.Z eed,e eede d)d*d/d0ede idd4d+ejdeej dejfd1d2Z!  Z"S )5FiniteScalarQuantizera  This quantizer is based on the Finite Scalar Quantization (FSQ) method.
    It quantizes each element of the input vector independently into a number of levels.

    Args:
        num_levels: number of levels for each dimension/element of the input vector
        eps: small regularization constant for scaling

    References:
        Mentzer et al., Finite Scalar Quantization: VQ-VAE Made Simple (https://arxiv.org/abs/2309.15505v1)
    MbP?
num_levelsepsc                    s   t    tjtdg|d d  dtjd}t|d}| d| tj|tjd}t|d}| d| || _t	
d	| jj t	
d
| j t	
d| j t	
d| j t	
d| j d S )Nr   rf   r   )re   ru  z
D -> 1 D 1dim_base_indexrt  r  zInitializing %s withz	dim:           %sz	num_levels:    %sz	codebook_size: %sz	eps:           %s)r.   r/   rP   cumprodrx  int32r   r   r  r   debugr5   r;   re   r  codebook_size)r2   r  r  r  r4   r   r    r/     s   
&

zFiniteScalarQuantizer.__init__c                 C   s   | j   S )z/Returns the size of the corresponding codebook.)r  proditemr   r   r   r    r    s   z#FiniteScalarQuantizer.codebook_sizec                 C   s
   | j  S )z*Returns the dimension of the input vector.)r  numelr   r   r   r    re     s   
zFiniteScalarQuantizer.dimc                 C      | j S )z{Returns the dimension of the input vector.
        Keeping for compatiblitiy with the original RVQ implementation.
        rd   r   r   r   r    codebook_dim     z"FiniteScalarQuantizer.codebook_dimc                 C   s2   t | j}t|d}| j|dd}|d}|S )zReturns the codebooks entries.

        Note that the codebook entries are implicitly defined by the number of levels.
        z
B -> 1 B 1Nr  rf   )rP   aranger  r   r  r   )r2   r  codesr   r   r    r    s
   

zFiniteScalarQuantizer.codesc                 C   r  )zPReturns the codebooks entries.
        See self.codes for more details.
        r  r   r   r   r    codebook  r	  zFiniteScalarQuantizer.codebookrk  rl  r   c                 C   s   t | }| ||    S )zqRound the input tensor to nearest integer
        and use a straight-through estimator for the gradient.
        )rP   rounddetach)rk  rl  inputs_roundedr   r   r    r    s   
zFiniteScalarQuantizer.roundc                 C   sX   | j d d }|d| j  }t| j d dkdd}||  }|||   | }|S )z3Apply compression to the input, to limit to values.r   r   r   g      ?)r  r  rP   wheretantanh)r2   rk  rl  output_scaleoutput_offsetinput_shiftoutputr   r   r    compress  s   zFiniteScalarQuantizer.compressr  rz   r  r  r  c                 C   s2   | j ||d}| j||d}| jd }|| }|S )Nr  r   )r  r  r  )r2   rk  rl  
compressedr  scaler   r   r    inputs_to_codes  s
   	
z%FiniteScalarQuantizer.inputs_to_codesc                 C   s   | j d  }}|| | S )z;Convert values centered arouund zero to nonnegative values.r   r  )r2   r  r  offsetr   r   r    codes_to_nonnegative     z*FiniteScalarQuantizer.codes_to_nonnegativecodes_nonnegativec                 C   s   | j d  }}|| | S )z;Convert nonnegative values to values centered arouund zero.r   r  )r2   r   r  r  r   r   r    nonnegative_to_codes  r  z*FiniteScalarQuantizer.nonnegative_to_codesc                 C   s^   | d| jkrtd| d d| j d|j | |}tj|| j dd}|tj	S )z)Converts a code vector to a single index.r   zInput code dimension % not matching the expected dimension z, input codes shape rd   )
r   re   RuntimeErrorrj  r  rP   r6  r  rz  r  )r2   r  r  r   r   r    codes_to_indices  s   
z&FiniteScalarQuantizer.codes_to_indicesNc                 C   s|   | d| jkrtd| d d| j d|j | j||d}| j|d}|d ur5t||}t||}|d}||fS )Nr   zInput dimension r"  z, inputs shape r  r  r   )r   re   r#  rj  r  r$  r   rr   )r2   rk  rl  r  r  r   r   r    r9     s   


zFiniteScalarQuantizer.forwardT)optionalr  r  c                 C   s   | ||d\}}|S )z3Convert a continuous code vector to a single index.r  r   )r2   rk  rl  r   r  r   r   r    r    s   	zFiniteScalarQuantizer.encoder  r  c                 C   sf   | ddkrtd| d d|j dt|d}|| j | j }| |}|dur1t||}|S )z3Convert a single index to a continuous code vector.r   r   z Expected a single codebook, got z" codebooks for indices with shape .zD B T -> B D TN)r   r  rj  r   r  r  r!  r   )r2   r  rl  r   r  r   r   r    r  )  s   


zFiniteScalarQuantizer.decode)r  r-   )#r;   r<   r=   r   r   r  r  r/   r   r  re   r  r  r  r  rP   r  r  r  r   r   r   r  r   r   r  r  r!  r$  r   r   r9   r  r  r>   r   r   r4   r    r    sd    







$
,	r  c                       s   e Zd ZdZdedee f fddZedd Zedd	 Z	ed
d Z
e dd Zeede eede ddede iddejdejdejfddZeede eede ddede iddejdejdejfddZ  ZS )GroupFiniteScalarQuantizera  Split the input vector into groups and apply FSQ on each group separately.
    This class is for convenience. Since FSQ is applied on each group separately,
    groups can be defined arbitrarily by splitting the input vector. However, this
    class makes it easy to construct several groups with the same quantization num_levels.

    Args:
        num_groups: number of groups to split the input into, each group will be quantized separately using num_codebooks//num_groups codebooks
        codebook_dim: embedding dimension, will be split into num_groups
        **kwargs: parameters of FiniteScalarQuantizer

    References:
        Yang et al, HiFi-Codec: Group-residual Vector quantization for High Fidelity Audio Codec, 2023 (http://arxiv.org/abs/2305.02765).
    
num_groupsnum_levels_per_groupc                    s   t    || _t| _tj fddt| jD | _	t
d| jj t
d| j t
d| j t
d t
d| j d S )Nc                    s   g | ]}t dd i qS )r  r   )r  r  r8   r)  r   r    r  ]      z7GroupFiniteScalarQuantizer.__init__.<locals>.<listcomp>zInitialized %s withz	num_groups:              %dz	codebook_dim:            %dz	num_levels_per_group:    %sz	codebook_dim_per_group:  %d)r.   r/   r(  r   codebook_dim_per_grouprP   rQ   rX   r+  fsqsr   r  r5   r;   r  )r2   r(  r)  r8   r4   r*  r    r/   U  s   

z#GroupFiniteScalarQuantizer.__init__c                 C   s   | j | j S )zInput vector dimension.)r,  r(  r   r   r   r    r  f     z'GroupFiniteScalarQuantizer.codebook_dimc                 C   s   | j d jS )z9Returns the size of the implicit codebook for each group.r   )r-  r  r   r   r   r    codebook_size_per_groupk  r.  z2GroupFiniteScalarQuantizer.codebook_size_per_groupc                 C   s   | j | j S )z*Returns the size of the implicit codebook.)r/  r(  r   r   r   r    r  p  r.  z(GroupFiniteScalarQuantizer.codebook_sizec           
      C   sx   |j | jdd}g g }}t|| jD ]\}}|||d\}}	|| ||	 qtj|dd}tj|dd}||fS )z=Quantize each group separately, then concatenate the results.r   rd   r  r   )chunkr(  r  r-  rq   rP   r8  )
r2   rk  rl  inputs_groupedr  r  in_group	fsq_groupdequantized_groupindices_groupr   r   r    r9   u  s   

z"GroupFiniteScalarQuantizer.forwardr  rz   r  r  r  r  rk  rl  r   c                 C   T   |j | jdd}g }t|| jD ]\}}|j||d}|| qtj|dd}|S )z`Input is split into groups, each group is encoded separately, then the results are concatenated.r   rd   r  r   )r0  r(  r  r-  r  rq   rP   r8  )r2   rk  rl  r1  r  r2  r3  r5  r   r   r    r    s   	z!GroupFiniteScalarQuantizer.encoder  r  c                 C   r6  )ziInput indices are split into groups, each group is decoded separately, then the results are concatenated.r   rd   r  r   )r0  r(  r  r-  r  rq   rP   r8  )r2   r  rl  indices_groupedr  r5  r3  r4  r   r   r    r    s   z!GroupFiniteScalarQuantizer.decode)r;   r<   r=   r   r  r   r/   r   r  r/  r  r   r9   r   r   r  r   r   rP   r  r  r  r>   r   r   r4   r    r'  F  s4    





&	r'  c                       s~   e Zd ZdZ						dded	ed
edededededef fddZdd Z	e
dd Ze
dd Ze dd Z  ZS )ResidualBlocka  
    The residual block structure defined by the HiFi-GAN V1 and V2 configurations.

    Args:
        channels: Input dimension.
        filters: Number of channels in the residual convolutions.
        kernel_size: Kernel size of the residual convolutions.
        dilation: Dilation of the residual convolutions.
        dropout_rate: Dropout to apply to residuals.
        activation: Activation to apply in between residual convolutions.
    rH   r   r   rQ  Fr   rP  r  r   r   dropout_raterO  	is_causalrq  c	           	         s   t t|   t||d| _t||d| _tj|| _	|s3t
|||||d| _t
||||d| _d S t|||||d| _t||||d| _d S )N)rO  rP  )rY  rZ  r   r   rq  rY  rZ  r   rq  )r.   r8  r/   rM  input_activationskip_activationrP   rQ   Dropoutdropoutr  
input_conv	skip_convrn  )	r2   rP  r  r   r   r9  rO  r:  rq  r4   r   r    r/     s2   zResidualBlock.__init__c                 C   s   | j   | j  d S r-   )r@  rh  rA  r   r   r   r    rh    s   
z ResidualBlock.remove_weight_normc                 C   r  r  r  r   r   r   r    r     s   zResidualBlock.input_typesc                 C   r  r  r   r   r   r   r   r    r        zResidualBlock.output_typesc                 C   sF   |  |}| j||d}| |}| j||d}| |}|| }|S Nr  )r<  r@  r=  rA  r?  )r2   rk  rl  
conv_input
skip_inputresr   r   r   r    r9     s   


zResidualBlock.forward)rH   r   r   rQ  Fr   )r;   r<   r=   r   r  r  rL  r  r/   rh  r   r   r   r   r9   r>   r   r   r4   r    r8    s@    	'

r8  c                       sr   e Zd ZdZ		ddededee deded	ef fd
dZdd Z	e
dd Ze
dd Ze dd Z  ZS )HiFiGANResBlockab  
    Residual block wrapper for HiFi-GAN which creates a block for multiple dilations.

    Args:
        channels: Input dimension.
        kernel_size: Kernel size of the residual blocks.
        dilations: List of dilations. One residual block will be created for each dilation in the list.
        activation: Activation for the residual blocks.
    Fr   rP  r   	dilationsrO  r:  rq  c                    s0   t    t fdd|D | _d S )Nc                    s"   g | ]}t | d qS ))rP  r  r   r   rO  r:  rq  r8  )r  r   rO  rP  r:  r   rq  r   r    r    s    
z,HiFiGANResBlock.__init__.<locals>.<listcomp>r.   r/   rQ   rX   
res_blocks)r2   rP  r   rI  rO  r:  rq  r4   rK  r    r/     s   
	

zHiFiGANResBlock.__init__c                 C      | j D ]}|  qd S r-   rM  rh  r2   	res_blockr   r   r    rh  &     

z"HiFiGANResBlock.remove_weight_normc                 C   r  r  r  r   r   r   r    r   *  r  zHiFiGANResBlock.input_typesc                 C   r  r  r   r   r   r   r    r   1  rC  zHiFiGANResBlock.output_typesc                 C   s    |}| j D ]}|||d}q|S rD  )rM  )r2   rk  rl  r   rQ  r   r   r    r9   5  s   
zHiFiGANResBlock.forwardFr   r;   r<   r=   r   r  r   rL  r  r/   rh  r   r   r   r   r9   r>   r   r   r4   r    rH    s0    

rH  c                       sv   e Zd ZdZ		ddedee dee deded	ef fd
dZdd Z	e
dd Ze
dd Ze dd Z  ZS )HiFiGANResLayeray  
    Residual block wrapper for HiFi-GAN which creates a block for multiple kernel sizes and dilations.
    One residual block is created for each combination of kernel size and dilation.

    Args:
        channels: Input dimension.
        kernel_sizes: List of kernel sizes.
        dilations: List of dilations.
        activation: Activation for the residual layers.

    Fr   rP  kernel_sizesrI  rO  r:  rq  c                    s0   t    t fdd|D | _d S )Nc              
      s    g | ]}t | d qS ))rP  r   rI  rO  r:  rq  )rH  )r  r   rO  rP  rI  r:  rq  r   r    r  V      	z,HiFiGANResLayer.__init__.<locals>.<listcomp>rL  )r2   rP  rV  rI  rO  r:  rq  r4   rW  r    r/   J  s   
		
zHiFiGANResLayer.__init__c                 C   rN  r-   rO  rP  r   r   r    rh  c  rR  z"HiFiGANResLayer.remove_weight_normc                 C   r  r  r  r   r   r   r    r   g  r  zHiFiGANResLayer.input_typesc                 C   r  )Nr   r  r   r   r   r   r    r   n  rC  zHiFiGANResLayer.output_typesc                    s*    fdd| j D }t|t| }|S )Nc                    s   g | ]}| d qS )r  r   )r  rQ  rl  rk  r   r    r  t  r  z+HiFiGANResLayer.forward.<locals>.<listcomp>)rM  r6  r   )r2   rk  rl  	residualsr   r   rY  r    r9   r  s   zHiFiGANResLayer.forwardrS  rT  r   r   r4   r    rU  =  s0    

rU  c                          e Zd ZdZ								dd	ed
ee dedededee dee dedef fddZedd Z	edd Z
dd Ze dd Z  ZS )CausalHiFiGANEncodera  
    Causal Audio encoder created by inverting the HiFi-GAN decoder and replacing Conv1D by CausalConv1D.

    Args:
        encoded_dim: Dimension of encoder output.
        down_sample_rates: Rate to upsample for each decoder block. The product of the downsample rates will
            determine the output token rate. For example 2 * 2 * 8 * 8 = 256 samples per token.
        base_channels: Number of filters in the first convolution. The number of channels will be doubled after each
            downsample layer.
        in_kernel_size: Kernel size of the input convolution.
        out_kernel_size: Kernel size of the output convolution.
        resblock_kernel_sizes: List of kernel sizes to use in each residual block.
        resblock_dilation_sizes: List of dilations to use in each residual block.
        activation: Activation to use in residual and downsample layers, defaults to leaky relu.
    r   r   r   r   r   r  rH   r  r  r   rH   rF   rQ  ro  encoded_dimdown_sample_ratesbase_channelsin_kernel_sizeout_kernel_sizeresblock_kernel_sizesresblock_dilation_sizesrO  rq  c
              	      s  |dksJ |dksJ t    || _td|||	d| _|}
tg | _tg | _tg | _	t
| jD ]9\}}t|
|||d|	d}| j	| t||
d}| j| d|
 }d| }t|
||||	d}|}
| j| q6t||
d| _t|
|||	d| _d S )	Nr   r   r;  TrP  rV  rI  rO  r:  rq  rP  r   )rY  rZ  r   r%   rq  )r.   r/   ra  rn  pre_convrQ   rX   activationsdown_sample_conv_layers
res_layers	enumeraterU  rq   rM  post_activation	post_conv)r2   r`  ra  rb  rc  rd  re  rf  rO  rq  rY  idown_sample_rate	res_layeractrZ  r   down_sample_convr4   r   r    r/     sL   
zCausalHiFiGANEncoder.__init__c                 C   r  Nry   rz   r  	audio_lenr   r   r  r   r   r   r   r    r     r  z CausalHiFiGANEncoder.input_typesc                 C   r  Nrz   r   	T_encodedrz   encodedencoded_lenr  r   r   r   r    r     r  z!CausalHiFiGANEncoder.output_typesc                 C   @   | j   | j  | jD ]}|  q| jD ]}|  qd S r-   ri  rh  ro  rl  rk  r2   rr  rt  r   r   r    rh       





z'CausalHiFiGANEncoder.remove_weight_normc           
      C      |}t |d}| j||d}t| j| j| j| jD ]\}}}}|||d}||}|| }|||d}q| |}| j||d}	|	|fS Nr  r  	r   ri  r  rj  rl  rk  ra  rn  ro  
r2   r  rw  r~  r   rs  rr  rt  rq  r}  r   r   r    r9        

zCausalHiFiGANEncoder.forward)r]  r   r  r  r^  r_  rQ  ro  r;   r<   r=   r   r  r   rL  r/   r   r   r   rh  r   r9   r>   r   r   r4   r    r\  y  H    	
;

r\  c                       r[  )HiFiGANEncoderac  
    Audio encoder created by inverting the HiFi-GAN decoder.

    Args:
        encoded_dim: Dimension of encoder output.
        down_sample_rates: Rate to upsample for each decoder block. The product of the downsample rates will
            determine the output token rate. For example 2 * 2 * 8 * 8 = 256 samples per token.
        base_channels: Number of filters in the first convolution. The number of channels will be doubled after each
            downsample layer.
        in_kernel_size: Kernel size of the input convolution.
        out_kernel_size: Kernel size of the output convolution.
        resblock_kernel_sizes: List of kernel sizes to use in each residual block.
        resblock_dilation_sizes: List of dilations to use in each residual block.
        activation: Activation to use in residual and downsample layers, defaults to leaky relu.
    r]  r   r  r^  r_  rQ  r   r`  ra  rb  rc  rd  re  rf  rO  rq  c
              	      s  |dksJ |dksJ t    || _td|||	d| _|}
tg | _tg | _tg | _	t
| jD ]?\}}t|
||||	d}| j	| t||
d}| j| d|
 }d| }t||d}t|
|||||	d}|}
| j| q6t||
d| _t|
|||	d| _d S )	Nr   r   r;  rP  rV  rI  rO  rq  rh  r   r&   )rY  rZ  r   r%   r)   rq  )r.   r/   ra  r  ri  rQ   rX   rj  rk  rl  rm  rU  rq   rM  r'   rn  ro  )r2   r`  ra  rb  rc  rd  re  rf  rO  rq  rY  rp  rq  rr  rs  rZ  r   r)   rt  r4   r   r    r/     sN   
zHiFiGANEncoder.__init__c                 C   r  ru  rx  r   r   r   r    r   >  r  zHiFiGANEncoder.input_typesc                 C   r  ry  r  r   r   r   r    r   E  r  zHiFiGANEncoder.output_typesc                 C   r  r-   r  r  r   r   r    rh  L  r  z!HiFiGANEncoder.remove_weight_normc           
      C   r  r  r  r  r   r   r    r9   T  r  zHiFiGANEncoder.forward)r]  r   r  r  r^  r_  rQ  r   r  r   r   r4   r    r    r  r  c                          e Zd ZdZ										
	d!dedee dedededee dee dedededef fddZe	dd Z
e	dd Zdd Ze dd  Z  ZS )"CausalHiFiGANDecodera  
    Codec decoder using the HiFi-GAN generator architecture with Causal Convolutions.

    Args:
        input_dim: Input dimension.
        up_sample_rates: Rate to upsample for each decoder block. The product of the upsample rates should be the same
            as the overall downsample rate for your encoder. For example, a symmetric encoder/decoder can be created
            with encoder downsample rates [2, 2, 8, 8] and decoder upsample rates [8, 8, 2, 2].
        base_channels: Number of filters in the first convolution. The number of channels will be cut in
            half after each upsample layer.
        in_kernel_size: Kernel size of the input convolution.
        out_kernel_size: Kernel size of the output convolution.
        resblock_kernel_sizes: List of kernel sizes to use in each residual block.
        resblock_dilation_sizes: List of dilations to use in each residual block.
        activation: Activation to use in residual and upsample layers, defaults to leaky relu.
        output_activation: Activation to apply to output. To produce a valid audio signal, it should output values in
         the range [-1.0, 1.0]. Supports "tanh" and "clamp".
    r   r   r   r   r   r  rH   r^  r_  rQ  r  ro  Tr	  up_sample_ratesrb  rc  rd  re  rf  rO  output_activationrq  n_groups_equal_to_out_channelsc              	      s@  |dksJ |dksJ t    || _t||||
d| _|}tg | _tg | _tg | _	t
| jD ]=\}}|d }d| }t||d}| j| t|||||rW|ndd}|}| j| t||||d|
d}| j	| q6t||d| _t|d||
d| _|	d	krt | _d S |	d
krt | _d S td|	 )Nr   r;  r   rh  r   rY  rZ  r   r%   r[  Trg  r  r7   Invalid audio output activation )r.   r/   r  rn  ri  rQ   rX   rj  up_sample_conv_layersrl  rm  rM  rq   rX  rU  rn  ro  Tanhout_activationr   r  r2   r	  r  rb  rc  rd  re  rf  rO  r  rq  r  rY  rp  up_sample_raterZ  r   rs  up_sample_convrr  r4   r   r    r/     sV   

zCausalHiFiGANDecoder.__init__c                 C   r  Nrz  rz   r  r  r   r   r   r    r     r  z CausalHiFiGANDecoder.input_typesc                 C   r  ru  rx  r   r   r   r    r     r  z!CausalHiFiGANDecoder.output_typesc                 C   6   | j   | jD ]}|  q| jD ]}|  qd S r-   ri  rh  r  rl  r2   r  rr  r   r   r    rh       




z'CausalHiFiGANDecoder.remove_weight_normc           
      C      |}| j ||d}t| j| j| j| jD ]\}}}}|| }||}|||d}|||d}q| |}| j||d}| |}	t	|	d}	|	|fS Nr  zB 1 T -> B T
ri  r  rj  rl  r  r  rn  ro  r  r   
r2   rk  rl  rw  r   rs  rr  r  r  r  r   r   r    r9        


zCausalHiFiGANDecoder.forward)
r  r   r  rH   r^  r_  rQ  r  ro  Tr;   r<   r=   r   r  r   rL  r  r/   r   r   r   rh  r   r9   r>   r   r   r4   r    r  k  sT    	
C

r  c                       r  )"HiFiGANDecodera  
    Codec decoder using the HiFi-GAN generator architecture.

    Default parameters match the HiFi-GAN V1 configuration for 22.05khz.

    Args:
        input_dim: Input dimension.
        up_sample_rates: Rate to upsample for each decoder block. The product of the upsample rates should be the same
            as the overall downsample rate for your encoder. For example, a symmetric encoder/decoder can be created
            with encoder downsample rates [2, 2, 8, 8] and decoder upsample rates [8, 8, 2, 2].
        base_channels: Number of filters in the first convolution. The number of channels will be cut in
            half after each upsample layer.
        in_kernel_size: Kernel size of the input convolution.
        out_kernel_size: Kernel size of the output convolution.
        resblock_kernel_sizes: List of kernel sizes to use in each residual block.
        resblock_dilation_sizes: List of dilations to use in each residual block.
        activation: Activation to use in residual and upsample layers, defaults to leaky relu.
        output_activation: Activation to apply to output. To produce a valid audio signal, it should output values in
         the range [-1.0, 1.0]. Supports "tanh" and "clamp".
    r  r   r  rH   r^  r_  rQ  r  r   Fr	  r  rb  rc  rd  re  rf  rO  r  rq  r  c                    s>  |dksJ |dksJ t    || _t||||
d| _|}tg | _tg | _tg | _	t
| jD ]<\}}|d }d| }t||d}| j| t|||||rW|ndd}|}| j| t|||||
d}| j	| q6t||d| _t|d||
d| _|	dkrt | _d S |	d	krt | _d S td
|	 )Nr   r;  r   rh  r   r  r  r  r7  r  )r.   r/   r  r  ri  rQ   rX   rj  r  rl  rm  rM  rq   r  rU  rn  ro  r  r  r   r  r  r4   r   r    r/     sT   

zHiFiGANDecoder.__init__c                 C   r  r  r  r   r   r   r    r   E  r  zHiFiGANDecoder.input_typesc                 C   r  ru  rx  r   r   r   r    r   L  r  zHiFiGANDecoder.output_typesc                 C   r  r-   r  r  r   r   r    rh  S  r  z!HiFiGANDecoder.remove_weight_normc           
      C   r  r  r  r  r   r   r    r9   Z  r  zHiFiGANDecoder.forward)
r  r   r  rH   r^  r_  rQ  r  r   Fr  r   r   r4   r    r    sT    	
B

r  c                       s^   e Zd ZdZddededededef
 fd	d
Zedd Zedd Z	e
 dd Z  ZS )MelSpectrogramProcessorzM
    Wrapper interface for computing mel spectrogram for codec training.
    P   r   r   r   r   mel_dim	log_guardc                    s   t t|   || _|| _tdi d|dd d|ddddd|d	|d
dddd|ddddddd|dd dd dd dd| _d S )Nr   highfreqfeaturespad_tor   	exact_padTn_window_sizen_window_stridewindow_sizeFwindow_strider>  	mag_powerr   r4  log_zero_guard_typeaddlog_zero_guard_valuemel_normr9  preemphditherr   r   )r.   r  r/   r  r   r
   preprocessor)r2   r   r   r   r  r  r4   r   r    r/   u  sP   	

z MelSpectrogramProcessor.__init__c                 C   r  ru  rx  r   r   r   r    r     r  z#MelSpectrogramProcessor.input_typesc                 C   r  )N)rz   r   r  rz   )r  spec_len)r   r   r  r   r   r   r   r    r     r  z$MelSpectrogramProcessor.output_typesc                 C   s   | j ||d\}}||fS )N)input_signalr{  )r  )r2   r  rw  r  r  r   r   r    r9     s   zMelSpectrogramProcessor.forward)r  r   r  r   r   r4   r    r  p  s    $

r  c                       s   e Zd ZdZ							dd	ed
edededededededef fddZdd Ze	dd Z
e	dd Ze dd Z  ZS )ResNetEncoderaB  
    Residual network which uses HiFi-GAN residual blocks to encode spectrogram features without changing
    the time dimension.

    Args:
        in_channels: input dimension
        out_channels: output dimension
        num_layers: number of residual blocks to use
        hidden_channels: encoder hidden dimension
        filters: number of filters in residual block layers
        kernel_size: kernel size in residual block convolutions
        dropout_rate: Optional dropout rate to apply to residuals.
        activation: Activation to use, defaults to leaky relu.
    r   r   rC   rH   rj   rQ  r   rY  rZ  
num_layershidden_channelsr  r   r9  rO  rq  c
           
         sl   t t|   t|d| _t fddt|D | _t	 d| _
t|d| _d S )Nr;  c              
      s    g | ]}t  d qS ))rP  r  r   r9  rO  rq  rJ  r  rO  r9  r  r  r   rq  r   r    r    rX  z*ResNetEncoder.__init__.<locals>.<listcomp>rh  )r.   r  r/   r  ri  rQ   rX   r+  rl  rM  rn  ro  )
r2   rY  rZ  r  r  r  r   r9  rO  rq  r4   r  r    r/     s   	zResNetEncoder.__init__c                 C   s,   | j   | j  | jD ]}|  qd S r-   )ri  rh  ro  rl  )r2   rr  r   r   r    rh    s
   



z ResNetEncoder.remove_weight_normc                 C   r  r  r  r   r   r   r    r     r  zResNetEncoder.input_typesc                 C   r  )Nr}  r  rB  r   r   r   r    r     rC  zResNetEncoder.output_typesc                 C   sB   | j ||d}| jD ]}|||d}q
| |}| j||d}|S rD  )ri  rl  rn  ro  )r2   rk  rl  r}  rr  r   r   r    r9     s   

zResNetEncoder.forward)r   r   rC   rH   rj   rQ  r   )r;   r<   r=   r   r  r  rL  r/   rh  r   r   r   r   r9   r>   r   r   r4   r    r    sF    	
#

r  c                       sX   e Zd ZdZdedef fddZdd Zedd	 Zed
d Z	e
 dd Z  ZS )FullBandMelEncoderaE  
    Encoder which encodes the entire mel spectrogram with a single encoder network.

    Args:
        mel_processor: MelSpectrogramProcessor or equivalent class instance for computing the mel spectrogram from
            input audio.
        encoder: ResNetEncoder or equivalent class for encoding the mel spectrogram.
    mel_processorencoderc                    s   t t|   || _|| _d S r-   )r.   r  r/   r  r  )r2   r  r  r4   r   r    r/     s   
zFullBandMelEncoder.__init__c                 C   s   | j   d S r-   )r  rh  r   r   r   r    rh    s   z%FullBandMelEncoder.remove_weight_normc                 C   r  ru  rx  r   r   r   r    r   	  r  zFullBandMelEncoder.input_typesc                 C   r  N)rz   r   r{  rz   r|  r  r   r   r   r    r   		  r  zFullBandMelEncoder.output_typesc                 C   s(   | j ||d\}}| j||d}||fS )Nrv  r  )r  r  )r2   r  rw  r   r  r}  r   r   r    r9   	  s   zFullBandMelEncoder.forward)r;   r<   r=   r   r   r/   rh  r   r   r   r   r9   r>   r   r   r4   r    r    s    	

r  c                       s   e Zd ZdZdeeeef  def fddZe	dedeeeef  fddZ
d	d
 Zedd Zedd Ze dd Z  ZS )MultiBandMelEncodera  
    Encoder which splits mel spectrogram into bands and encodes each using separate residual networks.

    Args:
        mel_bands: List of mel spectrogram bands to encode.
            Each list element is tuple of 2 elements with the start and end index of the mel features to use.
        mel_processor: MelSpectrogramProcessor or equivalent class instance for computing the mel spectrogram from
            input audio.
        encoder_kwargs: Arguments for constructing encoder for each mel band.
    	mel_bandsr  c                    sX   t t|   | j|j|d || _|| _dd | jD }t fdd|D | _	d S )N)r  r  c                 S   s   g | ]
}|d  |d  qS r   r   r  r   r   r    r  (	  s    z0MultiBandMelEncoder.__init__.<locals>.<listcomp>c                    s   g | ]}t dd |i qS )rY  r   )r  )r  band_dimencoder_kwargsr   r    r  *	  r+  )
r.   r  r/   validate_mel_bandsr  r  r  rQ   rX   encoders)r2   r  r  r  	band_dimsr4   r  r    r/   #	  s   
zMultiBandMelEncoder.__init__r  c                 C   s\   t j| gtd}|D ]}d||d |d < q
t|s,t | }td|  d| dd S )Nrt  Tr   r   zMel bands must cover all z dimensions. Missing r&  )npro  r  allr  r  )r  r  mel_dims_usedr  missing_dimsr   r   r    r  -	  s   z&MultiBandMelEncoder.validate_mel_bandsc                 C   rN  r-   )r  rh  )r2   r  r   r   r    rh  9	  rR  z&MultiBandMelEncoder.remove_weight_normc                 C   r  ru  rx  r   r   r   r    r   =	  r  zMultiBandMelEncoder.input_typesc                 C   r  r  r  r   r   r   r    r   D	  r  z MultiBandMelEncoder.output_typesc                 C   sx   | j ||d\}}g }t| j| jD ]\\}}}|d d ||d d f }	||	|d}
||
 qtj|dd}||fS )Nrv  r  r   rd   )r  r  r  r  rq   rP   r8  )r2   r  rw  r  r  outputs
band_startband_endr  r  band_outr}  r   r   r    r9   K	  s   zMultiBandMelEncoder.forward)r;   r<   r=   r   r   r   r  r   r/   r  r  rh  r   r   r   r   r9   r>   r   r   r4   r    r  	  s    "
 

r  rK  r-   )Zr_  r   abcr   r   typingr   r   r   r   numpyr  rP   torch.nnrQ   torch.nn.functionalr   ro   einopsr   transformersr	   nemo.collections.asr.modulesr
   #nemo.collections.common.parts.utilsr   r   r   r   nemo.core.classes.commonr   nemo.core.classes.moduler   nemo.core.neural_types.elementsr   r   r   r   r   r   "nemo.core.neural_types.neural_typer   
nemo.utilsr   rJ   rI   r   r   r   r  r!   r$   r'   r*   r+   r?   r   rL  r   r   r   r   r   ModulerM  rX  rn  r  r  r  r  r  r  r  r  r  r  r  r'  r8  rH  rU  r\  r  r  r  r  r  r  r  r   r   r   r    <module>   s    .	]! :7c0'+B.9D6)- GpK<<yy  2N'