o
    }oip                     @   s&  d dl mZmZmZ d dlZd dlmZ d dlm  mZ	 d dl
mZmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZmZmZmZ d dlmZ d d	lmZ d d
lmZ d dlm Z m!Z!m"Z"m#Z#m$Z$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+ G dd deZ,G dd deZ-G dd deZ.G dd deZ/G dd deZ0G dd deZ1dedede2ddfdd Z3d>d"ed#e4d$e2defd%d&Z5d'ed(edefd)d*Z6d+ed,e4defd-d.Z7d?d+ed0e4d1e4deeef fd2d3Z8d4ed5efd6d7Z9e+G d8d9 d9eZ:G d:d; d;eZ;G d<d= d=eZ<dS )@    )IterableOptionalTupleN)	rearrangerepeat)Tensor)mask_sequence_tensor)MaskedMSELoss)CodecActivation
Conv1dNorm
Conv2dNormConvTranspose1dNormVectorQuantizerBaseget_down_sample_padding)broadcast_tensors)	typecheck)NeuralModule)AudioSignalEncodedRepresentationIndexLengthsTypeLossTypeVoidType)
NeuralType)logging)experimentalc                       sV   e Zd Zddedef fddZedd Zedd	 Zd
d Z	e
 dd Z  ZS )SEANetResnetBlockeluchannels
activationc                    sb   t    t||d| _|d }t||dd| _t||dd| _t||d| _t||dd| _d S )Nr   r         in_channelsout_channelskernel_size   )	super__init__r
   pre_activationr   pre_conv	res_conv1post_activation	res_conv2)selfr   r   hidden_channels	__class__ `/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/tts/modules/encodec_modules.pyr)   B   s   
zSEANetResnetBlock.__init__c                 C      t dt t tdt dS )N)BCT_inputr6   inputs	input_lenr   r   tupler   r/   r3   r3   r4   input_typesK      
zSEANetResnetBlock.input_typesc                 C      dt dt iS )Nout)r6   r7   T_outr   r   r>   r3   r3   r4   output_typesR      zSEANetResnetBlock.output_typesc                 C   s"   | j   | j  | j  d S N)r+   remove_weight_normr,   r.   r>   r3   r3   r4   rH   X   s   

z$SEANetResnetBlock.remove_weight_normc                 C   sP   |  |}| j||d}| |}| j||d}| j||d| }t||}|S )Nr9   )r*   r,   r-   r.   r+   r   )r/   r:   r;   resrB   r3   r3   r4   forward]   s   


zSEANetResnetBlock.forward)r   )__name__
__module____qualname__intstrr)   propertyr?   rE   rH   r   rJ   __classcell__r3   r3   r1   r4   r   A   s    	

r   c                	       sV   e Zd Zddedededef fddZed	d
 Zedd Z	e
 dd Z  ZS )	SEANetRNNlstmFdim
num_layersrnn_typeuse_skipc                    s^   t    || _|dkrtjj|||d| _d S |dkr(tjj|||d| _d S td| )NrS   )
input_sizehidden_sizerU   gruzUnknown RNN type )	r(   r)   rW   torchnnLSTMrnnGRU
ValueError)r/   rT   rU   rV   rW   r1   r3   r4   r)   j   s   
zSEANetRNN.__init__c                 C   r5   )Nr6   r7   Tr6   r9   r<   r>   r3   r3   r4   r?   t   r@   zSEANetRNN.input_typesc                 C   rA   )NrB   ra   rD   r>   r3   r3   r4   rE   {   rF   zSEANetRNN.output_typesc                 C   sd   t |d}tjjj|| ddd}| |\}}tjjj|dd\}}| jr+|| }t |d}|S )NzB C T -> B T CTF)lengthsbatch_firstenforce_sorted)rd   zB T C -> B C T)r   r\   utilsr^   pack_padded_sequencecpupad_packed_sequencerW   )r/   r:   r;   packed_inputs
packed_out_rB   r3   r3   r4   rJ      s   

zSEANetRNN.forward)rS   F)rK   rL   rM   rN   rO   boolr)   rP   r?   rE   r   rJ   rQ   r3   r3   r1   r4   rR   i   s     


rR   c                       s   e Zd Z									dd	ee d
edededededededef fddZedd Z	edd Z
dd Ze dd Z  ZS )SEANetEncoderr!                      r   r!   rS   Tdown_sample_ratesbase_channelsin_kernel_sizeout_kernel_sizeencoded_dimr   
rnn_layersrV   rnn_skipc
              	      s
  |dksJ |dksJ t    || _td||d| _|}
tg | _tg | _tg | _	t
| jD ]5\}}t|
d}| j| | j	t||
d d|
 }d| }t|
|||t||d}|}
| j| q5t||
d| _t|
|||	d| _t|
||d| _d S )	Nr   r"   r#   r   r    r!   )r$   r%   r&   stridepaddingrT   rU   rV   rW   )r(   r)   rv   r   r+   r\   
ModuleList
res_blocksdown_sample_conv_layersactivations	enumerater   appendr
   r   r-   rR   r^   	post_conv)r/   rv   rw   rx   ry   rz   r   r{   rV   r|   r$   idown_sample_rate	res_blockr%   r&   down_sample_convr1   r3   r4   r)      s6   

zSEANetEncoder.__init__c                 C   r5   Nr6   T_audior6   )audio	audio_lenr   r   r=   r   r>   r3   r3   r4   r?      r@   zSEANetEncoder.input_typesc                 C   r5   )Nr6   D	T_encodedr6   )encodedencoded_lenr   r   r=   r   r>   r3   r3   r4   rE      r@   zSEANetEncoder.output_typesc                 C   s@   | j   | j  | jD ]}|  q| jD ]}|  qd S rG   )r+   rH   r   r   r   )r/   r   r   r3   r3   r4   rH      s   





z SEANetEncoder.remove_weight_normc           
      C   s   |}t |d}| j||d}t| j| j| j| jD ]\}}}}|||d}||}|| }|||d}q| j||d}| |}| j	||d}	|	|fS )NzB T -> B 1 Tr9   )
r   r+   zipr   r   rv   r   r^   r-   r   )
r/   r   r   r   rB   r   r   r   r   r   r3   r3   r4   rJ      s   

zSEANetEncoder.forward)	ro   rs   rt   rt   ru   r   r!   rS   TrK   rL   rM   r   rN   rO   rm   r)   rP   r?   rE   rH   r   rJ   rQ   r3   r3   r1   r4   rn      sH    	
-

rn   c                       s   e Zd Z										dd
ee dedededededededef fddZedd Z	edd Z
dd Ze dd Z  ZS )SEANetDecoderrr   rq   rp   r!      rt   r'   ru   r   r!   rS   Tup_sample_ratesrw   rx   ry   rz   r   r{   rV   r|   c
                    s  |dksJ |dksJ t    || _t|||d| _t||||	d| _|}
tg | _	tg | _
tg | _t| jD ]1\}}| jt||
d |
d }d| }t|
|||d}|}
| j
| t|
d}| j	| q>t||
d| _t|
d|d| _t | _d S )	Nr   r#   r   r    r!   )r$   r%   r&   r~   r}   r"   )r(   r)   r   r   r+   rR   r^   r\   r   r   up_sample_conv_layersr   r   r   r
   r   r   r-   r   Tanhout_activation)r/   r   rw   rx   ry   rz   r   r{   rV   r|   r$   r   up_sample_rater%   r&   up_sample_convr   r1   r3   r4   r)      s6   

zSEANetDecoder.__init__c                 C   s"   t dt gt tdt gdS )Nr   r6   r9   r   r>   r3   r3   r4   r?     s   zSEANetDecoder.input_typesc                 C   r5   r   r   r>   r3   r3   r4   rE   #  r@   zSEANetDecoder.output_typesc                 C   s6   | j   | jD ]}|  q| jD ]}|  qd S rG   )r+   rH   r   r   )r/   r   r   r3   r3   r4   rH   *  s   




z SEANetDecoder.remove_weight_normc           
      C   s   |}| j ||d}| j||d}t| j| j| j| jD ]\}}}}|| }||}|||d}|||d}q| |}| j||d}| 	|}	t
|	d}	|	|fS )Nr9   zB 1 T -> B T)r+   r^   r   r   r   r   r   r-   r   r   r   )
r/   r:   r;   r   rB   r   r   r   r   r   r3   r3   r4   rJ   1  s   


zSEANetDecoder.forward)	r   r   rt   r'   ru   r   r!   rS   Tr   r3   r3   r1   r4   r      sH    	
-

r   c                       sL   e Zd Zd fdd	Zdd Zedd Zedd	 Ze d
d Z	  Z
S )DiscriminatorSTFT皙?c                    s   t    |\| _| _| _| dtj| jdd t	|| _
ttddddtddddd	d
tddddd	d
tddddd	d
tddddg| _tdddd| _d S )NwindowF)periodicr!   rs   )r'   	   )r&   )r"   r"   )r"   r!   )r&   dilationr~   )r!   r"   )rp   r"   )r'   r'   r"   )r(   r)   n_fft
hop_length
win_lengthregister_bufferr[   hann_windowr\   	LeakyReLUr   r   r   conv_layers	conv_post)r/   
resolutionlrelu_sloper1   r3   r4   r)   I  s   
	zDiscriminatorSTFT.__init__c              
   C   sF   t j|| j| j| j| jdddd}t|d}t j|j|j	gdd}|S )NT)r   r   r   r   
normalizedcenterreturn_complexzB fft T -> B 1 T fftr"   rT   )
r[   stftr   r   r   r   r   catrealimag)r/   r   rB   r3   r3   r4   r   [  s   

zDiscriminatorSTFT.stftc                 C   rA   )Nr   r   r   r   r>   r3   r3   r4   r?   l  rF   zDiscriminatorSTFT.input_typesc                 C   s   t dt t dt gdS )Nr6   r7   T_specr6   r   r   r7   )scoresfmaprD   r>   r3   r3   r4   rE   r  s   
zDiscriminatorSTFT.output_typesc                 C   s`   g }|  |}| jD ]}||d}| |}|| q
| j|d}|| t|d}||fS )Nr:   zB 1 T C -> B C T)r   r   r   r   r   r   )r/   r   r   rB   convr   r3   r3   r4   rJ   y  s   





zDiscriminatorSTFT.forward)r   )rK   rL   rM   r)   r   rP   r?   rE   r   rJ   rQ   r3   r3   r1   r4   r   H  s    

r   c                       sB   e Zd Z fddZedd Zedd Ze dd Z  Z	S )	 MultiResolutionDiscriminatorSTFTc                    s$   t    tdd |D | _d S )Nc                 S   s   g | ]}t |qS r3   )r   ).0rI   r3   r3   r4   
<listcomp>  s    z=MultiResolutionDiscriminatorSTFT.__init__.<locals>.<listcomp>)r(   r)   r\   r   discriminators)r/   resolutionsr1   r3   r4   r)     s   
z)MultiResolutionDiscriminatorSTFT.__init__c                 C   s   t dt t dt dS )Nr   )
audio_real	audio_genr   r>   r3   r3   r4   r?        

z,MultiResolutionDiscriminatorSTFT.input_typesc                 C   s:   t dt gt dt gt dt ggt dt ggdS )Nr   r   )scores_real
scores_gen
fmaps_real	fmaps_genrD   r>   r3   r3   r4   rE     s
   z-MultiResolutionDiscriminatorSTFT.output_typesc                 C   sl   g }g }g }g }| j D ]$}||d\}}	|| ||	 ||d\}
}||
 || q||||fS )N)r   )r   r   )r/   r   r   r   r   r   r   disc
score_real	fmap_real	score_genfmap_genr3   r3   r4   rJ     s   



z(MultiResolutionDiscriminatorSTFT.forward)
rK   rL   rM   r)   rP   r?   rE   r   rJ   rQ   r3   r3   r1   r4   r     s    

r   
moving_avgnewdecayreturnc                 C   s   | j |j|d| d d S )Nr"   )alpha)datamul_add_r   r   r   r3   r3   r4   _ema_inplace  s   r   h㈵>r:   n_categoriesepsilonc                 C   s$   |   }| | |||   }|| S rG   )sum)r:   r   r   	input_sumsmoothedr3   r3   r4   _laplace_smoothing  s   r   input1input2c                 C   sB   t |d}| djdddd|  |  |djddd }|S )z
    Compute pairwise L2 distance between two input tensors

    Args:
        input1: [B, D] first tensor.
        input2: [N, D] second tensor.

    Returns:
        [(B, D)] tensor of distances.
    z
N D -> D Nr!   r"   T)keepdimr   )r   powr   )r   r   	distancesr3   r3   r4   _compute_distances  s   
4r   samples
num_samplec                 C   sR   | j }| jd }||krtj||dd| }| | S tjd||f|d}| | S )a  
    Randomly sample from the input batch.

    Args:
        samples: [B, D] tensor with features to sample.
        num_sample: Number of samples to draw.
            If the value is less than or equal to B, then the samples will be unique.
            If the value is greater than B, then samples will be drawn with replacement.

    Returns:
        Tensor with num_sample values randomly sampled from the input batch.
    r   deviceN)lowhighsizer   )r   shaper[   randpermrandint)r   r   r   total_samplesindicesr3   r3   r4   _sample_vectors  s   
r   
   num_clusters	num_itersc                 C   s   |dksJ | j d }t| |d}t|D ]D}t| |}|jddj}t|d|d}tj||d}	t	|	d}
|j
||| jd	}|jd|| d
 |tj|
dd }t|
dk||}q||	fS )aE  
    K-means clustering algorithm.

    Args:
        samples: [B, D] tensor with features to cluster
        num_clusters: K, the number of clusters.
        num_iters: Number of iterations of K-means to run.

    Returns:
        [K, D] cluster means and [K] bins counting how many input samples belong to each cluster
    r   r"   r   r   r   zB -> B D)r   )	minlength	K -> K ())dtype)rT   indexsrc)min)r   r   ranger   r  r   r   r[   bincountr   	new_zerosr   scatter_add_clampwhere)r   r   r   	input_dimmeansrl   distsbucketsbuckets_repeated
bin_countsbin_counts_expanded	new_meansr3   r3   r4   _k_means  s   


r  tensorrc   c                 C   s>   | j \}}}t||djdd|}|t|dk}| | S )z
    Mask 3d tensor with time on 1st axis.

    Args:
        tensor: tensor of shape (B, T, D)
        lengths: LongTensor of shape (B,)
    Returns:
        Masked Tensor (B, T, D)
    r"   r   z
b -> b 1 1)r   r[   onescumsumtype_asr   )r  rc   
batch_sizemax_lengthsrl   maskr3   r3   r4   _mask_3d  s   
r  c                       s<  e Zd ZdZ			d+dedededee d	ee f
 fd
dZej	j
dd ZdeddfddZdededdfddZdedefddZdedefddZedd Zedd Ze dd Zeed e eed!e d"ded#e id$d%d& Zeed#e eed!e d'd(ed e id$d)d* Z  ZS ),EuclideanCodebooka  
    Codebook with Euclidean distance.

    Args:
        codebook_size: Number of codes to use.
        codebook_dim: Dimension of each code.
        decay: Decay for exponential moving average over the codebooks.
        threshold_ema_dead_code: Threshold for dead code expiration.
            During every iteration, replace codes with exponential moving average cluster size less than threshold
            with randomly selected values from the current batch.
        kmeans_iters: Optional int, if provided codes will be initialized from the centroids learned from
            kmeans_iters iterations of k-means clustering on the first training batch.
    Gz?       @2   codebook_sizecodebook_dimr   threshold_ema_dead_codekmeans_itersc                    s   t    || _|rtjt||}nt||}|| _	|| _
|| _| dt| g | dt| | d| | d|  d S )Ninitializedcluster_sizecodes	codes_avg)r(   r)   r   r\   initkaiming_uniform_r[   emptyzerosr  r   r  r   r   clone)r/   r  r  r   r  r   r#  r1   r3   r4   r)   ,  s   
zEuclideanCodebook.__init__c                 C   sr   | j rd S t|| j| jd\}}| jj| | jj|  | j	j| | j jt
dg t|   d S )N)r   r   r   T)r!  r  r  r   r#  r   copy_r$  r)  r"  r   r   buffers)r/   r   r#  r"  r3   r3   r4   _init_codesF  s   zEuclideanCodebook._init_codesr:   r   Nc                 C   sj   | j sd S | j| j k }t|sd S t|| jd}t|d}t||| j}| jj	
| t|   d S )Nr   r   )r  r"  r[   anyr   r  r   r  r#  r   r*  r   r+  )r/   r:   expired_codesr   modified_codesr3   r3   r4   _expire_codesR  s   

zEuclideanCodebook._expire_codesr   c                 C   s   t || j|j}t|d}|d}t| j|| j	d || }t| j
|| j	d t| j| jd}t|d}| j
| }| jj| d S )Nz
B N -> N Br"   r   )r   z	N -> N ())Fone_hotr  typer   r   r   r   r"  r   r$  r   r#  r   r*  )r/   r:   r   code_onehotcode_countscode_sumcluster_size_smoothedcodes_normalizedr3   r3   r4   _update_codesa  s   



zEuclideanCodebook._update_codesc                 C   s   t || j}|jddj}|S )Nr"   r   )r   r#  r  r   )r/   r:   distr   r3   r3   r4   	_quantizep  s   zEuclideanCodebook._quantizec                 C   s   t || j}|S rG   )r1  	embeddingr#  )r/   r   dequantizedr3   r3   r4   _dequantizew  s   zEuclideanCodebook._dequantizec                 C   r5   )Nr6   rb   r   r6   r9   r   r>   r3   r3   r4   r?   |  r@   zEuclideanCodebook.input_typesc                 C   s   t dt t dt dS )Nr?  r6   rb   )r=  r   )r   r   r   r>   r3   r3   r4   rE     r   zEuclideanCodebook.output_typesc                 C   s|   t |d}| | | j|d}|j|jd d  }| j|d}| jr0| j|d | j||d t	||}t
||}||fS )NB T D -> (B T) Dr   r   )r:   r   )r   r,  r;  viewr   r>  trainingr0  r9  r  r   )r/   r:   r;   
input_flatindices_flatr   r=  r3   r3   r4   rJ     s   



zEuclideanCodebook.forwardr?  r6   r9   r@  r?   rE   c                 C   s8   t |d}| j|d}|j|jd d  }t||}|S )NrA  r   rB  )r   r;  rD  r   r   )r/   r:   r;   rF  rG  r   r3   r3   r4   encode  s
   

zEuclideanCodebook.encoder   r;   r=  c                 C   s   | j |d}t||}|S )NrC  )r>  r  )r/   r   r;   r=  r3   r3   r4   decode  s   	
zEuclideanCodebook.decode)r  r  r  )rK   rL   rM   __doc__rN   floatr   r)   r[   jitignorer,  r   r0  r9  r;  r>  rP   r?   rE   r   rJ   r   r   r=   r   r   rI  rK  rQ   r3   r3   r1   r4   r    sT    





	
r  c                       s  e Zd ZdZ					d"deded	ed
edee dee f fddZedd Z	e
 dededeeeef fddZe
ede eede ddede iddededefddZe
ede eede ddede iddededefd d!Z  ZS )#ResidualVectorQuantizera  
    Residual vector quantization (RVQ) algorithm as described in https://arxiv.org/pdf/2107.03312.pdf.

    Args:
        num_codebooks: Number of codebooks to use.
        codebook_size: Number of codes to use for each codebook.
        codebook_dim: Dimension of each code.
        decay: Decay for exponential moving average over the codebooks.
        threshold_ema_dead_code: Threshold for dead code expiration.
            During every iteration, replace codes with exponential moving average cluster size less than threshold
            with randomly selected values from the current batch.
        kmeans_iters: Optional int, if provided codes will be initialized from the centroids learned from
            kmeans_iters iterations of k-means clustering on the first training batch.
       ru   r  r  r  num_codebooksr  r  r   r  r   c                    sB   t     | _t | _t fddt|D | _d S )Nc              	      s   g | ]}t  d qS ))r  r  r   r  r   )r  r   rl   r  r  r   r   r  r3   r4   r     s    z4ResidualVectorQuantizer.__init__.<locals>.<listcomp>)	r(   r)   r  r	   commit_loss_fnr\   r   r  	codebooks)r/   rR  r  r  r   r  r   r1   rT  r4   r)     s   
	
z ResidualVectorQuantizer.__init__c                 C   $   t dt t dt t dt dS Nr6   r   rb   r   r6   rb   r3   )r=  r   commit_lossr   r   r   r   r>   r3   r3   r4   rE        


z$ResidualVectorQuantizer.output_typesr:   r;   r   c                 C   s   d}t |d}g }t|}| jD ]=}|||d\}}	| jrA| }
| jt |dt |
d|d}|| }||
 }|||   }n|| }|| }||	 qt|}t |d}|||fS )Ng        B D T -> B T Dr9   B T D -> B D T)	predictedtarget
target_len)	r   r[   
zeros_likerV  rE  detachrU  r   stack)r/   r:   r;   r[  residual
index_listr=  codebookdequantized_i	indices_idequantized_i_constcommit_loss_ir   r3   r3   r4   rJ     s,   





zResidualVectorQuantizer.forwardrY  r6   r9   r   rZ  rH  c           	      C   sV   t |d}g }| jD ]}|j||d}|j||d}|| }|| q
t|}|S )Nr^  r9   rJ  )r   rV  rI  rK  r   r[   re  )	r/   r:   r;   rf  rg  rh  rj  ri  r   r3   r3   r4   rI    s   


zResidualVectorQuantizer.encoderJ  r=  c                 C   s^   t j|jd |jd | jg|jd}t|| jD ]\}}|j||d}|| }qt|d}|S )Nr"   r!   r   rJ  r_  )	r[   r(  r   r  r   r   rV  rK  r   )r/   r   r;   r=  codebook_indicesrh  ri  r3   r3   r4   rK  %  s   $

zResidualVectorQuantizer.decode)rQ  ru   r  r  r  )rK   rL   rM   rL  rN   rM  r   r)   rP   rE   r   r   r   rJ   r   r   r=   r   r   rI  rK  rQ   r3   r3   r1   r4   rP    sN    
" 

 	rP  c                       s   e Zd ZdZdededef fddZedd Zed	d
 Zedd Z	e
 dd Ze
ede eede ddede iddededefddZe
ede eede ddede iddededefddZ  ZS )GroupResidualVectorQuantizera7  Split the input vector into groups and apply RVQ on each group separately.

    Args:
        num_codebooks: total number of codebooks
        num_groups: number of groups to split the input into, each group will be quantized separately using num_codebooks//num_groups codebooks
        codebook_dim: embedding dimension, will be split into num_groups
        **kwargs: parameters of ResidualVectorQuantizer

    References:
        Yang et al, HiFi-Codec: Group-residual Vector quantization for High Fidelity Audio Codec, 2023 (http://arxiv.org/abs/2305.02765).
    rR  
num_groupsr  c                    s   t    |_|_|_tj fddtjD _	t
djj t
dj t
dj t
dj t
dj t
dj d S )	Nc                    s$   g | ]}t djjd  qS ))rR  r  r3   )rP  num_codebooks_per_groupcodebook_dim_per_grouprS  kwargsr/   r3   r4   r   N  s    z9GroupResidualVectorQuantizer.__init__.<locals>.<listcomp>zInitialized %s withz	num_codebooks:           %dz	num_groups:              %dz	codebook_dim:            %dz	num_codebooks_per_group: %dz	codebook_dim_per_group:  %d)r(   r)   rR  ro  r  r[   r\   r   r  rvqsr   debugr2   rK   rp  rq  )r/   rR  ro  r  rs  r1   rr  r4   r)   E  s   
	z%GroupResidualVectorQuantizer.__init__c                 C   6   | j | j dkrtd| j  d| j d| j | j S )z#Number of codebooks for each group.r   znum_codebooks (#) must be divisible by num_groups ())rR  ro  r`   r>   r3   r3   r4   rp  ]  s
   z4GroupResidualVectorQuantizer.num_codebooks_per_groupc                 C   rv  )z&Input vector dimension for each group.r   zcodebook_dim (rw  rx  )r  ro  r`   r>   r3   r3   r4   rq  g  s   z3GroupResidualVectorQuantizer.codebook_dim_per_groupc                 C   rW  rX  r\  r>   r3   r3   r4   rE   p  r]  z)GroupResidualVectorQuantizer.output_typesc                 C   s   |j | jdd}g g }}d}t|| jD ]\}}|||d\}	}
}||	 ||
 ||7 }qtj|dd}tj|dd}|||fS )z=Quantize each group separately, then concatenate the results.r"   r   r   r9   )chunkro  r   rt  r   r[   r   )r/   r:   r;   inputs_groupedr=  r   r[  in_group	rvq_groupdequantized_groupindices_groupcommit_loss_groupr3   r3   r4   rJ   x  s   




z$GroupResidualVectorQuantizer.forwardrY  r6   r9   r   rZ  rH  r:   r;   r   c                 C   T   |j | jdd}g }t|| jD ]\}}|j||d}|| qtj|dd}|S )z`Input is split into groups, each group is encoded separately, then the results are concatenated.r"   r   r9   r   )ry  ro  r   rt  rI  r   r[   r   )r/   r:   r;   rz  r   r{  r|  r~  r3   r3   r4   rI    s   	z#GroupResidualVectorQuantizer.encoderJ  r=  c                 C   r  )ziInput indices are split into groups, each group is decoded separately, then the results are concatenated.r   r   rJ  r"   )ry  ro  r   rt  rK  r   r[   r   )r/   r   r;   indices_groupedr=  r~  r|  r}  r3   r3   r4   rK    s   z#GroupResidualVectorQuantizer.decode)rK   rL   rM   rL  rN   r)   rP   rp  rq  rE   r   rJ   r   r   r=   r   r   r   rI  rK  rQ   r3   r3   r1   r4   rn  8  s4    
	




 	rn  )r   )r   )=typingr   r   r   r[   torch.nnr\   torch.nn.functional
functionalr1  einopsr   r   r   #nemo.collections.common.parts.utilsr   ,nemo.collections.tts.losses.audio_codec_lossr	   0nemo.collections.tts.modules.audio_codec_modulesr
   r   r   r   r   r   ,nemo.collections.tts.parts.utils.distributedr   nemo.core.classes.commonr   nemo.core.classes.moduler   nemo.core.neural_types.elementsr   r   r   r   r   r   "nemo.core.neural_types.neural_typer   
nemo.utilsr   nemo.utils.decoratorsr   r   rR   rn   r   r   r   rM  r   rN   r   r   r   r  r  r  rP  rn  r3   r3   r3   r4   <module>   sB   %  ()\ZD($&  |