o
    㥵i8+                  	   @   s  d dl Z d dlmZ d dlmZ d dlZd dlZd dlmZ d dl	m
Z
 d dlmZ ddlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ dd ZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd de
eZedkrFd dlZd dlmZ e dZ e ! D ]"\Z"Z#e#$ Z%e&dd e#' D Z(d d! Z)e*e#d"ee)e%e(d# qe+e  e+d$e&d%d e ' D  d&Z,e-dde,e j.Z/e/0d' e/1  e e/d( Z2e+d)e/j3 e+d*e2j3 e4e2Z5de5dddde5j3d+ d, f< e26e5 e/j57d Z8e8d k&d Z8e8d k& Z9e+d-e9:   ee-ddd.d/Z/e j;e j<e/d'd0d'd0 dS dS )1    N)List)Union)AudioSignal)	BaseModel)nn   )
CodecMixin)Snake1d)WNConv1d)WNConvTranspose1d)ResidualVectorQuantizec                 C   s6   t | tjrtjj| jdd tj| jd d S d S )Ng{Gz?)stdr   )
isinstancer   Conv1dinittrunc_normal_weight	constant_bias)m r   A/home/ubuntu/.local/lib/python3.10/site-packages/dac/model/dac.pyinit_weights   s   r   c                       0   e Zd Zd	dedef fddZdd Z  ZS )
ResidualUnit   r   dimdilationc              
      sL   t    d| d }tt|t||d||dt|t||dd| _d S )N         )kernel_sizer   paddingr   )r!   )super__init__r   
Sequentialr	   r
   block)selfr   r   pad	__class__r   r   r$      s   

zResidualUnit.__init__c                 C   sD   |  |}|jd |jd  d }|dkr|d|| f }|| S )Nr   r   .)r&   shape)r'   xyr(   r   r   r   forward#   s
   
zResidualUnit.forwardr   r   __name__
__module____qualname__intr$   r/   __classcell__r   r   r)   r   r      s    
r   c                       r   )
EncoderBlockr   r   r   stridec                    sn   t    tt|d ddt|d ddt|d ddt|d t|d |d| |t|d d| _	d S )Nr   r   r      	   r!   r8   r"   )
r#   r$   r   r%   r   r	   r
   mathceilr&   )r'   r   r8   r)   r   r   r$   ,   s   


zEncoderBlock.__init__c                 C   
   |  |S Nr&   r'   r-   r   r   r   r/   <      
zEncoderBlock.forwardr0   r1   r   r   r)   r   r7   +   s    r7   c                       s>   e Zd Zdg ddfdededef fddZdd	 Z  ZS )
Encoder@   r         rH   d_modelstridesd_latentc              	      s   t    td|dddg| _|D ]}|d9 }|  jt||dg7  _q|  jt|t||dddg7  _tj| j | _|| _d S )Nr   r    r:   r!   r"   r   )r8   )	r#   r$   r
   r&   r7   r	   r   r%   enc_dim)r'   rI   rJ   rK   r8   r)   r   r   r$   A   s   

zEncoder.__init__c                 C   r?   r@   rA   rB   r   r   r   r/   Z   rC   zEncoder.forward)r2   r3   r4   r5   listr$   r/   r6   r   r   r)   r   rD   @   s    rD   c                       s4   e Zd Zddededef fddZd	d
 Z  ZS )DecoderBlockr   rH   r   	input_dim
output_dimr8   c                    sZ   t    tt|t||d| |t|d dt|ddt|ddt|dd| _	d S )Nr   r<   r   r9   r:   r;   )
r#   r$   r   r%   r	   r   r=   r>   r   r&   )r'   rP   rQ   r8   r)   r   r   r$   _   s   




zDecoderBlock.__init__c                 C   r?   r@   rA   rB   r   r   r   r/   o   rC   zDecoderBlock.forward)r   rH   r   r1   r   r   r)   r   rO   ^   s    rO   c                       s.   e Zd Z	ddef fddZdd Z  ZS )Decoderr   d_outc           
         s   t    t||dddg}t|D ]\}}|d|  }|d|d   }	|t||	|g7 }q|t|	t|	|dddt g7 }tj| | _	d S )Nr    r:   rL   r   r   )
r#   r$   r
   	enumeraterO   r	   r   Tanhr%   model)
r'   input_channelchannelsratesrS   layersir8   rP   rQ   r)   r   r   r$   t   s   
zDecoder.__init__c                 C   r?   r@   )rV   rB   r   r   r   r/      rC   zDecoder.forward)r   r1   r   r   r)   r   rR   s   s    rR   c                       s   e Zd Zdg dddg ddddd	d
f
dedee dededee dededeeef dedef fddZdd Z		d"de
jdefddZde
jfddZ		d#de
jdedefd d!Z  ZS )$DACrE   rF   Ni   )rH   rH   rG   r   r;   i   rH   FD  encoder_dimencoder_rates
latent_dimdecoder_dimdecoder_ratesn_codebookscodebook_sizecodebook_dimquantizer_dropoutsample_ratec                    s   t    || _|| _|| _|| _|
| _|d u r |dt|  }|| _t	
|| _t|||| _|| _|| _|| _t|||||	d| _t|||| _|
| _| t |  | _d S )Nr   )rP   rc   rd   re   rf   )r#   r$   r^   r_   ra   rb   rg   lenr`   npprod
hop_lengthrD   encoderrc   rd   re   r   	quantizerrR   decoderapplyr   	get_delaydelay)r'   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   r)   r   r   r$      s:   

zDAC.__init__c                 C   sV   |d u r| j }|| j ksJ |jd }t|| j | j | }tj|d|f}|S )Nr+   r   )rg   r,   r=   r>   rk   r   
functionalr(   )r'   
audio_datarg   length	right_padr   r   r   
preprocess   s   
zDAC.preprocessrs   n_quantizersc                 C   s.   |  |}| ||\}}}}}|||||fS )ae  Encode given audio data and return quantized latent codes

        Parameters
        ----------
        audio_data : Tensor[B x 1 x T]
            Audio data to encode
        n_quantizers : int, optional
            Number of quantizers to use, by default None
            If None, all quantizers are used.

        Returns
        -------
        dict
            A dictionary with the following keys:
            "z" : Tensor[B x D x T]
                Quantized continuous representation of input
            "codes" : Tensor[B x N x T]
                Codebook indices for each codebook
                (quantized discrete representation of input)
            "latents" : Tensor[B x N*D x T]
                Projected latents (continuous representation of input before quantization)
            "vq/commitment_loss" : Tensor[1]
                Commitment loss to train encoder to predict vectors closer to codebook
                entries
            "vq/codebook_loss" : Tensor[1]
                Codebook loss to update the codebook
            "length" : int
                Number of samples in input audio
        )rl   rm   )r'   rs   rw   zcodeslatentscommitment_losscodebook_lossr   r   r   encode   s
   
"z
DAC.encoderx   c                 C   r?   )a  Decode given latent codes and return audio data

        Parameters
        ----------
        z : Tensor[B x D x T]
            Quantized continuous representation of input
        length : int, optional
            Number of samples in output audio, by default None

        Returns
        -------
        dict
            A dictionary with the following keys:
            "audio" : Tensor[B x 1 x length]
                Decoded audio data.
        )rn   )r'   rx   r   r   r   decode   s   
z
DAC.decodec                 C   sT   |j d }| ||}| ||\}}}}}	| |}
|
dd|f |||||	dS )a%  Model forward pass

        Parameters
        ----------
        audio_data : Tensor[B x 1 x T]
            Audio data to encode
        sample_rate : int, optional
            Sample rate of audio data in Hz, by default None
            If None, defaults to `self.sample_rate`
        n_quantizers : int, optional
            Number of quantizers to use, by default None.
            If None, all quantizers are used.

        Returns
        -------
        dict
            A dictionary with the following keys:
            "z" : Tensor[B x D x T]
                Quantized continuous representation of input
            "codes" : Tensor[B x N x T]
                Codebook indices for each codebook
                (quantized discrete representation of input)
            "latents" : Tensor[B x N*D x T]
                Projected latents (continuous representation of input before quantization)
            "vq/commitment_loss" : Tensor[1]
                Commitment loss to train encoder to predict vectors closer to codebook
                entries
            "vq/codebook_loss" : Tensor[1]
                Codebook loss to update the codebook
            "length" : int
                Number of samples in input audio
            "audio" : Tensor[B x 1 x length]
                Decoded audio data.
        r+   .N)audiorx   ry   rz   zvq/commitment_losszvq/codebook_loss)r,   rv   r}   r~   )r'   rs   rg   rw   rt   rx   ry   rz   r{   r|   r-   r   r   r   r/     s   
(
zDAC.forwardr@   )NN)r2   r3   r4   r5   r   r   rN   boolr$   rv   torchTensorr}   r~   r/   r6   r   r   r)   r   r\      sb    
	
2
(r\   __main__)partialcpuc                 C      g | ]	}t | qS r   ri   rj   size.0pr   r   r   
<listcomp>M      r   c                 C   s   | d|d dd S )N g    .Az<.3fz	M params.r   or   r   r   r   <lambda>N  s    r   
extra_reprr   zTotal # of params: c                 C   r   r   r   r   r   r   r   r   Q  r   i Tr   zInput shape:zOutput shape:r+   r   zReceptive field: i_( r]   )verbose)=r=   typingr   r   numpyri   r   
audiotoolsr   audiotools.mlr   r   baser   dac.nn.layersr	   r
   r   dac.nn.quantizer   r   Moduler   r7   rD   rO   rR   r\   r2   	functoolsr   torV   named_modulesnr   r   r   sum
parametersr   fnsetattrprintrt   randndevicer-   requires_grad_retain_gradoutr,   
zeros_likegradbackwardsqueezegradmaprfitem
decompresscompressr   r   r   r   <module>   sb      
3

 
