o
    ϯi[                     @   s  d dl Z d dlmZmZ d dlmZ d dlZd dlZd dlZd dl	m
Z
 d dlmZ ddlmZ d dlmZmZmZmZmZ d d	lmZ d d
lmZ d dlmZ dd ZG dd dejZG dd dejZG dd dejZG dd dejZddgddgddgdZ G dd dejZ!ddgddgddgdZ"G dd  d ejZ#G d!d" d"ejZ$G d#d$ d$ejZ%G d%d& d&ejZ&G d'd( d(e
eZ'G d)d* d*e'Z(dS )+    N)ListOptional)Union)	BaseModel)nn   )
CodecMixin)MsgProcessor
NormConv1dNormConvTranspose1dSnake1d
activation)ResidualVectorQuantize)VAEBottleneck)hf_hub_downloadc                 C   s6   t | tjrtjj| jdd tj| jd d S d S )Ng{Gz?)stdr   )
isinstancer   Conv1dinittrunc_normal_weight	constant_bias)m r   G/home/ubuntu/.local/lib/python3.10/site-packages/dacvae/model/dacvae.pyinit_weights   s   r   c                       sz   e Zd Z										dded	ed
edededededededef fddZdejdejfddZ	dd Z
  ZS )ResidualUnit      r   SnakenoneFweight_normdimkerneldilationactstridecompresspad_modecausalnorm	true_skipc                    s   t    |dg}|dg}|| }|dkrd|i}n|dkr#ddi}ntd| g }tt||D ]1\}\}}|dkr?|n|}|t|d krK|n|}|tdd	|i|t||||||	||d
g7 }q3tj	| | _
|
| _d S )Nr   r    channelsELUalpha      ?zUnsupported activation: r   r&   )kernel_sizer'   r%   r+   r*   r)   r   )super__init__
ValueError	enumerateziplenr   r
   r   
Sequentialblockr,   )selfr#   r$   r%   r&   r'   r(   r)   r*   r+   r,   kernels	dilationshidden
act_paramslayersir1   in_chsout_chs	__class__r   r   r3      s8   



zResidualUnit.__init__xyc                 C   s@   | j r|S |jd |jd  d }|dkr|d|| f }|S )N   r   .)r,   shape)r:   rE   rF   padr   r   r   shortcutJ   s   zResidualUnit.shortcutc                 C   s   |  |}|| || S N)r9   rK   )r:   rE   rF   r   r   r   forwardR   s   
zResidualUnit.forward)
r   r   r   r    r   r   r!   Fr"   F)__name__
__module____qualname__intstrboolr3   torchTensorrK   rM   __classcell__r   r   rC   r   r      sD    	
0r   c                       s0   e Zd Zd	dedef fddZdd Z  ZS )
EncoderBlockr   r   r#   r'   c                    sj   t    tt|d dddt|d dddt|d dddt|d t|d |d| |dd| _d S )	NrH   r   r   )r$   r%      	   r!   )r1   r'   r)   )r2   r3   r   r8   r   r   r
   r9   )r:   r#   r'   rC   r   r   r3   X   s   


zEncoderBlock.__init__c                 C   
   |  |S rL   r9   r:   rE   r   r   r   rM   h      
zEncoderBlock.forward)r   r   )rN   rO   rP   rQ   r3   rM   rV   r   r   rC   r   rW   W   s    rW   c                       s0   e Zd Zdddef fddZdd Z  ZS )	LSTMBlockT)skipr_   c                   s&   t    || _tj|i || _d S rL   )r2   r3   r_   r   LSTMlstm)r:   r_   argskwargsrC   r   r   r3   m   s   
zLSTMBlock.__init__c                 C   s8   | ddd}| |\}}| jr|| }| dddS )NrH   r   r   )permutera   r_   )r:   rE   rF   _r   r   r   rM   r   s
   zLSTMBlock.forward)rN   rO   rP   rS   r3   rM   rV   r   r   rC   r   r^   l   s    r^   c                       s>   e Zd Zdg ddfdededef fddZdd	 Z  ZS )
Encoder@   rH         rj   d_modelstridesd_latentc                    sr   t    td|dddg}|D ]}|d9 }|t||dg7 }q|t|t||dddg7 }tj| | _|| _d S )Nr   r   r!   )r1   r)   rH   )r'   rX   )	r2   r3   r
   rW   r   r   r8   r9   enc_dim)r:   rk   rl   rm   r?   r'   rC   r   r   r3   {   s   

zEncoder.__init__c                 C   rZ   rL   r[   r\   r   r   r   rM      r]   zEncoder.forward)rN   rO   rP   rQ   listr3   rM   rV   r   r   rC   r   rf   z   s    rf   r    r.   r!   autor"   )actsr)   r+   c                       s   e Zd Z									ddededed	ed
eee  deee  deee  dedee f fddZdd Zdd Z	dd Z
  ZS )DecoderBlockr   rj   r   NrX   	input_dim
output_dimr'   	stride_wmrq   	pad_modesnormsdownsampling_factorlast_kernel_sizec
                    s  t    |d u rtd }|d u rtd }|d u rtd }||g}
|}|}g }t||||
D ]7\}}}}|dkr?d|i}d}nddi}d	}|| }|| }|tdd
|i|t||d| ||||dg7 }q.|t|ddddddddt|ddddddddt|| dddd	ddd	dt|| dddd	ddd	dt|ddddddddg7 }|	d ur|t||	ddddd	dg7 }n|t g7 }|tj	ddt
||d| |d	dddg7 }t|| _t|| _d S )Nrq   r)   r+   r    r-   Fr/   r0   Tr&   rH   )r1   r'   r*   r)   r+   r   r!   r"   )r%   r&   r(   r*   r)   r+   r,   rX   r.   rp   )r$   r&   r(   r*   r)   r+   r,   rY   )r$   r&   r)   r+   r*   r,   r/   r   )r2   r3   default_decoder_convtr_kwargsr6   r   r   r   r   Identityr.   r
   
ModuleListr9   r7   _chunk_size)r:   rs   rt   r'   ru   rq   rv   rw   rx   ry   conv_stridesconv_in_dimconv_out_dimr?   r&   r+   r)   conv_strider>   r*   rC   r   r   r3      s^   

zDecoderBlock.__init__c                    sN   t  j} fddtd| jD } fddt|D }tj| }||S )Nc                        g | ]} j || j  qS r   r9   r~   .0r@   r:   r   r   
<listcomp>        z(DecoderBlock.forward.<locals>.<listcomp>r   c                    s,   g | ]\}}| j  d kr|D ]}|qqS r   r~   r   jchunklayerr   r   r   r         , r7   r9   ranger~   r5   r   r8   )r:   rE   	layer_cntchunksgroupr   r   r   rM      s
   

zDecoderBlock.forwardc                    sV   t  j} fddtd| jD } fddt|D }tj|t |d d   S )Nc                    r   r   r   r   r   r   r   r      r   z/DecoderBlock.upsample_group.<locals>.<listcomp>r   c                    ,   g | ]\}}| j  d kr|D ]}|qqS r   r   r   r   r   r   r      r   rH   r   r:   r   r   r   r   r   r   upsample_group      
zDecoderBlock.upsample_groupc                    sV   t  j} fddtd| jD } fddt|D }tj|d t |d   S )Nc                    r   r   r   r   r   r   r   r      r   z1DecoderBlock.downsample_group.<locals>.<listcomp>r   c                    r   r   r   r   r   r   r   r      r   rH   r   r   r   r   r   downsample_group   r   zDecoderBlock.downsample_group)	r   rj   r   r   NNNrX   N)rN   rO   rP   rQ   r   r   rR   r3   rM   r   r   rV   r   r   rC   r   rr      s@    


	
Err   Tanhc                       s   e Zd Z								ddededed	ed
ee deee  deee  deee  f fddZdd Zdd Z	dd Z
dd Z  ZS )WatermarkEncoderBlock`             Nin_dimout_dimwm_channelsr=   lstm_layersrq   rv   rw   c	                    s"  t    |d u rtd }|d u rtd }|d u rtd }g }	tt|||D ];\}
\}}}|
dkr4|nd}|
dkr<dn|}|dkrId|i}d}ni }d	}|	tdd
|i|t||d|||dg7 }	q'tj|	 | _	|d urut
|||g}ng }|tjddt||dd	dddg7 }tj| | _d S )Nrq   r)   r+   r   r   r    r-   FTr&   r   )r1   r*   r)   r+   r0   rz   r!   rp   r1   r*   r+   r)   r   )r2   r3   default_wm_encoder_kwargsr5   r6   r   r
   r   r8   prer^   r.   post)r:   r   r   r   r=   r   rq   rv   rw   
pre_layersr@   r&   r+   r)   rs   rt   r>   r*   post_layersrC   r   r   r3     sJ   


zWatermarkEncoderBlock.__init__c                 C   rZ   rL   r   r\   r   r   r   rM   :  r]   zWatermarkEncoderBlock.forwardc              	   C   sH   | j d }ztjj| j d j |  |}|W || j d< S || j d< w NrG   )r   rT   r   utilsremove_weight_normconvr:   rE   _convr   r   r   forward_conv=  s   

z"WatermarkEncoderBlock.forward_convc              	   C   s<   | j d }zt | j d< |  |W || j d< S || j d< w r   r   r   r|   r   r   r   r   forward_no_convF  s
   

z%WatermarkEncoderBlock.forward_no_convc                 C   rZ   rL   r   r\   r   r   r   post_processN  r]   z"WatermarkEncoderBlock.post_process)r   r   r   r   NNNN)rN   rO   rP   rQ   r   r   rR   r3   rM   r   r   r   rV   r   r   rC   r   r      s<    


	9	r   c                       sZ   e Zd Z					ddededed	ed
ee f
 fddZdd Zdd Zdd Z  Z	S )WatermarkDecoderBlockr   r   r   r   Nr   r   r-   r=   r   c              	      sv   t    t||dddddg}|d ur|t|||g7 }tj| | _tjddt||dddddg}tj| | _d S )Nr   Tr!   rp   r   r0   rz   )	r2   r3   r
   r^   r   r8   r   r.   r   )r:   r   r   r-   r=   r   r   r   rC   r   r   r3   T  s   


zWatermarkDecoderBlock.__init__c                 C   s   |  |}|S rL   r   r\   r   r   r   rM   p  s   
zWatermarkDecoderBlock.forwardc              	   C   s@   | j d }zt | j d< |  |}W || j d< d S || j d< w r   r   r   r   r   r   r   t  s
   
z%WatermarkDecoderBlock.forward_no_convc                 C   rZ   rL   r   r\   r   r   r   r   |  r]   z"WatermarkDecoderBlock.post_process)r   r   r   r   N)
rN   rO   rP   rQ   r   r3   rM   r   r   rV   r   r   rC   r   r   S  s(    r   c                       sp   e Zd Z						ddeded	ed
edededee f fddZdefddZdejdejfddZ	  Z
S )Watermarkerr   r   r   r   r   Nr#   d_outrm   r-   r=   nbitsr   c                    sB   t    t|||||d| _t||| _t|||||d| _d S )N)r=   r   )r2   r3   r   encoder_blockr	   msg_processorr   decoder_block)r:   r#   r   rm   r-   r=   r   r   rC   r   r   r3     s   

zWatermarker.__init__bszc                 C   s0   | j d ur
| j j}nd}tjdd||ftjdS )Nr   r   rH   )dtype)r   r   rT   randintfloat32)r:   r   r   r   r   r   random_message  s   

zWatermarker.random_messagerE   msgc                 C   s$   |  |}| ||}| |}|S rL   )r   r   r   )r:   rE   r   r   r   r   rM     s   

zWatermarker.forward)r   r   r   r   r   N)rN   rO   rP   rQ   r   r3   r   rT   rU   rM   rV   r   r   rC   r   r     s0    r   c                       sj   e Zd Z					ddededed	ed
ef
 fddZddeej fddZ	ddeej fddZ
  ZS )Decoderr   r   r   r   linearr   r   r   d_wm_outblendingc
              	      s   t    t||dddg}
tt||D ]\}\}}|d|  }|d|d   }|
t||||g7 }
qt|
| _t	||||d|dd| _
|| | _|	| _d S )Nr   r   )r1   r'   rH   r   )r=   r   r   )r2   r3   r
   r5   r6   rr   r   r}   modelr   wm_modelr/   r   )r:   input_channelr-   rateswm_ratesr   r   r   r   r   r?   r@   r'   	wm_striders   rt   rC   r   r   r3     s   


zDecoder.__init__Nmessagec                 C   s    | j D ]}||}q| ||S rL   )r   	watermark)r:   rE   r   r   r   r   r   rM     s   

zDecoder.forwardc                 C   s  | j dkr|S | j|}tdd | jdd  }t|d d d }|D ]}||}q$| jj|}|d u rA|jd }| j|}|	|j
}| j||}| j|}tdd | jdd  }|D ]}||}qb| jj|}| jdkr| jj|| j |  S | jj|| j |  S )	Ng        c                 S      |   S rL   )r   rE   r   r   r   <lambda>      z#Decoder.watermark.<locals>.<lambda>r   rG   r   c                 S   r   rL   )r   r   r   r   r   r     r   r   )r/   r   r   mapr   ro   r   rI   r   todevicer   r   r   rM   r   )r:   rE   r   h	upsamplerr   r   downsamplerr   r   r   r     s*   




zDecoder.watermark)r   r   r   r   r   rL   )rN   rO   rP   rQ   rR   r3   r   rT   rU   rM   r   rV   r   r   rC   r   r     s&    	
 r   c                       s   e Zd Zdg dddg dddddd	d
fdedee dee dedee deee  dededeeef dedef fddZ	dd Z
	d$dejdee fddZd$dejdeej fd d!Z		d%dejdee dee fd"d#Z  ZS )&DACrg   rh   N   rj   rj   ri   rH   rY      rj   FD  encoder_dimencoder_rates
latent_dimdecoder_dimdecoder_ratesr   n_codebookscodebook_sizecodebook_dimquantizer_dropoutsample_ratec                    s   t    || _|| _|| _|| _|| _|d u r |dt|  }t|t	s'J |d u r/g d}|| _
t|| _t|||| _|| _|| _|	| _t||||	|
d| _t||||| _|| _| t |  | _d S )NrH   )rj      ri   rH   )rs   r   r   r   r   )r2   r3   r   r   r   r   r   r7   r   rQ   r   npprod
hop_lengthrf   encoderr   r   r   r   	quantizerr   decoderapplyr   	get_delaydelay)r:   r   r   r   r   r   r   r   r   r   r   r   rC   r   r   r3     sB   

zDAC.__init__c                 C   sV   |d u r| j }|| j ksJ |jd }t|| j | j | }tj|d|f}|S )NrG   r   )r   rI   mathceilr   r   
functionalrJ   )r:   
audio_datar   length	right_padr   r   r   
preprocess  s   
zDAC.preprocessr   n_quantizersc                 C   s.   |  |}| ||\}}}}}|||||fS )ae  Encode given audio data and return quantized latent codes

        Parameters
        ----------
        audio_data : Tensor[B x 1 x T]
            Audio data to encode
        n_quantizers : int, optional
            Number of quantizers to use, by default None
            If None, all quantizers are used.

        Returns
        -------
        dict
            A dictionary with the following keys:
            "z" : Tensor[B x D x T]
                Quantized continuous representation of input
            "codes" : Tensor[B x N x T]
                Codebook indices for each codebook
                (quantized discrete representation of input)
            "latents" : Tensor[B x N*D x T]
                Projected latents (continuous representation of input before quantization)
            "vq/commitment_loss" : Tensor[1]
                Commitment loss to train encoder to predict vectors closer to codebook
                entries
            "vq/codebook_loss" : Tensor[1]
                Codebook loss to update the codebook
            "length" : int
                Number of samples in input audio
        )r   r   )r:   r   r   zcodeslatentscommitment_losscodebook_lossr   r   r   encode&  s
   
"z
DAC.encoder   r   c                 C   s   | j ||dS )a.  Decode given latent codes and return audio data

        Parameters
        ----------
        z : Tensor[B x D x T]
            Quantized continuous representation of input
        length : int, optional
            Number of samples in output audio, by default None
        message : Tensor[B x nbits], optional
            Message to embed in the audio, by default None

        Returns
        -------
        dict
            A dictionary with the following keys:
            "audio" : Tensor[B x 1 x length]
                Decoded audio data.
        )r   )r   )r:   r   r   r   r   r   decodeO  s   z
DAC.decodec                 C   sT   |j d }| ||}| ||\}}}}}	| |}
|
dd|f |||||	dS )a%  Model forward pass

        Parameters
        ----------
        audio_data : Tensor[B x 1 x T]
            Audio data to encode
        sample_rate : int, optional
            Sample rate of audio data in Hz, by default None
            If None, defaults to `self.sample_rate`
        n_quantizers : int, optional
            Number of quantizers to use, by default None.
            If None, all quantizers are used.

        Returns
        -------
        dict
            A dictionary with the following keys:
            "z" : Tensor[B x D x T]
                Quantized continuous representation of input
            "codes" : Tensor[B x N x T]
                Codebook indices for each codebook
                (quantized discrete representation of input)
            "latents" : Tensor[B x N*D x T]
                Projected latents (continuous representation of input before quantization)
            "vq/commitment_loss" : Tensor[1]
                Commitment loss to train encoder to predict vectors closer to codebook
                entries
            "vq/codebook_loss" : Tensor[1]
                Codebook loss to update the codebook
            "length" : int
                Number of samples in input audio
            "audio" : Tensor[B x 1 x length]
                Decoded audio data.
        rG   .N)audior   r   r   zvq/commitment_losszvq/codebook_loss)rI   r   r   r  )r:   r   r   r   r   r   r   r   r   r   rE   r   r   r   rM   d  s   
(
zDAC.forwardrL   )NN)rN   rO   rP   rQ   r   r   r   ro   rS   r3   r   rT   rU   r   r  rM   rV   r   r   rC   r   r     sh    
	

8
)r   c                       s   e Zd Zdg dddg ddddd	d
f
dedee dee dedee dededeeef dedef fddZ	e
 fddZdd ZdejfddZdejfdd Z  ZS )!DACVAErg   rh   Nr   r   rY   r   rj   Fr   r   r   r   r   r   r   r   r   r   r   c                    s6   t  j|||||||||	|
d
 t| j||d| _d S )N)
r   r   r   r   r   r   r   r   r   r   )rs   r   r   )r2   r3   r   r   r   )r:   r   r   r   r   r   r   r   r   r   r   rC   r   r   r3     s"   zDACVAE.__init__c                    s.   t j|s|drt|dd}t |S )Nz	facebook/zweights.pth)repo_idfilename)ospathexists
startswithr   r2   load)clsr  rC   r   r   r
    s   zDACVAE.loadc                 C   s>   | d}|| j rd| j|| j  f}tjj||dS |S )NrG   r   reflect)sizer   rT   r   r   rJ   )r:   wavsr   p1dr   r   r   _pad  s
   

zDACVAE._padr   c                 C   s@   |  | |}| j|jddd\}}| j||\}}|S )NrH   r   )r#   )r   r  r   in_projr   _vae_sample)r:   r   r   meanscaleencoded_framesre   r   r   r   r     s   zDACVAE.encoder  c                 C   s   | j |}| |S rL   )r   out_projr   )r:   r  embr   r   r   r    s   
zDACVAE.decode)rN   rO   rP   rQ   r   r   r   ro   rS   r3   classmethodr
  r  rT   rU   r   r  rV   r   r   rC   r   r    sN    
	

	r  ))r   typingr   r   r   numpyr   r  rT   audiotools.mlr   r   baser   dacvae.nn.layersr	   r
   r   r   r   dacvae.nn.quantizer   dacvae.nn.bottleneckr   huggingface_hubr   r   Moduler   rW   r^   rf   r{   rr   r   r   r   r   r   r   r  r   r   r   r   <module>   sD   >[S-C <