o
    i*                     @   sv  d dl Z d dlmZmZmZ d dlZd dlZd dlmZ d dl	m  m
Z d dlmZ d dlmZ dd Zdd	 ZG d
d dejZG dd dejZdd Zdd Zejjdd ZG dd dejZdd ZG dd dejZG dd dejZG dd dejZ G dd dejZ!G d d! d!ejZ"G d"d# d#ejjZ#G d$d% d%ejZ$G d&d' d'eZ%G d(d) d)ejZ&dS )*    N)ListUnionOptional)nn)weight_norm)	BaseModelc                  O      t tj| i |S N)r   r   Conv1dargskwargs r   U/home/ubuntu/.local/lib/python3.10/site-packages/voxcpm/modules/audiovae/audio_vae.pyWNConv1d      r   c                  O   r   r	   )r   r   ConvTranspose1dr   r   r   r   WNConvTranspose1d   r   r   c                       s4   e Zd Zdddef fddZ fddZ  ZS )CausalConv1dr   )paddingr   c                   s   t  j|i | || _d S r	   )super__init___CausalConv1d__padding)selfr   r   r   	__class__r   r   r      s   
zCausalConv1d.__init__c                    s"   t || jd df}t |S )N   r   )Fpadr   r   forward)r   xx_padr   r   r   r      s   zCausalConv1d.forward__name__
__module____qualname__intr   r   __classcell__r   r   r   r   r      s    r   c                       s:   e Zd Zddddedef fddZ fddZ  ZS )	CausalTransposeConv1dr   )r   output_paddingr   r)   c                   s"   t  j|i | || _|| _d S r	   )r   r   _CausalTransposeConv1d__padding&_CausalTransposeConv1d__output_padding)r   r   r)   r   r   r   r   r   r      s   
zCausalTransposeConv1d.__init__c                    s&   t  |dd | jd | j  f S )N.r   )r   r   r*   r+   r   r    r   r   r   r   $   s   &zCausalTransposeConv1d.forwardr"   r   r   r   r   r(      s    r(   c                  O      t t| i |S r	   )r   r   r   r   r   r   WNCausalConv1d(      r.   c                  O   r-   r	   )r   r(   r   r   r   r   WNCausalTransposeConv1d,   r/   r0   c                 C   sN   | j }| |d |d d} | |d  t||  d  } | |} | S )Nr      g&.>r   )shapereshape
reciprocaltorchsinpow)r    alphar3   r   r   r   snake1   s
   $
r:   c                       $   e Zd Z fddZdd Z  ZS )Snake1dc                    s$   t    ttd|d| _d S )Nr1   )r   r   r   	Parameterr6   onesr9   )r   channelsr   r   r   r   ;   s   
zSnake1d.__init__c                 C   s   t || jS r	   )r:   r9   r,   r   r   r   r   ?   s   zSnake1d.forwardr#   r$   r%   r   r   r'   r   r   r   r   r<   :       r<   c                 C   sD   t | tjrtjj| jdd | jd ur tj| jd d S d S d S )Ng{Gz?)stdr   )
isinstancer   r
   inittrunc_normal_weightbias	constant_)mr   r   r   init_weightsC   s   
rJ   c                	       s8   e Zd Zddedededef fdd	Zd
d Z  ZS )CausalResidualUnit   r1      dimdilationkernelgroupsc                    sN   t    d| d }tt|t||||||dt|t||dd| _d S )N   r   )kernel_sizerO   r   rQ   r1   rS   )r   r   r   
Sequentialr<   r.   block)r   rN   rO   rP   rQ   r   r   r   r   r   K   s   

zCausalResidualUnit.__init__c                 C   sP   |  |}|jd |jd  d }|dksJ |dkr$|d|| f }|| S )Nr2   r   r   .)rV   r3   )r   r    yr   r   r   r   r   \   s   
zCausalResidualUnit.forward)rL   r1   rM   r1   r"   r   r   r   r   rK   J   s     rK   c                       s0   e Zd Zd
dedef fddZdd	 Z  ZS )CausalEncoderBlockrL   Nr1   
output_dimstridec                    sl   t    |p
|d }tt|d|dt|d|dt|d|dt|t||d| |t|d d| _	d S )Nr   r1   rO   rQ      	   )rS   rZ   r   )
r   r   r   rU   rK   r<   r.   mathceilrV   )r   rY   	input_dimrZ   rQ   r   r   r   r   f   s   

zCausalEncoderBlock.__init__c                 C   
   |  |S r	   rV   r,   r   r   r   r   w      
zCausalEncoderBlock.forward)rL   Nr1   r1   r"   r   r   r   r   rX   e   s    rX   c                	       sD   e Zd Zddg ddfdedededef fd	d
Zdd Z  ZS )CausalEncoder@       )r         rh   Fd_model
latent_dimstrides	depthwisec                    s   t    td|dddg| _|D ]}|d9 }|r|d nd}|  jt|||dg7  _q|r1|nd}t||ddd| _t||ddd| _tj| j | _|| _	d S )Nr1   rM   r\   rS   r   r   )rY   rZ   rQ   )
r   r   r.   rV   rX   fc_mu	fc_logvarr   rU   enc_dim)r   ri   rj   rk   rl   rZ   rQ   r   r   r   r   |   s   

zCausalEncoder.__init__c                 C   s"   |  |}|| || |dS )N)hidden_statemulogvar)rV   rn   ro   )r   r    rq   r   r   r   r      s
   
zCausalEncoder.forward)	r#   r$   r%   r&   listboolr   r   r'   r   r   r   r   rd   {   s    rd   c                       r;   )
NoiseBlockc                    s    t    t||ddd| _d S )Nr1   F)rS   rG   )r   r   r.   linear)r   rN   r   r   r   r      s   
zNoiseBlock.__init__c                 C   sD   |j \}}}tj|d|f|j|jd}| |}|| }|| }|S )Nr1   )devicedtype)r3   r6   randnrx   ry   rw   )r   r    BCTnoisehnr   r   r   r      s   
zNoiseBlock.forwardr@   r   r   r   r   rv      rA   rv   c                	       sB   e Zd Z					ddedededef fd	d
Zdd Z  ZS )CausalDecoderBlockrL   rh   r1   Fr`   rY   rZ   use_noise_blockc              
      s   t    t|t||d| |t|d |d dg}|r%|t| |t	|d|dt	|d|dt	|d|dg t
j| | _d S )Nr   )rS   rZ   r   r)   r1   r[   r\   r]   )r   r   r<   r0   r^   r_   appendrv   extendrK   r   rU   rV   )r   r`   rY   rZ   rQ   r   layersr   r   r   r      s(   
zCausalDecoderBlock.__init__c                 C   ra   r	   rb   r,   r   r   r   r      rc   zCausalDecoderBlock.forward)rL   rh   r1   r1   F)r#   r$   r%   r&   ru   r   r   r'   r   r   r   r   r      s     r   c                   @   s   e Zd Zdd ZdS )TransposeLastTwoDimc                 C   s   t |ddS )Nr2   )r6   	transposer,   r   r   r   r      s   zTransposeLastTwoDim.forwardN)r#   r$   r%   r   r   r   r   r   r      s    r   c                       s:   e Zd Z			d
dededef fddZdd	 Z  ZS )CausalDecoderFr1   rl   d_outr   c              	      s   t    |rt||dd|dt||ddg}n	t||dddg}t|D ]$\}}	|d|  }
|d|d   }|r;|nd}|t|
||	||dg7 }q%|t|t||dddt g7 }tj| | _	d S )	NrM   r\   )rS   r   rQ   r1   rT   rm   r   )rQ   r   )
r   r   r.   	enumerater   r<   r   TanhrU   model)r   input_channelr?   ratesrl   r   r   r   irZ   r`   rY   rQ   r   r   r   r      s>   
	zCausalDecoder.__init__c                 C   ra   r	   )r   r,   r   r   r   r   
  rc   zCausalDecoder.forward)Fr1   F)r#   r$   r%   ru   r&   r   r   r'   r   r   r   r   r      s    2r   c                   @   s~   e Zd ZU dZeed< g dZee ed< dZeed< dZ	eed< g d	Z
ee ed
< dZeed< dZeed< dZeed< dS )AudioVAEConfig   encoder_dim)r      rh   rh   encoder_ratesre   rj   i   decoder_dim)rh   rh   r   r   decoder_ratesTrl   i>  sample_rateFr   N)r#   r$   r%   r   r&   __annotations__r   r   rj   r   r   rl   ru   r   r   r   r   r   r   r     s   
 r   c                       sZ   e Zd ZdZ	ddee f fddZdd Zdej	fd	d
Z
dej	defddZ  ZS )AudioVAEz
    Args:
    Nconfigc           
         s   |d u rt  }t   |j}|j}|j}|j}|j}|j}|j	}|j
}	|| _|| _|| _|| _|| _|	| _
|d u rB|dt|  }|| _t|| _t||||d| _t|||||	d| _|| _	t|| _d S )Nr   )rl   )rl   r   )r   r   r   r   r   rj   r   r   rl   r   r   lennpprod
hop_lengthrd   encoderr   decoderr^   
chunk_size)
r   r   r   r   rj   r   r   rl   r   r   r   r   r   r     sH   
zAudioVAE.__init__c                 C   sX   |d u r| j }|| j ksJ | j}|jd }t|| | | }tj|d|f}|S )Nr2   r   )r   r   r3   r^   r_   r   
functionalr   )r   
audio_datar   pad_tolength	right_padr   r   r   
preprocessO  s   
zAudioVAE.preprocesszc                 C   ra   )a  Decode given latent codes and return audio data

        Parameters
        ----------
        z : Tensor[B x D x T]
            Quantized continuous representation of input
        length : int, optional
            Number of samples in output audio, by default None

        Returns
        -------
        dict
            A dictionary with the following keys:
            "audio" : Tensor[B x 1 x length]
                Decoded audio data.
        )r   )r   r   r   r   r   decodeZ  s   
zAudioVAE.decoder   r   c                 C   s.   |j dkr
|d}| ||}| |d S )z
        Args:
            audio_data: Tensor[B x 1 x T]
            sample_rate: int
        Returns:
            z: Tensor[B x D x T]
        r   r1   rr   )ndim	unsqueezer   r   )r   r   r   r   r   r   encodem  s   

zAudioVAE.encoder	   )r#   r$   r%   __doc__r   r   r   r   r6   Tensorr   r&   r   r'   r   r   r   r   r     s    1r   )'r^   typingr   r   r   numpyr   r6   r   torch.nn.functionalr   r   torch.nn.utilsr   pydanticr   r   r   r
   r   r   r(   r.   r0   jitscriptr:   Moduler<   rJ   rK   rX   rd   rv   r   r   r   r   r   r   r   r   r   <module>   s6    


	%$7