o
    ̳i2:                     @   sF  d dl mZ d dlmZ ddlmZ ddlT d dlmZm	Z	 d dl
mZ dd	 Zd
d ZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZd(dejdedejfddZdejdejfd d!Zdejdejfd"d#ZG d$d% d%ejZG d&d' d'ejZdS ))    N)	rearrange   )activations)*)OptionalTuple)parametrizationsc                  O   $   t j| i |}tj|ddd |S Nweightr   )namedim)nnConv1dr   weight_norm)argskwargsconv r   E/home/ubuntu/.local/lib/python3.10/site-packages/xcodec2/vq/module.pyWNConv1d      r   c                  O   r	   r
   )r   ConvTranspose1dr   r   )r   r   convtr   r   r   WNConvTranspose1d   r   r   c                       s0   e Zd Zd	dedef fddZdd Z  ZS )
ResidualUnit   r   r   dilationc              
      sd   t    d| d }tttj|dddt||d||dttj|dddt||dd	| _d S )
N      Talpha_logscale
activation   )kernel_sizer   paddingr   )r%   	super__init__r   
SequentialActivation1dr   	SnakeBetar   block)selfr   r   pad	__class__r   r   r)   "   s   

zResidualUnit.__init__c                 C   s   ||  | S Nr-   r.   xr   r   r   forward,   s   zResidualUnit.forward)r   r   __name__
__module____qualname__intr)   r6   __classcell__r   r   r0   r   r   !   s    
r   c                       s0   e Zd Zd
dedef fddZdd	 Z  ZS )EncoderBlockr   r   r      	   r   stridec              
      sp   t     fdd|D }tjg |ttj d dddt d  d| ||d |d  dR  | _d S )Nc                    s   g | ]
}t  d  |dqS )r   r   r   .0dr   r   r   
<listcomp>2   s    z)EncoderBlock.__init__.<locals>.<listcomp>r   Tr    r"   )r%   rA   r&   r'   )r.   r   rA   	dilationsrunitsr0   rG   r   r)   0   s   
zEncoderBlock.__init__c                 C   
   |  |S r2   r3   r4   r   r   r   r6   ?      
zEncoderBlock.forward)r   r   r>   r7   r   r   r0   r   r=   /   s    r=   c                       s4   e Zd Zddededef fdd	Zd
d Z  ZS )DecoderBlockr      r   r>   	input_dim
output_dimrA   c                    sj   t    tttj|dddt| d| ||d |d  |d d| _| j	 fdd|D  d S )NTr    r"   r   )r%   rA   r&   output_paddingc                    s   g | ]}t  |d qS )rB   rC   rD   rP   r   r   rH   P   s    z)DecoderBlock.__init__.<locals>.<listcomp>)
r(   r)   r   r*   r+   r   r,   r   r-   extend)r.   rO   rP   rA   rI   r0   rR   r   r)   C   s   
zDecoderBlock.__init__c                 C   rK   r2   r3   r4   r   r   r   r6   R   rL   zDecoderBlock.forward)r   rN   r   r>   r7   r   r   r0   r   rM   B   s    rM   c                	       s>   e Zd Z			ddedededef fdd	Zd
d Z  ZS )ResLSTMr   FT	dimension
num_layersbidirectionalskipc                    s6   t    || _tj||s|n|d |d|d| _d S )Nr   T)batch_firstrW   )r(   r)   rX   r   LSTMlstm)r.   rU   rV   rW   rX   r0   r   r   r)   V   s   
zResLSTM.__init__c                 C   s4   t |d}| |\}}| jr|| }t |d}|S )z[
        Args:
            x: [B, F, T]

        Returns:
            y: [B, F, T]
        zb f t -> b t fzb t f -> b f t)r   r[   rX   )r.   r5   y_r   r   r   r6   `   s   

zResLSTM.forward)r   FTr8   r9   r:   r;   boolr)   r6   r<   r   r   r0   r   rT   U   s    
rT   c                
       s\   e Zd ZdZ	ddedededee f fddZdd	ej	d
eej	 dej	fddZ
  ZS )ConvNeXtBlocka  ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal.

    Args:
        dim (int): Number of input channels.
        intermediate_dim (int): Dimensionality of the intermediate layer.
        layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling.
            Defaults to None.
        adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm.
            None means non-conditional LayerNorm. Defaults to None.
    Nr   intermediate_dimlayer_scale_init_valueadanorm_num_embeddingsc                    s   t    tj||dd|d| _|d u| _|r t||dd| _ntj|dd| _t	||| _
t | _t	||| _|dkrNtj|t| dd| _d S d | _d S )	Nr$   r?   )r%   r&   groupsư>epsr   Trequires_grad)r(   r)   r   r   dwconvadanormAdaLayerNormnorm	LayerNormLinearpwconv1GELUactpwconv2	Parametertorchonesgamma)r.   r   ra   rb   rc   r0   r   r   r)   }   s   


zConvNeXtBlock.__init__r5   cond_embedding_idreturnc                 C   s   |}|  |}|dd}| jr|d usJ | ||}n| |}| |}| |}| |}| jd ur;| j| }|dd}|| }|S )Nr   r   )rj   	transposerk   rm   rp   rr   rs   rw   )r.   r5   rx   residualr   r   r   r6      s   






zConvNeXtBlock.forwardr2   )r8   r9   r:   __doc__r;   floatr   r)   ru   Tensorr6   r<   r   r   r0   r   r`   q   s    *r`   c                       sL   e Zd ZdZddededef fddZdejd	ejd
ejfddZ	  Z
S )rl   z
    Adaptive Layer Normalization module with learnable embeddings per `num_embeddings` classes

    Args:
        num_embeddings (int): Number of embeddings.
        embedding_dim (int): Dimension of the embeddings.
    re   num_embeddingsembedding_dimrg   c                    s^   t    || _|| _tj||d| _tj||d| _tjj	
| jj tjj	| jj d S )N)r   r   )r(   r)   rg   r   r   	Embeddingscaleshiftru   initones_r   zeros_)r.   r   r   rg   r0   r   r   r)      s   
zAdaLayerNorm.__init__r5   rx   ry   c                 C   s<   |  |}| |}tjj|| jf| jd}|| | }|S )Nrf   )r   r   r   
functional
layer_normr   rg   )r.   r5   rx   r   r   r   r   r   r6      s
   

zAdaLayerNorm.forward)re   )r8   r9   r:   r|   r;   r}   r)   ru   r~   r6   r<   r   r   r0   r   rl      s    $	rl   c                       s   e Zd ZdZ				ddededeeeef d	ed
ee f
 fddZde	j
de	j
fddZdd ZeddededefddZ  ZS )	ResBlock1a  
    ResBlock adapted from HiFi-GAN V1 (https://github.com/jik876/hifi-gan) with dilated 1D convolutions,
    but without upsampling layers.

    Args:
        dim (int): Number of input channels.
        kernel_size (int, optional): Size of the convolutional kernel. Defaults to 3.
        dilation (tuple[int], optional): Dilation factors for the dilated convolutions.
            Defaults to (1, 3, 5).
        lrelu_slope (float, optional): Negative slope of the LeakyReLU activation function.
            Defaults to 0.1.
        layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling.
            Defaults to None.
    r?   r   r?      皙?Nr   r%   r   lrelu_sloperb   c                    s|  t    || _tttj|||d|d | ||d dttj|||d|d | ||d dttj|||d|d | ||d dg| _tttj|||dd| |ddttj|||dd| |ddttj|||dd| |ddg| _	t
|d urtj|t|d ddnd |d urtj|t|d ddnd |d urtj|t|d ddnd g| _d S )Nr   r   )r   r&   r   Trh   )r(   r)   r   r   
ModuleListr   r   get_paddingconvs1convs2ParameterListrt   ru   rv   rw   )r.   r   r%   r   r   rb   r0   r   r   r)      sj   


#   
zResBlock1.__init__r5   ry   c                 C   sr   t | j| j| jD ]-\}}}tjjj|| jd}||}tjjj|| jd}||}|d ur2|| }|| }q	|S )N)negative_slope)	zipr   r   rw   ru   r   r   
leaky_relur   )r.   r5   c1c2rw   xtr   r   r   r6     s   
zResBlock1.forwardc                 C   s4   | j D ]}t|d q| jD ]}t|d qd S )Nr   )r   r   remove_parametrizationsr   )r.   lr   r   r   remove_weight_norm   s
   

zResBlock1.remove_weight_normr   c                 C   s   t | | | d S )Nr   )r;   )r%   r   r   r   r   r   &  s   zResBlock1.get_padding)r?   r   r   N)r   )r8   r9   r:   r|   r;   r   r}   r   r)   ru   r~   r6   r   staticmethodr   r<   r   r   r0   r   r      s*    C"r   Hz>r5   clip_valry   c                 C   s   t t j| |dS )aU  
    Computes the element-wise logarithm of the input tensor with clipping to avoid near-zero values.

    Args:
        x (Tensor): Input tensor.
        clip_val (float, optional): Minimum value to clip the input tensor. Defaults to 1e-7.

    Returns:
        Tensor: Element-wise logarithm of the input tensor with clipping applied.
    )min)ru   logclip)r5   r   r   r   r   safe_log+  s   r   c                 C   s   t | t |   S r2   )ru   signlog1pabsr5   r   r   r   symlog9  s   r   c                 C   s   t | t |  d  S )Nr   )ru   r   expr   r   r   r   r   symexp=  s   r   c                       @   e Zd Z		ddededededef
 fdd	Zd
d Z  ZS )SemanticEncoderr?   Tinput_channelscode_dimencode_channelsr%   biasc                       t t|   tj|||d|d d dd| _ttjddtj|||d|d d |dtjddtj|||d|d d |d| _tj|||d|d d dd| _	d S Nr   r   F)in_channelsout_channelsr%   rA   r&   r   T)inplace)r%   rA   r&   r   )
r(   r   r)   r   r   initial_convr*   ReLUresidual_blocks
final_conv)r.   r   r   r   r%   r   r0   r   r   r)   C  sJ   






zSemanticEncoder.__init__c                 C   s&   |  |}| || }| |}|S )u   
        前向传播方法。

        Args:
            x (Tensor): 输入张量，形状为 (Batch, Input_channels, Length)

        Returns:
            Tensor: 编码后的张量，形状为 (Batch, Code_dim, Length)
        r   r   r   r4   r   r   r   r6   w  s   


zSemanticEncoder.forwardr?   Tr^   r   r   r0   r   r   B  s    4r   c                       r   )SemanticDecoderr?   Tr   output_channelsdecode_channelsr%   r   c                    r   r   )
r(   r   r)   r   r   r   r*   r   r   r   )r.   r   r   r   r%   r   r0   r   r   r)     s.   




zSemanticDecoder.__init__c                 C   s&   |  |}| || }| |}|S r2   r   )r.   zr5   r   r   r   r6     s   

zSemanticDecoder.forwardr   r^   r   r   r0   r   r     s    &r   )r   )torch.nnr   einopsr    r   alias_free_torchtypingr   r   torch.nn.utilsr   r   r   Moduler   r=   rM   rT   r`   rl   r   ru   r~   r}   r   r   r   r   r   r   r   r   r   <module>   s(    
7iD