o
    Si                     @   sv   d dl mZ d dlZd dlmZ d dlmZ d dlmZmZm	Z	 G dd dej
ZG dd	 d	eZG d
d deZdS )    )OptionalN)nn)weight_norm)ConvNeXtBlock	ResBlock1AdaLayerNormc                   @   s&   e Zd ZdZdejdejfddZdS )BackbonezeBase class for the generator's backbone. It preserves the same temporal resolution across all layers.xreturnc                 K   s   t d)ai  
        Args:
            x (Tensor): Input tensor of shape (B, C, L), where B is the batch size,
                        C denotes output features, and L is the sequence length.

        Returns:
            Tensor: Output of shape (B, L, H), where B is the batch size, L is the sequence length,
                    and H denotes the model dimension.
        z-Subclasses must implement the forward method.)NotImplementedErrorselfr	   kwargs r   @/home/ubuntu/.local/lib/python3.10/site-packages/vocos/models.pyforward   s   
zBackbone.forwardN)__name__
__module____qualname____doc__torchTensorr   r   r   r   r   r   
   s    r   c                       sf   e Zd ZdZ		ddededededee dee f fd	d
Zdd Zde	j
de	j
fddZ  ZS )VocosBackbonea  
    Vocos backbone module built with ConvNeXt blocks. Supports additional conditioning with Adaptive Layer Normalization

    Args:
        input_channels (int): Number of input features channels.
        dim (int): Hidden dimension of the model.
        intermediate_dim (int): Intermediate dimension used in ConvNeXtBlock.
        num_layers (int): Number of ConvNeXtBlock layers.
        layer_scale_init_value (float, optional): Initial value for layer scaling. Defaults to `1 / num_layers`.
        adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm.
                                                None means non-conditional model. Defaults to None.
    Ninput_channelsdimintermediate_dim
num_layerslayer_scale_init_valueadanorm_num_embeddingsc                    s   t    || _tj|ddd| _ d u| _ r"t dd| _ntj	dd| _p/d| t
 fddt|D | _tj	dd| _| | j d S )	N      kernel_sizepaddinggư>)eps   c                    s   g | ]
}t  d qS ))r   r   r   r   )r   .0_r   r   r   r   r   r   
<listcomp>;   s    z*VocosBackbone.__init__.<locals>.<listcomp>)super__init__r   r   Conv1dembedadanormr   norm	LayerNorm
ModuleListrangeconvnextfinal_layer_normapply_init_weights)r   r   r   r   r   r   r   	__class__r)   r   r,   (   s   
	
zVocosBackbone.__init__c                 C   s<   t |tjtjfrtjj|jdd tj|jd d S d S )Ng{Gz?)stdr   )	
isinstancer   r-   Linearinittrunc_normal_weight	constant_bias)r   mr   r   r   r7   H   s   zVocosBackbone._init_weightsr	   r
   c                 K   s   | dd }| |}| jr |d usJ | j|dd|d}n	| |dd}|dd}| jD ]}|||d}q2| |dd}|S )Nbandwidth_idr%      )cond_embedding_id)getr.   r/   r0   	transposer4   r5   )r   r	   r   rC   
conv_blockr   r   r   r   M   s   

zVocosBackbone.forward)NN)r   r   r   r   intr   floatr,   r7   r   r   r   __classcell__r   r   r8   r   r      s&     r   c                       s:   e Zd ZdZ	d	 fdd	ZdejdejfddZ  ZS )
VocosResNetBackboneaN  
    Vocos backbone module built with ResBlocks.

    Args:
        input_channels (int): Number of input features channels.
        dim (int): Hidden dimension of the model.
        num_blocks (int): Number of ResBlock1 blocks.
        layer_scale_init_value (float, optional): Initial value for layer scaling. Defaults to None.
    Nc                    s\   t    || _ttj| ddd| _pd| d tj fddt|D  | _	d S )Nr    r%   r!   c                    s   g | ]}t  d qS )r   r   )r   r&   rM   r   r   r*   o   s    z0VocosResNetBackbone.__init__.<locals>.<listcomp>)
r+   r,   r   r   r   r-   r.   
Sequentialr3   resnet)r   r   r   
num_blocksr   r8   rM   r   r,   g   s   

zVocosResNetBackbone.__init__r	   r
   c                 K   s$   |  |}| |}|dd}|S )Nr%   rD   )r.   rO   rG   r   r   r   r   r   r   s   

zVocosResNetBackbone.forward)N)	r   r   r   r   r,   r   r   r   rK   r   r   r8   r   rL   \   s
    rL   )typingr   r   r   torch.nn.utilsr   vocos.modulesr   r   r   Moduler   r   rL   r   r   r   r   <module>   s    B