from __future__ import annotations

from typing import Any, Dict, Tuple, Union, Optional

import torch
import yaml
from huggingface_hub import hf_hub_download
from torch import nn

from vocos.feature_extractors import FeatureExtractor, EncodecFeatures
from vocos.heads import FourierHead
from vocos.models import Backbone


def instantiate_class(args: Union[Any, Tuple[Any, ...]], init: Dict[str, Any]) -> Any:
    """Instantiates a class with the given args and init.

    Args:
        args: Positional arguments required for instantiation.
        init: Dict of the form {"class_path":...,"init_args":...}.

    Returns:
        The instantiated class object.
    """
    kwargs = init.get("init_args", {})
    if not isinstance(args, tuple):
        args = (args,)
    class_module, class_name = init["class_path"].rsplit(".", 1)
    module = __import__(class_module, fromlist=[class_name])
    args_class = getattr(module, class_name)
    return args_class(*args, **kwargs)
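
# Illustrative usage sketch (assumption: any importable class works as a target;
# ``torch.nn.Linear`` is just a stand-in here, mirroring the
# {"class_path": ..., "init_args": ...} layout of the yaml configs consumed by
# ``Vocos.from_hparams`` below):
#
#     layer = instantiate_class(
#         args=(),
#         init={"class_path": "torch.nn.Linear",
#               "init_args": {"in_features": 4, "out_features": 2}},
#     )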


class Vocos(nn.Module):
    """
    The Vocos class represents a Fourier-based neural vocoder for audio synthesis.
    This class is primarily designed for inference, with support for loading from pretrained
    model checkpoints. It consists of three main components: a feature extractor,
    a backbone, and a head.
    """

    def __init__(
        self, feature_extractor: FeatureExtractor, backbone: Backbone, head: FourierHead,
    ):
        super().__init__()
        self.feature_extractor = feature_extractor
        self.backbone = backbone
        self.head = head

zVocos.__init__config_pathstrr   c                 C  sv   t |d}t|}W d   n1 sw   Y  td|d d}td|d d}td|d d}| |||d}|S )	z}
        Class method to create a new Vocos model instance from hyperparameters stored in a yaml configuration file.
        rNr#   r'   )r   r   r(   r)   )r'   r(   r)   )openyaml	safe_loadr%   )clsr0   fconfigr'   r(   r)   modelr#   r#   r$   from_hparams1   s   zVocos.from_hparamsNrepo_idrevisionOptional[str]c                 C  sz   t |d|d}t |d|d}| |}tj|dd}t|jtr2dd |jj 	 D }|
| || |  |S )z
        Class method to create a new Vocos model instance from a pre-trained model stored in the Hugging Face model hub.
        zconfig.yaml)r;   filenamer<   zpytorch_model.bincpu)map_locationc                 S  s   i | ]	\}}d | |qS )zfeature_extractor.encodec.r#   ).0keyvaluer#   r#   r$   
<dictcomp>H   s    z)Vocos.from_pretrained.<locals>.<dictcomp>)r   r:   torchloadr   r'   r   encodec
state_dictitemsupdateload_state_dicteval)r6   r;   r<   r0   
model_pathr9   rH   encodec_parametersr#   r#   r$   from_pretrained>   s   
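
    # Illustrative usage sketch (assumption: "charactr/vocos-mel-24khz" is the
    # published mel-spectrogram checkpoint on the Hugging Face hub; substitute
    # your own repo id as needed):
    #
    #     vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")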

    @torch.inference_mode()
    def forward(self, audio_input: torch.Tensor, **kwargs: Any) -> torch.Tensor:
        """
        Method to run a copy-synthesis from audio waveform. The feature extractor first processes the audio input,
        which is then passed through the backbone and the head to reconstruct the audio output.

        Args:
            audio_input (Tensor): The input tensor representing the audio waveform of shape (B, T),
                                  where B is the batch size and T is the waveform length.

        Returns:
            Tensor: The output tensor representing the reconstructed audio waveform of shape (B, T).
        """
        features = self.feature_extractor(audio_input, **kwargs)
        audio_output = self.decode(features, **kwargs)
        return audio_output
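
    # Illustrative copy-synthesis sketch (assumption: a 24 kHz model loaded as
    # ``vocos`` above; the output length may be rounded to the model's hop size):
    #
    #     y = torch.randn(1, 24000)   # (B, T) input waveform
    #     y_hat = vocos(y)            # (B, T) reconstructed waveform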

    @torch.inference_mode()
    def decode(self, features_input: torch.Tensor, **kwargs: Any) -> torch.Tensor:
        """
        Method to decode audio waveform from already calculated features. The features input is passed through
        the backbone and the head to reconstruct the audio output.

        Args:
            features_input (Tensor): The input tensor of features of shape (B, C, L), where B is the batch size,
                                     C denotes the feature dimension, and L is the sequence length.

        Returns:
            Tensor: The output tensor representing the reconstructed audio waveform of shape (B, T).
        """
        x = self.backbone(features_input, **kwargs)
        audio_output = self.head(x)
        return audio_output
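
    # Illustrative decode sketch from precomputed features (assumptions: a mel
    # checkpoint with 100 mel bins and 256 frames of input; both values are
    # placeholders, not defined in this file):
    #
    #     mel = torch.randn(1, 100, 256)   # (B, C, L) features
    #     audio = vocos.decode(mel)        # (B, T) waveform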
zVocos.decodecodesc                 C  s   t | jts
J d| dkr|d}| jjjj}tj	d|t
| ||jd}||ddd }tjj|| jjjdd}|dd}|S )a   
        Transforms an input sequence of discrete tokens (codes) into feature embeddings using the feature extractor's
        codebook weights.

        Args:
            codes (Tensor): The input tensor. Expected shape is (K, L) or (K, B, L),
                            where K is the number of codebooks, B is the batch size and L is the sequence length.

        Returns:
            Tensor: Features of shape (B, C, L), where B is the batch size, C denotes the feature dimension,
                    and L is the sequence length.
        z:Feature extractor should be an instance of EncodecFeatures   r   r   )device)dim)r   r'   r   r\   	unsqueezerG   	quantizerbinsrE   arangelenrZ   viewr	   
functional	embeddingcodebook_weightssum	transpose)r-   rX   n_binsoffsetsembeddings_idxsrS   r#   r#   r$   codes_to_featurest   s   
zVocos.codes_to_features)r'   r
   r(   r   r)   r   )r0   r1   r   r&   r*   )r;   r1   r<   r=   r   r&   )rP   rQ   r   r   r   rQ   )rV   rQ   r   r   r   rQ   )rX   rQ   r   rQ   )__name__
__module____qualname____doc__r,   classmethodr:   rO   rE   inference_moderU   rR   rk   __classcell__r#   r#   r.   r$   r&   !   s    r&   )r   r   r   r   r   r   )
__future__r   typingr   r   r   r   r   rE   r4   huggingface_hubr   r	   vocos.feature_extractorsr
   r   vocos.headsr   vocos.modelsr   r%   Moduler&   r#   r#   r#   r$   <module>   s    

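
# Illustrative reconstruction from EnCodec tokens (assumptions: the
# "charactr/vocos-encodec-24khz" checkpoint, 1024 bins per codebook, and a
# ``bandwidth_id`` keyword accepted by the backbone; these mirror the project
# README rather than anything defined in this file):
#
#     vocos = Vocos.from_pretrained("charactr/vocos-encodec-24khz")
#     codes = torch.randint(low=0, high=1024, size=(8, 200))   # (K, L) tokens
#     features = vocos.codes_to_features(codes)                # (B, C, L)
#     audio = vocos.decode(features, bandwidth_id=torch.tensor([2]))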