o
    @Tiq                     @  s   d dl mZ d dlmZmZmZmZmZ d dlZd dl	Z	d dl
mZ d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ ddlmZ ddlmZ dddZG dd dejZ dS )    )annotations)AnyDictTupleUnionOptionalN)hf_hub_download)nn)FeatureExtractorEncodecFeatures)FourierHead)Backbone)	ISTFTHead)autocast   )crossover_merge_linkwitz_riley)UpSamplerBlockargsUnion[Any, Tuple[Any, ...]]initDict[str, Any]returnr   c                 C  sV   | di }t| ts| f} |d dd\}}t||gd}t||}|| i |S )zInstantiates a class with the given args and init.

    Args:
        args: Positional arguments required for instantiation.
        init: Dict of the form {"class_path":...,"init_args":...}.

    Returns:
        The instantiated class object.
    	init_args
class_path.r   )fromlist)get
isinstancetuplersplit
__import__getattr)r   r   kwargsclass_module
class_namemodule
args_class r'   K/home/ubuntu/.local/lib/python3.10/site-packages/linacodec/vocoder/vocos.pyinstantiate_class   s   


r)   c                      sd   e Zd ZdZd" fddZed#ddZed$d%ddZe	 d&ddZ
e	 d'd d!Z  ZS )(Vocosa#  
    The Vocos class represents a Fourier-based neural vocoder for audio synthesis.
    This class is primarily designed for inference, with support for loading from pretrained
    model checkpoints. It consists of three main components: a feature extractor,
    a backbone, and a head.
    feature_extractorr
   backboner   headr   	upsamplerr   head_48kr   c                   s8   t    || _|| _|| _|| _|| _d| _d| _d S )Ni  T)	super__init__r+   r,   r-   r.   r/   
freq_range
return_48k)selfr+   r,   r-   r.   r/   	__class__r'   r(   r1   .   s   

zVocos.__init__config_pathstrr   c           
      C  s   t |d}t|}W d   n1 sw   Y  td|d d}td|d d}td|d d}td|d d}td|d	 d}| |||||d
}	|	S )z}
        Class method to create a new Vocos model instance from hyperparameters stored in a yaml configuration file.
        rNr'   r+   )r   r   r,   r-   r.   r/   )r+   r,   r-   r.   r/   )openyaml	safe_loadr)   )
clsr7   fconfigr+   r,   r-   r.   r/   modelr'   r'   r(   from_hparams:   s   zVocos.from_hparamsNrepo_idrevisionOptional[str]c                 C  sz   t |d|d}t |d|d}| |}tj|dd}t|jtr2dd |jj 	 D }|
| || |  |S )z
        Class method to create a new Vocos model instance from a pre-trained model stored in the Hugging Face model hub.
        zconfig.yaml)rB   filenamerC   zpytorch_model.bincpu)map_locationc                 S  s   i | ]	\}}d | |qS )zfeature_extractor.encodec.r'   ).0keyvaluer'   r'   r(   
<dictcomp>S   s    z)Vocos.from_pretrained.<locals>.<dictcomp>)r   rA   torchloadr   r+   r   encodec
state_dictitemsupdateload_state_dicteval)r=   rB   rC   r7   
model_pathr@   rO   encodec_parametersr'   r'   r(   from_pretrainedI   s   


zVocos.from_pretrainedaudio_inputtorch.Tensorr"   r   c                 K  s(   | j |fi |}| j|fi |}|S )a'  
        Method to run a copy-synthesis from audio waveform. The feature extractor first processes the audio input,
        which is then passed through the backbone and the head to reconstruct the audio output.

        Args:
            audio_input (Tensor): The input tensor representing the audio waveform of shape (B, T),
                                        where B is the batch size and L is the waveform length.


        Returns:
            Tensor: The output tensor representing the reconstructed audio waveform of shape (B, T).
        )r+   decode)r4   rW   r"   featuresaudio_outputr'   r'   r(   forward\   s   zVocos.forwardfeatures_inputc                 K  s   | j |fi |dd}| |dd}| |}| |dd}t|dd}|ddd|jd f }tdd t	|
 |
 | jd}W d   n1 sTw   Y  | jd	kr`|S |S )
a  
        Method to decode audio waveform from already calculated features. The features input is passed through
        the backbone and the head to reconstruct the audio output.

        Args:
            features_input (Tensor): The input tensor of features of shape (B, C, L), where B is the batch size,
                                     C denotes the feature dimension, and L is the sequence length.

        Returns:
            Tensor: The output tensor representing the reconstructed audio waveform of shape (B, T).
        r      i]  i  NF)enabled)cutoffT)r,   	transposer.   r/   r-   AFresampleshaper   r   floatr2   r3   )r4   r]   r"   rZ   upsampled_features
pred_audiopred_audio2merged_audior'   r'   r(   rY   n   s   

zVocos.decode)
r+   r
   r,   r   r-   r   r.   r   r/   r   )r7   r8   r   r*   )N)rB   r8   rC   rD   r   r*   )rW   rX   r"   r   r   rX   )r]   rX   r"   r   r   rX   )__name__
__module____qualname____doc__r1   classmethodrA   rV   rL   inference_moder\   rY   __classcell__r'   r'   r5   r(   r*   &   s    r*   )r   r   r   r   r   r   )!
__future__r   typingr   r   r   r   r   rL   r;   huggingface_hubr   r	   vocos.feature_extractorsr
   r   vocos.headsr   vocos.modelsr   r   torch.cuda.ampr   torchaudio.functional
functionalrb   linkwitzr   upsampler_blockr   r)   Moduler*   r'   r'   r'   r(   <module>   s     
