o
    %ݫi                     @   s   d Z ddlZddlmZ ddlmZ ddlmZ ddlmZ zddl	m
Z ddlmZ W n ey=   d	Zed
7 Zeew dZg dZeeZG dd dejZ
dS )a  This lobe enables the integration of huggingface pretrained
Vocos model.

Vocos is a vocoder trained on top of EnCodec tokens. While
EnCodec itself can be used for a lossy reconstruction of speech,
a vocoder, such as Vocos, can be used to improve the quality.

Repository: https://huggingface.co/charactr/vocos-encodec-24khz
Paper: https://arxiv.org/pdf/2306.00814.pdf

TODO: There is an open feature request to add this model to
HuggingFace Transformers.

If this is implemented, it will be possible to make this model
inherit from HFTransformersInterface

https://github.com/huggingface/transformers/issues/25123

Authors
 * Artem Ploujnikov 2023
    N)hf_hub_download)nn)length_to_mask)
get_logger)Vocos)EncodecFeaturesz,Please install vocos to use the Vocos model
zE.G. run: pip install vocosi]  )      ?g      @g      @g      (@c                       s8   e Zd ZdZ			d fdd	Zdd Zd	d
 Z  ZS )r   a]  An wrapper for the HuggingFace Vocos model

    Arguments
    ---------
    source : str
        A HuggingFace repository identifier or a path
    save_path : str
        The location where the pretrained model will be saved
    revision : str
        The model revision
    bandwidth : float
        The bandwidth value
        Supported:
        1.5, 3.0, 6.0, 12.0
    freeze : bool
        Whether or not parameters should be
        frozen

    Example
    -------
    >>> model_hub = "charactr/vocos-encodec-24khz"
    >>> save_path = "savedir"
    >>> model = Vocos(model_hub, save_path)
    >>> tokens = torch.randint(1024, (4, 10, 2))
    >>> length = torch.tensor([1.0, 0.5, 0.75, 1.0])
    >>> audio, out_length = model(tokens, length)
    >>> audio.shape
    torch.Size([4, 3200])
    >>> out_length
    tensor([1.0000, 0.5000, 0.7500, 1.0000])
    Nr   Tc                    s|   t    || _|| _|| _|  | _|| _|| _t	
t|    | _| jr:td | j D ]}d|_q4d S d S )Nz$huggingface_Vocos - Vocos is frozen.F)super__init__source	save_pathrevision_load_modelmodelfreeze	bandwidthtorchtensor
BANDWIDTHSabsargminitembandwidth_idloggerwarning
parametersrequires_grad)selfr   r   r   r   r   param	__class__ k/home/ubuntu/.local/lib/python3.10/site-packages/speechbrain/lobes/models/huggingface_transformers/vocos.pyr
   O   s   


zVocos.__init__c                 C   s   t | jd| j| jd}t | jd| j| jd}t|}tj|dd}t|j	t
r:dd |j	j  D }|| || |  |S )zLoads the pretrained model. This is a customized implementation of
        Vocos.from_pretrained(), which has been customized to specify an
        alternate cache_dirzconfig.yaml)repo_idfilenamer   	cache_dirzpytorch_model.bincpu)map_locationc                 S   s   i | ]	\}}d | |qS )zfeature_extractor.encodec.r!   ).0keyvaluer!   r!   r"   
<dictcomp>y   s    z%Vocos._load_model.<locals>.<dictcomp>)r   r   r   r   
VocosModelfrom_hparamsr   load
isinstancefeature_extractorr   encodec
state_dictitemsupdateload_state_dicteval)r   config_path
model_pathr   r2   encodec_parametersr!   r!   r"   r   f   s,   


zVocos._load_modelc                 C   s   t | j 9 | j|ddd}| jj|t j| jg|j	dd}t
||d |d|j	d}|| |fW  d   S 1 sCw   Y  dS )a  Converts EnCodec tokens to audio

        Arguments
        ---------
        inputs : torch.Tensor
            A tensor of EnCodec tokens
        length : torch.Tensor
            A 1-D tensor of relative lengths

        Returns
        -------
        wavs : torch.Tensor
            A (Batch x Length) tensor of raw waveforms
        length : torch.Tensor
            Relative lengths
           r      )device)r   )max_lenr<   N)r   set_grad_enabledr   r   codes_to_featurespermutedecoder   r   r<   r   size)r   inputslengthfeatureswavsmaskr!   r!   r"   forward   s   

$zVocos.forward)Nr   T)__name__
__module____qualname____doc__r
   r   rH   __classcell__r!   r!   r   r"   r   .   s    $r   )rL   r   huggingface_hubr   r   speechbrain.dataio.dataior   speechbrain.utils.loggerr   vocosr   r,   vocos.feature_extractorsr   ImportErrorMSGDEFAULT_SAMPLE_RATEr   rI   r   Moduler!   r!   r!   r"   <module>   s$    