o
    pi                     @   sF   d Z ddlZddlmZ ddlmZ ddlmZ G dd dejZ	dS )a  This lobe enables the integration of pretrained SpeechTokenizer.

Please, install speechtokenizer:
    pip install speechtokenizer

Reference: https://arxiv.org/abs/2308.16692
Reference: https://arxiv.org/abs/1904.05862
Reference: https://arxiv.org/abs/2110.13900

Transformer from HuggingFace needs to be installed:
https://huggingface.co/transformers/installation.html

Author
 * Pooneh Mousavi 2023

    N)snapshot_download)SpeechTokenizerc                       s<   e Zd ZdZ fddZdddZdddZd	d
 Z  ZS )SpeechTokenizer_interfacea  This lobe enables the integration of HuggingFace and SpeechBrain
    pretrained SpeechTokenizer.

    Please, install speechtokenizer:
    pip install speechtokenizer

    Source paper: https://arxiv.org/abs/2308.16692


    The model can be used as a fixed Discrete feature extractor or can be finetuned. It
    will download automatically the model from HuggingFace or use a local path.

    Arguments
    ---------
    source : str
        HuggingFace hub name: e.g "fnlp/SpeechTokenizer"
    save_path : str
        Path (dir) of the downloaded model.

    Example
    -------
    >>> import torch
    >>> inputs = torch.rand([10, 600])
    >>> model_hub = "fnlp/SpeechTokenizer"
    >>> save_path = "savedir"
    >>> model =SpeechTokenizer_interface(model_hub, save_path)  # doctest: +SKIP
    >>> tokens = model(inputs)  # doctest: +SKIP
    >>> print(tokens.shape)  # doctest: +SKIP
    torch.Size([8, 10, 2])
    >>> wav=model.decode(tokens)
    >>> print(wav.shape)
    torch.Size([10, 640])
    c                    sL   t    t|ddg|d}| d}| d}t||| _| j  d S )Nz*config.jsonz*SpeechTokenizer.pt)repo_idallow_patterns	cache_dirz'/speechtokenizer_hubert_avg/config.jsonz./speechtokenizer_hubert_avg/SpeechTokenizer.pt)super__init__r   r   load_from_checkpointmodeleval)selfsource	save_path	saved_dirconfig_path	ckpt_path	__class__ y/home/ubuntu/SoloSpeech/.venv/lib/python3.10/site-packages/speechbrain/lobes/models/discrete/speechtokenizer_interface.pyr	   ;   s   


z"SpeechTokenizer_interface.__init__Nc                 C   s   |  ||S )  Takes an input waveform and return its corresponding wav2vec encoding.

        Arguments
        ---------
        wav : torch.Tensor (signal)
            A batch of audio signals to transform to features.
        wav_lens : torch.Tensor
            The relative length of the wav given in SpeechBrain format.

        Returns
        -------
        tokens : torch.Tensor
            A (N_q, Batch x Seq) tensor of audio tokens

        )encode)r   wavwav_lensr   r   r   forwardO   s   z!SpeechTokenizer_interface.forwardc                 C   s@   t   | j|d}W d   |S 1 sw   Y  |S )r      N)torchno_gradr   r   	unsqueeze)r   r   r   codesr   r   r   r   a   s   

z SpeechTokenizer_interface.encodec                 C   sX   |ddddddf }|ddddddf }| j tj||gdd}|dS )aB  Takes an input waveform and return its corresponding wav2vec encoding.

        Arguments
        ---------
        codes : torch.Tensor
            A (N_q, Batch x Seq) tensor of audio tokens

        Returns
        -------
        wav : torch.Tensor (signal)
            A batch of reconstructed audio signals.
        Nr   r   )axis)r   decoder   catsqueeze)r   r    RVQ_1RVQ_supplementr   r   r   r   r"   w   s   
z SpeechTokenizer_interface.decode)N)	__name__
__module____qualname____doc__r	   r   r   r"   __classcell__r   r   r   r   r      s    "

r   )
r*   r   torch.nnnnhuggingface_hubr   speechtokenizerr   Moduler   r   r   r   r   <module>   s    