o
    eiV                     @   s:   d Z ddlZddlmZ ddlmZ G dd dejZdS )a;  This lobe enables the integration of pretrained SpeechTokenizer.

Please, install speechtokenizer:
    pip install speechtokenizer

Reference: https://arxiv.org/abs/2308.16692

Transformer from HuggingFace needs to be installed:
https://huggingface.co/transformers/installation.html

Author
 * Pooneh Mousavi 2023

    N)snapshot_downloadc                       s@   e Zd ZdZ	d fdd	ZdddZddd	Zd
d Z  ZS )SpeechTokenizera  This lobe enables the integration of HuggingFace and SpeechBrain
    pretrained SpeechTokenizer.

    Please, install speechtokenizer:
    pip install speechtokenizer

    Source paper: https://arxiv.org/abs/2308.16692


    The model can be used as a fixed Discrete feature extractor or can be finetuned. It
    will download automatically the model from HuggingFace or use a local path.

    Arguments
    ---------
    source : str
        HuggingFace hub name: e.g "fnlp/SpeechTokenizer"
    save_path : str
        Path (dir) of the downloaded model.
    sample_rate : int (default: 16000)
        The audio sampling rate

    Example
    -------
    >>> import torch
    >>> inputs = torch.rand([10, 600])
    >>> model_hub = "fnlp/SpeechTokenizer"
    >>> save_path = "savedir"
    >>> model =SpeechTokenizer(model_hub, save_path)
    >>> tokens = model.encode(inputs)
    >>> tokens.shape
    torch.Size([8, 10, 2])
    >>> wav=model.decode(tokens)
    >>> wav.shape
    torch.Size([10, 640])
    >  c                    s   zddl m} || _W n ty   tdw t   t|ddg|d}| d}| d}| j||| _| j  || _	d S )	Nr   )r   zhPlease install the speechtokenizer module using: pip install speechtokenizer`pip install beartype==0.1.1z*config.jsonz*SpeechTokenizer.pt)repo_idallow_patterns	cache_dirz'/speechtokenizer_hubert_avg/config.jsonz./speechtokenizer_hubert_avg/SpeechTokenizer.pt)
speechtokenizerr   ImportErrorsuper__init__r   load_from_checkpointmodelevalsample_rate)selfsource	save_pathr   r   	saved_dirconfig_path	ckpt_path	__class__ o/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/speechbrain/lobes/models/discrete/speechtokenizer.pyr   :   s*   





zSpeechTokenizer.__init__Nc                 C   s   |  ||S )  Takes an input waveform and return its corresponding wav2vec encoding.

        Arguments
        ---------
        wav : torch.Tensor (signal)
            A batch of audio signals to transform to features.
        wav_lens : torch.Tensor
            The relative length of the wav given in SpeechBrain format.

        Returns
        -------
        tokens : torch.Tensor
            A (N_q, Batch x Seq) tensor of audio tokens

        )encode)r   wavwav_lensr   r   r   forward\   s   zSpeechTokenizer.forwardc                 C   s@   t   | j|d}W d   |S 1 sw   Y  |S )r      N)torchno_gradr   r   	unsqueeze)r   r   r   codesr   r   r   r   n   s   

zSpeechTokenizer.encodec                 C   sX   |ddddddf }|ddddddf }| j tj||gdd}|dS )aB  Takes an input waveform and return its corresponding wav2vec encoding.

        Arguments
        ---------
        codes : torch.Tensor
            A (N_q, Batch x Seq) tensor of audio tokens

        Returns
        -------
        wav : torch.Tensor (signal)
            A batch of reconstructed audio signals.
        Nr   r   )axis)r   decoder    catsqueeze)r   r#   RVQ_1RVQ_supplementr   r   r   r   r%      s   
zSpeechTokenizer.decode)r   )N)	__name__
__module____qualname____doc__r   r   r   r%   __classcell__r   r   r   r   r      s    (
"
r   )r-   r    torch.nnnnhuggingface_hubr   Moduler   r   r   r   r   <module>   s
    