o
    ei                     @   sB   d Z ddlZddlZddlmZ ddlmZ G dd dejZdS )a&  This lobe enables the integration of pretrained WavTokenizer.

Note that you need to pip install `git+https://github.com/Tomiinek/WavTokenizer` to use this module.

Repository: https://github.com/jishengpeng/WavTokenizer/
Paper: https://arxiv.org/abs/2408.16532

Authors
 * Pooneh Mousavi 2024
    N)snapshot_downloadc                       sT   e Zd ZdZ					d fdd	Zd	d
 Ze dd Zdd Z	dd Z
  ZS )WavTokenizera  This lobe enables the integration of pretrained WavTokenizer model, a discrete codec models with single codebook for Audio Language Modeling.

    Source paper:
        https://arxiv.org/abs/2408.16532

    You need to pip install `git+https://github.com/Tomiinek/WavTokenizer` to use this module.

    The code is adapted from the official WavTokenizer repository:
    https://github.com/jishengpeng/WavTokenizer/

    Arguments
    ---------
    source : str
        A HuggingFace repository identifier or a path
    save_path : str
        The location where the pretrained model will be saved
    config : str
        The name of the HF config file.
    checkpoint : str
        The name of the HF checkpoint file.
    sample_rate : int (default: 24000)
        The audio sampling rate
    freeze : bool
        whether the model will be frozen (e.g. not trainable if used
        as part of training another model)

    Example
    -------
    >>> model_hub = "novateur/WavTokenizer"
    >>> save_path = "savedir"
    >>> config="wavtokenizer_smalldata_frame40_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
    >>> checkpoint="WavTokenizer_small_600_24k_4096.ckpt"
    >>> model = WavTokenizer(model_hub, save_path,config=config,checkpoint=checkpoint)
    >>> audio = torch.randn(4, 48000)
    >>> length = torch.tensor([1.0, .5, .75, 1.0])
    >>> tokens, embs= model.encode(audio)
    >>> tokens.shape
    torch.Size([4, 1, 80])
    >>> embs.shape
    torch.Size([4, 80, 512])
    >>> rec = model.decode(tokens)
    >>> rec.shape
    torch.Size([4, 48000])
    NIwavtokenizer_smalldata_frame40_3s_nq1_code4096_dim512_kmeans200_attn.yaml$WavTokenizer_small_600_24k_4096.ckpt]  Tc                    s   z	dd l }|| _ W n ty   tdw t   t||d}tj||}	tj||}
| j j	|
|	| _
|  | _|| _d S )Nr   zhPlease install the WavTokenizer module using: `pip install git+https://github.com/Tomiinek/WavTokenizer`)repo_id	cache_dir)wavtokenizerImportErrorsuper__init__r   ospathjoinr   from_pretrained0802model_compute_embedding
embeddingssample_rate)selfsource	save_pathconfig
checkpointr   freezer	   r   checkpoint_pathconfig_path	__class__ l/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/speechbrain/lobes/models/discrete/wavtokenizer.pyr   A   s"   




zWavTokenizer.__init__c                 C   s"   |  |\}}| |}|||fS )a  Encodes the input audio as tokens and embeddings and  decodes audio from tokens

        Arguments
        ---------
        inputs : torch.Tensor
            A (Batch x Samples)
            tensor of audio
        Returns
        -------
        tokens : torch.Tensor
            A (Batch x Tokens x Heads) tensor of audio tokens
        emb : torch.Tensor
            Raw vector embeddings from the model's
            quantizers
        audio : torch.Tensor
            the reconstructed audio
        )encodedecode)r   inputstokens	embeddingaudior   r   r    forward`   s   

zWavTokenizer.forwardc                 C   s   | j jjjjjd j}|S )Nr   )r   feature_extractorencodec	quantizervqlayerscodebook)r   embsr   r   r    r   x   s   zWavTokenizer._compute_embeddingc                 C   s,   | j j|dd\}}|dd|ddfS )a  Encodes the input audio as tokens and embeddings

        Arguments
        ---------
        inputs : torch.Tensor
            A (Batch x Samples) or (Batch x Channel x Samples)
            tensor of audio

        Returns
        -------
        tokens : torch.Tensor
            A (Batch x NQ x Length) tensor of audio tokens
        emb : torch.Tensor
            Raw vector embeddings from the model's
            quantizers
        r   bandwidth_id   )r   r!   movedim)r   r#   embr$   r   r   r    r!      s   zWavTokenizer.encodec                 C   s4   | j |dd}| j j|tjd|jdd}|S )a  Decodes audio from tokens

        Arguments
        ---------
        tokens : torch.Tensor
            A (Batch x NQ x Length) tensor of audio tokens
        Returns
        -------
        audio : torch.Tensor
            the reconstructed audio
        r1   r   )devicer/   )r   codes_to_featuresr3   r"   torchtensorr5   )r   r$   featssigr   r   r    r"      s
   zWavTokenizer.decode)Nr   r   r   T)__name__
__module____qualname____doc__r   r'   r7   no_gradr   r!   r"   __classcell__r   r   r   r    r      s    0
r   )	r>   r   r7   torch.nnnnhuggingface_hubr   Moduler   r   r   r   r    <module>   s    