o
    %ݫiT                     @   sP   d Z ddlZddlmZmZ ddlmZ ddlmZ ee	Z
G dd deZdS )a  This lobe enables the integration of huggingface pretrained Mimi.

Mimi codec is a state-of-the-art audio neural codec, developed by Kyutai.
It combines semantic and acoustic information into audio tokens running at 12Hz and a bitrate of 1.1kbps.

Note that you need to install `transformers>=4.45.1` to use this module.

Repository: https://huggingface.co/kyutai/mimi
Paper: https://kyutai.org/Moshi.pdf

Authors
 * Pooneh Mousavi 2024
    N)clean_padding_length_to_mask)HFTransformersInterface)
get_loggerc                       sR   e Zd ZdZ			d fdd	Ze dd Zd	d
 Zdd Z	dddZ
  ZS )Mimia  This lobe enables the integration of HuggingFace pretrained Mimi model.
    Mimi codec is a state-of-the-art audio neural codec, developed by Kyutai.
    It combines semantic and acoustic information into audio tokens running at 12Hz and a bitrate of 1.1kbps.

    Source paper:
       https://kyutai.org/Moshi.pdf

    Transformers>=4.45.1 from HuggingFace needs to be installed:
        https://huggingface.co/transformers/installation.html

    The code is adapted from the official HF Kyutai repository:
        https://huggingface.co/kyutai/mimi

    Arguments
    ---------
    source : str
        A HuggingFace repository identifier or a path
    save_path : str
        The location where the pretrained model will be saved
    sample_rate : int (default: 24000)
        The audio sampling rate
    freeze : bool
        whether the model will be frozen (e.g. not trainable if used as part of training another model)
    num_codebooks : int (default: 8)
        Number of codebooks. It could be [2,3,4,5,6,7,8]

    Example
    -------
    >>> model_hub = "kyutai/mimi"
    >>> save_path = "savedir"
    >>> model = Mimi(model_hub, save_path)
    >>> audio = torch.randn(4, 48000)
    >>> length = torch.tensor([1.0, .5, .75, 1.0])
    >>> tokens, emb = model.encode(audio, length)
    >>> tokens.shape
    torch.Size([4, 8, 25])
    >>> emb.shape
    torch.Size([4, 8, 25, 256])
    >>> rec = model.decode(tokens, length)
    >>> rec.shape
    torch.Size([4, 1, 48000])
    ]  T   c                    s(   t  j|||d || _|| _d | _d S )N)source	save_pathfreeze)super__init__num_codebookssample_rate
embeddings)selfr	   r
   r   r   r   	__class__ j/home/ubuntu/.local/lib/python3.10/site-packages/speechbrain/lobes/models/huggingface_transformers/mimi.pyr   F   s   	
zMimi.__init__c                 C   sF   | j jjj}| j jjj}|| d | j }dd |D }t|}|S )Nc                 S   s   g | ]}|j jqS r   )codebookembed).0layerr   r   r   
<listcomp>]   s    z+Mimi._compute_embedding.<locals>.<listcomp>)model	quantizer"semantic_residual_vector_quantizerlayers"acoustic_residual_vector_quantizerr   torchstack)r   semantic_layersacoustic_layersr   embsr   r   r   _compute_embeddingT   s   


zMimi._compute_embeddingc                 C   s&   |  ||\}}| ||}|||fS )a`  Encodes the input audio as tokens and embeddings and  decodes audio from tokens

        Arguments
        ---------
        inputs : torch.Tensor
            A (Batch x Samples) or (Batch x Channel x Samples)
            tensor of audio
        length : torch.Tensor
            A tensor of relative lengths

        Returns
        -------
        tokens : torch.Tensor
            A (Batch x Tokens x Heads) tensor of audio tokens
        emb : torch.Tensor
            Raw vector embeddings from the model's
            quantizers
        audio : torch.Tensor
            the reconstructed audio
        )encodedecode)r   inputslengthtokens	embeddingaudior   r   r   forwarda   s   
zMimi.forwardc                 C   s   | j du r
|  | _ | dkr|d}|d}t|| ||jdd}| jj||| j	dd }|d
ddd| j jd }t| j d
|jd dddd|}||fS )a  Encodes the input audio as tokens and embeddings

        Arguments
        ---------
        inputs : torch.Tensor
            A (Batch x Samples) or (Batch x Channel x Samples)
            tensor of audio
        length : torch.Tensor
            A tensor of relative lengths

        Returns
        -------
        tokens : torch.Tensor
            A (Batch x num_codebooks x Length) tensor of audio tokens
        emb : torch.Tensor
            Raw vector embeddings from the model's
            quantizers
        N      )device)num_quantizersr   )r   r%   dim	unsqueezesizer   r1   r   r&   r   expandshaper    gather)r   r(   r)   max_lenpadding_maskr*   input_tensorr   r   r   r   r&   |   s0   




zMimi.encodeNc                 C   s<   | j du r
|  | _ | j|}|j}|durt|| |S )aX  Decodes audio from tokens

        Arguments
        ---------
        tokens : torch.Tensor
            A (Batch x num_codebooks x Length) tensor of audio tokens
        length : torch.Tensor
            A 1-D tensor of relative lengths

        Returns
        -------
        audio : torch.Tensor
            the reconstructed audio
        N)r   r%   r   r'   audio_valuesr   )r   r*   r)   resultr,   r   r   r   r'      s   


zMimi.decode)r   Tr   )N)__name__
__module____qualname____doc__r   r    no_gradr%   r-   r&   r'   __classcell__r   r   r   r   r      s    /
.r   )rA   r    speechbrain.dataio.dataior   r   =speechbrain.lobes.models.huggingface_transformers.huggingfacer   speechbrain.utils.loggerr   r>   loggerr   r   r   r   r   <module>   s    