o
    %ݫi1                     @   s`   d Z ddlZddlmZ ddlmZmZ ddlm	Z	 ddl
mZ dZeeZG dd	 d	e	ZdS )
a  This lobe enables the integration of huggingface pretrained EnCodec.

EnCodec makes it possible to compress audio into a sequence of discrete tokens
at different bandwidths - and to reconstruct audio from such sequences, with
some loss of quality depending on the bandwidth.

Note that while encodec can be used to reconstruct speech data, for a
high-quality reconstruction, it is recommended to use a specially trained
vocoder, such as Vocos (speechbrain.lobes.models.huggingface_transformers.vocos)

Repository: https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec
Paper: https://arxiv.org/abs/2210.13438

Authors
 * Artem Ploujnikov 2023
    N)
functional)clean_padding_length_to_mask)HFTransformersInterface)
get_loggeri]  c                       s   e Zd ZdZ						d fdd	Zdd	 Zdd
dZdd Zdd Zdd Z	dd Z
dd Zdd ZdddZdddZdd Z  ZS ) EncodecaU  An wrapper for the HuggingFace encodec model

    Arguments
    ---------
    source : str
        A HuggingFace repository identifier or a path
    save_path : str
        The location where the pretrained model will be saved
    sample_rate : int
        The audio sampling rate
    bandwidth : float
        The encoding bandwidth, in kbps (optional)
        Supported bandwidths:
        1.5, 3.0, 6.0, 12.0, 24.0
    flat_embeddings : bool
        If set to True, embeddings will be flattened into
        (Batch x Length x (Heads * Embedding))
    freeze : bool
        whether the model will be frozen (e.g. not trainable if used
        as part of training another model)
    renorm_embeddings : bool
        whether embeddings should be renormalized. In the original
        model.

    Example
    -------
    >>> model_hub = "facebook/encodec_24khz"
    >>> save_path = "savedir"
    >>> model = Encodec(model_hub, save_path)
    >>> audio = torch.randn(4, 1000)
    >>> length = torch.tensor([1.0, .5, .75, 1.0])
    >>> tokens, emb = model.encode(audio, length)
    >>> tokens.shape
    torch.Size([4, 4, 2])
    >>> emb.shape
    torch.Size([4, 4, 2, 128])
    >>> rec = model.decode(tokens, length)
    >>> rec.shape
    torch.Size([4, 1, 1280])
    >>> rec_emb = model.decode_emb(emb, length)
    >>> rec_emb.shape
    torch.Size([4, 1, 1280])
    >>> rec_tokens = model.tokens(emb, length)
    >>> rec_tokens.shape
    torch.Size([4, 4, 2])
    >>> model = Encodec(model_hub, save_path, flat_embeddings=True)
    >>> _, emb = model.encode(audio, length)
    >>> emb.shape
    torch.Size([4, 4, 256])
    N      ?FTc                    s<  t  j|||d |st}|| _|| _|| _| jj|| _	| jj
j| _| jjjd | j	 }tdd |D }	| d|	 | jj\}
| _| _| j| j	| j | j}| d| t| j	d d d d f | j }| d| || _| jr|  \}}| d| | d| | jrtd	 | j D ]}d
|_qd S d S )N)source	save_pathfreezec                 S   s   g | ]}|j jqS  )codebookembed).0layerr   r   m/home/ubuntu/.local/lib/python3.10/site-packages/speechbrain/lobes/models/huggingface_transformers/encodec.py
<listcomp>j   s    z$Encodec.__init__.<locals>.<listcomp>
vocabularyvocabulary_flattoken_index_offsetsemb_meanemb_stdz(huggingface_Encodec - Encodec is frozen.F)super__init__DEFAULT_SAMPLE_RATEsample_rate	bandwidthflat_embeddingsmodel	quantizer get_num_quantizers_for_bandwidth	num_headsconfigcodebook_size
num_tokenslayerstorchstackregister_bufferr   shapeemb_dimreshapearangerenorm_embeddings_precalibrater   loggerwarning
parametersrequires_grad)selfr	   r
   r   r   r   r   r-   quantizer_layersr   _r   r   r   r   param	__class__r   r   r   T   sD   

zEncodec.__init__c                 C   s2   t | jddddf d| j| j}| |S )z5Compute parameters required to renormalize embeddingsN   )r&   r,   r$   expandr!   _compute_embedding_norm)r3   sampler   r   r   r.      s   

zEncodec._precalibratec           	      C   s   |du rt jt||jd}|d}| |}t|| |ddddddf |}|d	d|d	d ddddddf }|| | d }|j	g dd||j	g ddd  
 dddddf }||fS )	a  Computes the normalization for embeddings based on
        a sample.

        Arguments
        ---------
        sample : torch.Tensor
            A (Batch x Samples) or (Batch x Channel x Samples)
            audio sample
        length : torch.Tensor
            A tensor of relative lengths

        Returns
        -------
        emb_mean : torch.Tensor
        emb_std : torch.Tensor
            Norm stats for embeddings.
        Ndevicer9   r      )r   r9      dim)r&   oneslenr>   size_raw_embeddingsr   	expand_asmeansumsqrt)	r3   r<   lengthmax_lenembmaskr   emb_diff_sqr   r   r   r   r;      s&   

$zEncodec._compute_embedding_normc                 C   sB   | j std| ||}| ||\| _| _| j | j fS )a  Calibrates the normalization on a sound sample

        Arguments
        ---------
        sample : torch.Tensor
            A (Batch x Samples) or (Batch x Channel x Samples)
            audio sample

        length : torch.Tensor
            A tensor of relative lengths

        Returns
        -------
        emb_mean : torch.Tensor
            The embedding mean

        emb_std : torch.Tensor
            The embedding standard deviation
        z0Not supported when renorm_embeddings is disabled)r-   
ValueError_encode_tokensr;   r   r   squeeze)r3   r<   rL   sample_tokensr   r   r   	calibrate   s   zEncodec.calibratec                 C   s   |  ||S )a  Encodes the input audio as tokens

        Arguments
        ---------
        inputs : torch.Tensor
            A (Batch x Samples) or (Batch x Channel x Samples)
            tensor of audio
        length : torch.Tensor
            A tensor of relative lengths

        Returns
        -------
        tokens : torch.Tensor
            A (Batch X Tokens) tensor of audio tokens
        )encode)r3   inputsrL   r   r   r   forward   s   zEncodec.forwardc                 C   sP   t | j  | ||}| |}||fW  d   S 1 s!w   Y  dS )a   Encodes the input audio as tokens and embeddings

        Arguments
        ---------
        inputs : torch.Tensor
            A (Batch x Samples) or (Batch x Channel x Samples)
            tensor of audio
        length : torch.Tensor
            A tensor of relative lengths

        Returns
        -------
        tokens : torch.Tensor
            A (Batch x Tokens x Heads) tensor of audio tokens
        emb : torch.Tensor
            Raw vector embeddings from the model's
            quantizers
        N)r&   set_grad_enabledr   rR   
embeddings)r3   rW   rL   tokensrN   r   r   r   rV      s
   
$zEncodec.encodec                 C   sf   |  dkr|d}|d}t|| ||jdd}| jj||| jd}|j	d
dd}|S )a  Encodes audio as tokens only

        Arguments
        ---------
        inputs : torch.Tensor
            A (Batch x Samples) or (Batch x Channel x Samples)
            tensor of audio
        length : torch.Tensor
            A tensor of relative lengths

        Returns
        -------
        tokens : torch.Tensor
            A (Batch x Tokens x Heads) tensor of audio tokens
        r@   r9   r?   r=   )r   r   )rC   	unsqueezerF   r   r>   r   rV   r   audio_codesrS   	transpose)r3   rW   rL   rM   rO   resultr[   r   r   r   rR      s   

zEncodec._encode_tokensc                 C   s   || j  }t|| j}|S )a  Converts token indexes to vector embeddings, for
        each quantizer

        Arguments
        ---------
        tokens : torch.Tensor
            a (Batch x Length x Heads) tensor of token indexes

        Returns
        -------
        emb : torch.Tensor
            a (Batch x Length x Heads x Embedding) tensor
            of raw vector embeddings from the model's
            quantizer codebooks
        )r   F	embeddingr   )r3   r[   idxrN   r   r   r   rG   
  s   
zEncodec._raw_embeddingsc                 C   sJ   |  |}| jr|| j | j }| jr#|j\}}}}||||| }|S )a  Converts token indexes to vector embeddings

        Arguments
        ---------
        tokens : torch.Tensor
            a (Batch x Length x Heads) tensor of token indexes

        Returns
        -------
        emb : torch.Tensor
            a (Batch x Length x Heads x Embedding) tensor
            of raw vector embeddings from the model's
            quantizer codebooks
        )rG   r-   r   r   r   r)   r+   )r3   r[   rN   
batch_sizerM   r!   r*   r   r   r   rZ     s   
zEncodec.embeddingsc                 C   sl   t | j % | j|ddddg}|j}|dur#t|| |W  d   S 1 s/w   Y  dS )aP  Decodes audio from tokens

        Arguments
        ---------
        tokens : torch.Tensor
            A (Batch x Length x Heads) tensor of audio tokens
        length : torch.Tensor
            A 1-D tensor of relative lengths

        Returns
        -------
        audio : torch.Tensor
            the reconstructed audio
        r   r?   r\   N)	r&   rY   r   r   decoder]   r_   audio_valuesr   )r3   r[   rL   r`   audior   r   r   re   5  s   
$zEncodec.decodec                 C   s  t | j s | jr|j\}}}|||| j| j}| jr&|| j	 | j
 }|djddd}| jddd}|dddd}|| dd}	|djddddd}
|d|	  |
  }|jdd	j}|d
urqt|| |W  d
   S 1 s}w   Y  d
S )a{  Comberts embeddings to raw tokens

        Arguments
        ---------
        emb : torch.Tensor
            Raw embeddings
        length : torch.Tensor
            A 1-D tensor of relative lengths. If supplied,
            padded positions will be zeroed out

        Returns
        -------
        tokens : torch.Tensor
            A (Batch x Length) tensor of token indicesr@   r?   T)keepdimr\   r   r9   rA   rB   N)r&   rY   r   r   r)   r+   r!   r*   r-   r   r   powrJ   r   r_   r]   permutemoveaxismaxindicesr   )r3   rN   rL   rd   rM   r5   scaled_statesvocabemb_permemb_vocab_prod	vocab_sumdistr[   r   r   r   r[   M  s&   
$zEncodec.tokensc                 C   sH   t | j  | |}| ||W  d   S 1 sw   Y  dS )a  Decodes raw vector embeddings into audio

        Arguments
        ---------
        emb : torch.Tensor
            A (Batch x Length x Heads x Embedding) tensor of
            raw vector embeddings
        length : torch.Tensor
            The corresponding lengths of the inputs.

        Returns
        -------
        audio : torch.Tensor
            the reconstructed audio
        N)r&   rY   r   r[   re   )r3   rN   rL   r[   r   r   r   
decode_embo  s   

$zEncodec.decode_emb)NNr   FTT)N)__name__
__module____qualname____doc__r   r.   r;   rU   rX   rV   rR   rG   rZ   re   r[   rt   __classcell__r   r   r7   r   r       s(    6,
#

"r   )rx   r&   torch.nnr   ra   speechbrain.dataio.dataior   r   =speechbrain.lobes.models.huggingface_transformers.huggingfacer   speechbrain.utils.loggerr   r   ru   r/   r   r   r   r   r   <module>   s    