o
    %ݫiL-                     @   s   d Z ddlZddlZddlm  mZ ddlZddl	m
Z
 ddlmZmZ ddlmZ eeZG dd deZG dd	 d	eZdS )
a  This lobe enables the integration of huggingface pretrained wav2vec2 models.

Reference: https://arxiv.org/abs/2006.11477
Reference: https://arxiv.org/abs/1904.05862
Reference: https://arxiv.org/abs/2110.13900
Transformer from HuggingFace needs to be installed:
https://huggingface.co/transformers/installation.html

Authors
 * Titouan Parcollet 2021
 * Boumadane Abdelmoumene 2021
 * Ha Nguyen 2023
    N)_compute_mask_indices)HFTransformersInterfacemake_padding_masks)
get_loggerc                       sN   e Zd ZdZ					d fdd	ZdgfddZdd	d
ZdddZ  ZS )Wav2Vec2a  This lobe enables the integration of HuggingFace and SpeechBrain
    pretrained wav2vec2.0/Hubert models.

    Source paper wav2vec2.0: https://arxiv.org/abs/2006.11477
    Source paper Hubert: https://arxiv.org/abs/2106.07447
    Transformer from HuggingFace needs to be installed:
    https://huggingface.co/transformers/installation.html

    The model can be used as a fixed feature extractor or can be finetuned. It
    will download automatically the model from HuggingFace or use a local path.

    Arguments
    ---------
    source : str
        HuggingFace hub name: e.g "facebook/wav2vec2-large-lv60"
    save_path : str
        Path (dir) of the downloaded model.
    output_norm : bool (default: True)
        If True, a layer_norm (affine) will be applied to the output obtained
        from the wav2vec model.
    freeze : bool (default: True)
        If True, the model is frozen. If False, the model will be trained
        alongside with the rest of the pipeline.
    freeze_feature_extractor :  bool (default: False)
        When freeze = False and freeze_feature_extractor True, the feature_extractor module of the model is Frozen. If False
        all the wav2vec model will be trained including feature_extractor module.
    apply_spec_augment : bool (default: False)
        If True, the model will apply spec augment on the output of feature extractor
        (inside huggingface Wav2VecModel() class).
        If False, the model will not apply spec augment. We set this to false to prevent from doing it twice.
    output_all_hiddens : bool (default: False)
        If True, the forward function outputs the hidden states from all transformer layers.
        For example wav2vec2-base has 12 transformer layers and the output is of shape (13, B, T, C),
        where a projection of the CNN output is added to the beginning.
        If False, the forward function outputs the hidden states only from the last transformer layer.
    **kwargs
        Extra keyword arguments passed to the `from_pretrained` function.

    Example
    -------
    >>> inputs = torch.rand([10, 600])
    >>> model_hub = "facebook/wav2vec2-base-960h"
    >>> save_path = "savedir"
    >>> model = Wav2Vec2(model_hub, save_path)
    >>> outputs = model(inputs)
    Fc           
         s   t  jd|||d| || jj_| j||d | jj| _|| _	| j
s>| j	r>td | jj  | jj D ]}	d|	_q8|| _|| _d S )N)source	save_pathfreeze)	cache_dirzespeechbrain.lobes.models.huggingface_transformers.wav2vec2 - wav2vec 2.0 feature extractor is frozen.F )super__init__modelconfigapply_spec_augmentload_feature_extractorfeature_extractordo_normalizenormalize_wavfreeze_feature_extractorr	   loggerwarningeval
parametersrequires_gradoutput_normoutput_all_hiddens)
selfr   r   r   r	   r   r   r   kwargsparam	__class__r   n/home/ubuntu/.local/lib/python3.10/site-packages/speechbrain/lobes/models/huggingface_transformers/wav2vec2.pyr   N   s$   


zWav2Vec2.__init__wav2vec2c           	      C   s\   i }t j|dd}| D ]\}}|D ]}| d|v r*|d| dd}|||< qq|S )aF  A custom loading ensures SpeechBrain compatibility for Pretrain and model
        de/serialization. Here, the scope is to remove '.wav2vec2' before loading.

        Arguments
        ---------
        path : str
            Checkpoint path, file name relative to the repo root.
        replaceables : List[str]
            State dict sub-keys that if found, shall be dropped (incl. the 'model.' parent key), elevating key structures.

        Returns
        -------
        modified_state_dict : see torch.load
            SpeechBrain-valid deserialized pretrained model.
        cpu)map_location.zmodel. )torchloaditemsreplace)	r   pathreplaceablesmodified_state_dictorig_state_dictkeyparamstagsave_keyr   r   r"   _modify_state_dicto   s   zWav2Vec2._modify_state_dictNc                 C   sF   | j rt  | ||W  d   S 1 sw   Y  | ||S )a  Takes an input waveform and return its corresponding wav2vec encoding.

        Arguments
        ---------
        wav : torch.Tensor (signal)
            A batch of audio signals to transform to features.
        wav_lens : torch.Tensor
            The relative length of the wav given in SpeechBrain format.

        Returns
        -------
        Wav2vec encoded features.
        N)r	   r(   no_gradextract_features)r   wavwav_lensr   r   r"   forward   s
   

 zWav2Vec2.forwardc                 C   s   t ||d}| jrt||jdd }| j||| jd}| jr2tjt	|j
dd}|jdd }n|j}|j}| jrEt||dd }|S )a  Takes an input waveform and return its corresponding wav2vec encoding.

        Arguments
        ---------
        wav : torch.Tensor (signal)
            A batch of audio signals to transform to features.
        wav_lens : torch.Tensor
            The relative length of the wav given in SpeechBrain format.

        Returns
        -------
        out : torch.Tensor
            Wav2vec encoded features.
        wav_len   N)attention_maskoutput_hidden_statesr   )dim)r   r   F
layer_normshaper   r   r(   stacklisthidden_stateslast_hidden_stater   )r   r7   r8   padding_maskout
norm_shaper   r   r"   r6      s    zWav2Vec2.extract_features)FFFFFN)	__name__
__module____qualname____doc__r   r4   r9   r6   __classcell__r   r   r    r"   r      s    3!
r   c                       s:   e Zd ZdZ			d fdd	Zddd	Zd
d Z  ZS )Wav2Vec2Pretraina  This lobe enables the integration of HuggingFace
    wav2vec2.0 models to be pretrained.

    Source paper: https://arxiv.org/abs/2006.11477
    Transformer from HuggingFace needs to be installed:
    https://huggingface.co/transformers/installation.html

    The return is an HuggingFace format and the mask indices that contains:
    https://huggingface.co/transformers/model_doc/wav2vec2.html#wav2vec2forpretraining

    For instance, it returns the loss that can be accessed with .loss

    Arguments
    ---------
    source : str
        HuggingFace hub name: e.g "facebook/wav2vec2-large-lv60"
    save_path : str
        Path (dir) of the downloaded model.
    mask_prob : float (default: 0.65)
        Probability of masking a given frame. Default is taken from the paper.
    mask_length : float (default: 10)
        Length (i.e. number of consecutive masked frames). Default is taken from
        the paper.
    normalize_wav : bool
        Whether to normalize input before processing.

    Example
    -------
    >>> inputs = torch.rand([10, 32000])
    >>> model_hub = "facebook/wav2vec2-base-960h"
    >>> save_path = "savedir"
    >>> model = Wav2Vec2Pretrain(model_hub, save_path)
    >>> outputs, _ = model(inputs, wav_lens=None)
    ?
   Tc                    s(   t  j||dd || _|| _|| _d S )NT)r   r   for_pretraining)r   r   	mask_probmask_lengthr   )r   r   r   rU   rV   r   r    r   r"   r      s   
zWav2Vec2Pretrain.__init__Nc                 C   s   |j \}}| jrt||j }| j| }t||f| j| j	d}t
j||jt
jd}t||d}t||f}	t
jtjjjj||f| jj|	d|jt
jd}
| j|||
|d|fS )a  Takes an input waveform and return its corresponding wav2vec encoding.

        Arguments
        ---------
        wav : torch.Tensor (signal)
            A batch of audio signals to transform to features.
        wav_lens : torch.Tensor
            The relative length of the wav given in SpeechBrain format.

        Returns
        -------
        Wav2vec encoded outputs.
        )rU   rV   )devicedtyper:   )num_negativesmask_time_indices)rZ   sampled_negative_indicesr=   )rC   r   rA   rB   r    _get_feat_extract_output_lengthsitemr   rU   rV   r(   tensorrW   longr   nponestransformersmodelsr#   modeling_wav2vec2_sample_negative_indicesr   rY   )r   r7   r8   
batch_sizeraw_sequence_lengthsequence_lengthrZ   torch_mask_time_indicesrH   full_sentence_indicesnegative_sample_indicesr   r   r"   r9      sJ   

zWav2Vec2Pretrain.forwardc                 C   s
   d|_ |S )zIf the config needs to be overridden, here is the place

        Arguments
        ---------
        config : Wav2Vec2Config
            The original config needs to be overridden.

        Returns
        -------
        Overridden config
        T)r>   )r   r   r   r   r"   override_config?  s   z Wav2Vec2Pretrain.override_config)rR   rS   TrK   )rL   rM   rN   rO   r   r9   rl   rP   r   r   r    r"   rQ      s    '
?rQ   )rO   numpyr`   r(   torch.nn.functionalnn
functionalrA   rb   .transformers.models.wav2vec2.modeling_wav2vec2r   =speechbrain.lobes.models.huggingface_transformers.huggingfacer   r   speechbrain.utils.loggerr   rL   r   r   rQ   r   r   r   r"   <module>   s     -