o
    %ݫi_/                     @   s   d Z ddlZddlm  mZ ddlmZ ddlmZ ddlm	Z	 ddl
mZ zddlZW n ey>   dZed7 Zeew eeZG d	d
 d
ejZG dd dejZdS )a.  This lobe enables the integration of fairseq pretrained wav2vec models.

Reference: https://arxiv.org/abs/2006.11477
Reference: https://arxiv.org/abs/1904.05862
FairSeq >= 1.0.0 needs to be installed: https://fairseq.readthedocs.io/en/latest/

Authors
 * Titouan Parcollet 2021
 * Salima Mdhaffar 2021
    N)nn)length_to_mask)download_file)
get_loggerz1Please install Fairseq to use pretrained wav2vec
zE.G. run: pip install fairseqc                       s\   e Zd ZdZ							d fdd	Zdd Zdd	d
Zdd Zdd ZdddZ	  Z
S )FairseqWav2Vec2a	  This lobe enables the integration of fairseq pretrained wav2vec2.0 models.

    Source paper: https://arxiv.org/abs/2006.11477
    FairSeq >= 0.10.0 needs to be installed:
    https://fairseq.readthedocs.io/en/latest/

    The model can be used as a fixed features extractor or can be finetuned. It
    will download automatically the model if a url is given (e.g FairSeq
    repository from GitHub).

    Arguments
    ---------
    pretrained_path : str
        Path of the pretrained wav2vec2 model. It can be a url or a local path.
    save_path : str
        Path and filename of the downloaded model.
    input_norm : bool (default: None)
        If True, a layer_norm (affine) will be applied to the input waveform.
        By default, it is extracted from the checkpoint of the downloaded model
        in order to match the pretraining conditions. However, if this information
        is not given in the checkpoint, it has to be given manually.
    output_norm : bool (default: False)
        If True, a layer_norm (affine) will be applied to the output obtained
        from the wav2vec model.
    freeze : bool (default: False)
        If True, the model is frozen. If False, the model will be trained
        alongside with the rest of the pipeline.
    freeze_feature_extractor : bool (default: False)
        Whether to prevent feature extraction weights from updating.
    pretrain : bool (default: True)
        If True, the model is pretrained with the specified source.
        If False, the randomly-initialized model is instantiated.
    dropout : float (default: None)
        If different from None (0.0 to 1.0), it will override the given fairseq
        dropout rates. This is useful if the wav2vec2 model has been trained
        without dropout and one wants to reactivate it for downstream task
        fine-tuning (better performance observed).
    layer_drop : float (default: None)
        If different from None (0.0 to 1.0), it will override the given fairseq
        layer_drop rate. This is useful if the wav2vec2 model has been trained
        without layer_drop and one wants to reactivate it for downstream task
        fine-tuning.

    Example
    -------
    >>> inputs = torch.rand([10, 600])
    >>> model_url = "https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_small.pt"
    >>> save_path = "models_checkpoints/wav2vec2.pt"
    >>> model = FairseqWav2Vec2(model_url, save_path)
    >>> outputs = model(inputs)
    >>> outputs.shape
    torch.Size([10, 100,  768])
    NFTc
                    sv  t    t|| i }
|s6|d ur6i |
d< |d ur,||
d d< ||
d d< ||
d d< |	d ur6|	|
d d< tjj|g|
d\}}}|d u rbt|d drT|d j| _nt|dr^|j| _nd	| _n|| _|d
 }|| _|| _	|| _
|| _| j	rtd | j  | j D ]}d	|_qn| j  | jrtd | jj  | jj D ]}d	|_q|s| | j |   d S )Nmodeldropoutdropout_inputattention_dropout
layer_drop)arg_overridestask	normalizeFr   zAspeechbrain.lobes.models.fairseq_wav2vec - wav2vec 2.0 is frozen.zSspeechbrain.lobes.models.fairseq_wav2vec - wav2vec 2.0 feature extractor is frozen.)super__init__r   fairseqcheckpoint_utilsload_model_ensemble_and_taskhasattrr   r   freezeoutput_normfreeze_feature_extractorloggerwarningeval
parametersrequires_gradtrainfeature_extractorreset_layerremove_pretraining_modules)selfpretrained_path	save_path
input_normr   r   r   pretrainr   r   	overridesr   cfgr   param	__class__ \/home/ubuntu/.local/lib/python3.10/site-packages/speechbrain/lobes/models/fairseq_wav2vec.pyr   V   sb   

	



zFairseqWav2Vec2.__init__c                 C   sT   | j ||d}| jr$t  | ||W  d   S 1 sw   Y  | ||S )aj  Takes an input waveform and return its corresponding wav2vec encoding.

        Arguments
        ---------
        wav : torch.Tensor
            A batch of audio signals to transform to features.
        wav_lens : torch.Tensor
            The lengths corresponding to the input wavs.

        Returns
        -------
        wav2vec encoded features.
        )wav_lenN)
make_masksr   torchno_gradextract_features)r!   wavwav_lenspadding_maskr+   r+   r,   forward   s   

 zFairseqWav2Vec2.forwardc                 C   sR   | j rt||jdd }| jj||ddd }| jr't||jdd }|S ) Extracts the wav2vect embeddings   NF)r4   maskx)r   F
layer_normshaper   r1   r   )r!   r2   r4   outr+   r+   r,   r1      s   z FairseqWav2Vec2.extract_featuresc                 C   6   t |dr	|  | D ]}||kr| | qdS z+Reinitializes the parameters of the networkreset_parametersNr   r@   childrenr   r!   r   child_layerr+   r+   r,   r         

zFairseqWav2Vec2.reset_layerc                 C   s$   d| j _d| j _d| j _d| j _dS )z?Remove unneeded modules. Inspired by the same fairseq function.N)r   	quantizer	project_q
target_glu
final_proj)r!   r+   r+   r,   r       s   z*FairseqWav2Vec2.remove_pretraining_modulesr   c                 C   s2   d}|durt ||jd  }t|  }|S )a  This method generates the padding masks.

        Arguments
        ---------
        src : tensor
            The sequence to the encoder (required).
        wav_len : tensor
            The relative length of the wav given in SpeechBrain format.
        pad_idx : int
            The index for <pad> token (default=0).

        Returns
        -------
        src_key_padding_mask : torch.Tensor
            The mask for removing pad tokens.
        Nr7   )r/   roundr<   r   bool)r!   srcr-   pad_idxsrc_key_padding_maskabs_lenr+   r+   r,   r.      s
   zFairseqWav2Vec2.make_masks)NFFFTNN)NNr   )__name__
__module____qualname____doc__r   r5   r1   r   r    r.   __classcell__r+   r+   r)   r,   r      s    :T
r   c                       s@   e Zd ZdZ			d fdd	Zdd Zdd Zd	d
 Z  ZS )FairseqWav2Vec1a  This lobes enables the integration of fairseq pretrained wav2vec1.0 models.

    Arguments
    ---------
    pretrained_path : str
        Path of the pretrained wav2vec1 model. It can be a url or a local path.
    save_path : str
        Path and filename of the downloaded model.
    output_norm : bool (default: True)
        If True, a layer_norm (affine) will be applied to the output obtained
        from the wav2vec model.
    freeze : bool (default: True)
        If True, the model is frozen. If False, the model will be trained
        alongside with the rest of the pipeline.
    pretrain : bool (default: True)
        If True, the model is pretrained with the specified source.
        If False, the randomly-initialized model is instantiated.

    Example
    -------
    >>> inputs = torch.rand([10, 600])
    >>> model_url = ""
    >>> save_path = "models_checkpoints/wav2vec.pt"
    >>> model = FairseqWav2Vec1(model_url, save_path)
    >>> outputs = model(inputs)
    >>> outputs.shape
    torch.Size([10, 100, 512])
    Tc           	         sn   t    || _|| _t|| tj|g\}}}|| _| jd | _| jr+| j	  |s5| 
| j d S d S rP   )r   r   r   r   r   r   r   r   r   r   r   )	r!   r"   r#   r   r   r%   r   r'   r   r)   r+   r,   r     s$   


zFairseqWav2Vec1.__init__c                 C   sF   | j rt  | | W  d   S 1 sw   Y  | |S )a  Takes an input waveform and return its corresponding wav2vec encoding.

        Arguments
        ---------
        wav : torch.Tensor
            A batch of audio signals to transform to features.

        Returns
        -------
        wav2vec encoded features
        N)r   r/   r0   r1   detach)r!   r2   r+   r+   r,   r5   :  s
   
 
zFairseqWav2Vec1.forwardc                 C   sB   | j |}| j |d}|dd}| jrt||j}|S )r6   r      r7   )	r   r   feature_aggregatorsqueeze	transposer   r:   r;   r<   )r!   r2   r=   r+   r+   r,   r1   N  s   z FairseqWav2Vec1.extract_featuresc                 C   r>   r?   rA   rC   r+   r+   r,   r   [  rE   zFairseqWav2Vec1.reset_layer)TTT)	rQ   rR   rS   rT   r   r5   r1   r   rU   r+   r+   r)   r,   rV      s    ! rV   )rT   r/   torch.nn.functionalr   
functionalr:   speechbrain.dataio.dataior   speechbrain.utils.data_utilsr   speechbrain.utils.loggerr   r   ImportErrorMSGrQ   r   Moduler   rV   r+   r+   r+   r,   <module>   s$     ^