o
    %ݫiu:                     @   s   d Z ddlmZ ddlZddlmZ ddlmZ ddlmZm	Z	m
Z
mZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ eeZG dd deZdS )zLTransformer for ST in the SpeechBrain style.

Authors
* YAO FEI, CHENG 2021
    )OptionalN)nn)ConformerEncoder)NormalizedEmbeddingTransformerDecoderTransformerEncoderget_key_padding_maskget_lookahead_mask)TransformerASR)Swish)
ModuleList)
get_loggerc                       s   e Zd ZdZddddddejddd	d
deddd
dddddfdee dee	 dee
 deej dee
 dee dee	 dededededef fddZd(ddZd(d d!Zd(d"d#Zd$d% Zd(d&d'Z  ZS ))TransformerSTa  This is an implementation of transformer model for ST.

    The architecture is based on the paper "Attention Is All You Need":
    https://arxiv.org/pdf/1706.03762.pdf

    Arguments
    ---------
    tgt_vocab: int
        Size of vocabulary.
    input_size: int
        Input feature size.
    d_model : int, optional
        Embedding dimension size.
        (default=512).
    nhead : int, optional
        The number of heads in the multi-head attention models (default=8).
    num_encoder_layers : int, optional
        The number of sub-encoder-layers in the encoder (default=6).
    num_decoder_layers : int, optional
        The number of sub-decoder-layers in the decoder (default=6).
    d_ffn : int, optional
        The dimension of the feedforward network model (default=2048).
    dropout : int, optional
        The dropout value (default=0.1).
    activation : torch.nn.Module, optional
        The activation function of FFN layers.
        Recommended: relu or gelu (default=relu).
    positional_encoding: str, optional
        Type of positional encoding used. e.g. 'fixed_abs_sine' for fixed absolute positional encodings.
    normalize_before: bool, optional
        Whether normalization should be applied before or after MHA or FFN in Transformer layers.
        Defaults to True as this was shown to lead to better performance and training stability.
    kernel_size: int, optional
        Kernel size in convolutional layers when Conformer is used.
    bias: bool, optional
        Whether to use bias in Conformer convolutional layers.
    encoder_module: str, optional
        Choose between Conformer and Transformer for the encoder. The decoder is fixed to be a Transformer.
    conformer_activation: torch.nn.Module, optional
        Activation module used after Conformer convolutional layers. E.g. Swish, ReLU etc. it has to be a torch Module.
    attention_type: str, optional
        Type of attention layer used in all Transformer or Conformer layers.
        e.g. regularMHA or RelPosMHA.
    max_length: int, optional
        Max length for the target and source sequence in input.
        Used for positional encodings.
    causal: bool, optional
        Whether the encoder should be causal or not (the decoder is always causal).
        If causal the Conformer convolutional layer is causal.
    ctc_weight: float
        The weight of ctc for asr task
    asr_weight: float
        The weight of asr task for calculating loss
    mt_weight: float
        The weight of mt task for calculating loss
    asr_tgt_vocab: int
        The size of the asr target language
    mt_src_vocab: int
        The size of the mt source language

    Example
    -------
    >>> src = torch.rand([8, 120, 512])
    >>> tgt = torch.randint(0, 720, [8, 120])
    >>> net = TransformerST(
    ...     720, 512, 512, 8, 1, 1, 1024, activation=torch.nn.GELU,
    ...     ctc_weight=1, asr_weight=0.3,
    ... )
    >>> enc_out, dec_out = net.forward(src, tgt)
    >>> enc_out.shape
    torch.Size([8, 120, 512])
    >>> dec_out.shape
    torch.Size([8, 120, 512])
    i         i   g?fixed_abs_sineF   Ttransformer
regularMHAi	  g        r   kernel_sizebiasencoder_moduleconformer_activationattention_type
max_lengthcausal
ctc_weight
asr_weight	mt_weightasr_tgt_vocabmt_src_vocabc                    sJ  t  jdi d|d|d|d|d|d|d|d|d	|	d
|
d|d|d|d|d|d|d|d| |dk r[|dkr[t||||||	|ddd	| _tt||| _|dkrtt||| _|dkr|t||||||	|| j	| j
d	| _n#|dkrt||||||||| j	| j
d
| _|sJ d|d usJ d|   d S )N	tgt_vocab
input_sized_modelnheadnum_encoder_layersnum_decoder_layersd_ffndropout
activationpositional_encodingnormalize_beforer   r   r   r   r   r   r      r   Tr   )	
num_layersr$   r'   r#   r(   r)   r+   r   r   r   )	r$   r-   r'   r#   r(   r)   r+   r   r   	conformer)
r$   r-   r'   r#   r(   r)   r   r   r   r   z+normalize_before must be True for Conformerz%conformer_activation must not be None )super__init__r   asr_decoderr   r   custom_asr_tgt_modulecustom_mt_src_moduler   r   r   
mt_encoderr   _init_params)selfr!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r   r   r   r   r   r   r   r   r   r   r   r    	__class__r/   f/home/ubuntu/.local/lib/python3.10/site-packages/speechbrain/lobes/models/transformer/TransformerST.pyr1   h   s   	


zTransformerST.__init__c                 C   s   |  dkr|j\}}}}	|||||	 }| j||||d\}
}}}| |}| jdkr5|| | }n| jdkrA|| | }| j||||||
d\}}}|S )ad  This method implements a decoding step for asr task

        Arguments
        ---------
        encoder_out : torch.Tensor
            The representation of the encoder (required).
        src : torch.Tensor
            Input sequence (required).
        tgt : torch.Tensor
            The sequence to the decoder (transcription) (required).
        wav_len : torch.Tensor
            Length of input tensors (required).
        pad_idx : int
            The index for <pad> token (default=0).

        Returns
        -------
        asr_decoder_out : torch.Tensor
            One step of asr decoder.
           pad_idxRelPosMHAXLr   tgtmemorymemory_masktgt_masktgt_key_padding_maskmemory_key_padding_mask)	dimshapereshape
make_masksr3   r   positional_encoding_decoderr*   r2   )r7   encoder_outsrcr@   wav_lenr=   bztch1ch2src_key_padding_maskrD   src_maskrC   transcriptionasr_decoder_out_r/   r/   r:   forward_asr   s6   


	zTransformerST.forward_asrc                 C   s   | j |||d\}}}}| |}| jdkr| |}n| jdkr*|| | }d}| j||||d\}	}
| |}| jdkrN|| | }|| | }n| jdkrZ|| | }| j||	||||d\}}
}
|	|fS )a  This method implements a forward step for mt task

        Arguments
        ---------
        src : torch.Tensor
            The sequence to the encoder (transcription) (required).
        tgt : torch.Tensor
            The sequence to the decoder (translation) (required).
        pad_idx : int
            The index for <pad> token (default=0).

        Returns
        -------
        encoder_out : torch.Tensor
            Output of encoder
        decoder_out : torch.Tensor
            Output of decoder
        r<   r>   r   N)rL   rS   rR   pos_embsr?   )	make_masks_for_mtr4   r   r*   positional_encoding_typer5   custom_tgt_modulerJ   decoder)r7   rL   r@   r=   rR   rD   rS   rC   pos_embs_encoderrK   rV   decoder_outr/   r/   r:   
forward_mt  sB   






	zTransformerST.forward_mtc                 C   st   | j |||d\}}}}| |}| jdkr|| | }n| jdkr*|| | }| j||||||d\}}	}
|S )a  This method implements a forward step for mt task using a wav2vec encoder
        (same than above, but without the encoder stack)

        Arguments
        ----------
        src (transcription): torch.Tensor
            output features from the w2v2 encoder
        tgt (translation): torch.Tensor
            The sequence to the decoder (required).
        pad_idx : int
            The index for <pad> token (default=0).
        r<   r>   r   r?   )rY   r[   r   rJ   rZ   r*   r\   )r7   rL   r@   r=   rR   rD   rS   rC   r^   rV   	multiheadr/   r/   r:   forward_mt_decoder_onlyD  s(   


	z%TransformerST.forward_mt_decoder_onlyc                 C   st   t |}| |}| jdkr|| | }|| | }n| jdkr)|| | }| j|||d\}}}||d fS )a  This method implements a decoding step for the transformer model.

        Arguments
        ---------
        tgt : torch.Tensor
            The sequence to the decoder.
        encoder_out : torch.Tensor
            Hidden output of the encoder.

        Returns
        -------
        prediction : torch.Tensor
            The predicted outputs.
        multihead_attns : torch.Tensor
            The last step of attention.
        r>   r   )rC   )r	   r[   r   rJ   rZ   r*   r2   )r7   r@   rK   rC   
predictionrV   multihead_attnsr/   r/   r:   
decode_asrl  s   


zTransformerST.decode_asrc                 C   s:   d}| j rt||d}t||d}d}t|}||||fS )a  This method generates the masks for training the transformer model.

        Arguments
        ---------
        src : torch.Tensor
            The sequence to the encoder (required).
        tgt : torch.Tensor
            The sequence to the decoder (required).
        pad_idx : int
            The index for <pad> token (default=0).

        Returns
        -------
        src_key_padding_mask : torch.Tensor
            Timesteps to mask due to padding
        tgt_key_padding_mask : torch.Tensor
            Timesteps to mask due to padding
        src_mask : torch.Tensor
            Timesteps to mask for causality
        tgt_mask : torch.Tensor
            Timesteps to mask for causality
        Nr<   )trainingr   r	   )r7   rL   r@   r=   rR   rD   rS   rC   r/   r/   r:   rY     s   zTransformerST.make_masks_for_mt)r   )__name__
__module____qualname____doc__r   ReLUr   r   intboolstrModulefloatr1   rW   r_   ra   re   rY   __classcell__r/   r/   r8   r:   r      sj    O
g
7
>("r   )rj   typingr   torchr   .speechbrain.lobes.models.transformer.Conformerr   0speechbrain.lobes.models.transformer.Transformerr   r   r   r   r	   3speechbrain.lobes.models.transformer.TransformerASRr
   speechbrain.nnet.activationsr   speechbrain.nnet.containersr   speechbrain.utils.loggerr   rg   loggerr   r/   r/   r/   r:   <module>   s    