o
    eiG                     @   s   d Z ddlZddlmZ ddlmZmZmZ ddlmZ ddl	m
Z
 G dd dejZG d	d
 d
ejZG dd deZdd Zdd Zdd ZdS )zlThe Attentional RNN model for Grapheme-to-Phoneme

Authors
 * Mirco Ravanelli 2021
 * Artem Ploujnikov 2021
    N)nn)TransformerInterfaceget_key_padding_maskget_lookahead_mask)normalization)Linearc                       s:   e Zd ZdZ			d fdd	ZdddZd	d
 Z  ZS )AttentionSeq2Seqa  
    The Attentional RNN encoder-decoder model

    Arguments
    ---------
    enc: torch.nn.Module
        the encoder module
    encoder_emb: torch.nn.Module
        the encoder_embedding_module
    emb: torch.nn.Module
        the embedding module
    dec: torch.nn.Module
        the decoder module
    lin: torch.nn.Module
        the linear module
    out: torch.nn.Module
        the output layer (typically log_softmax)
    bos_token: int
        the index of the Beginning-of-Sentence token
    use_word_emb: bool
        whether or not to use word embedding
    word_emb_enc: nn.Module
        a module to encode word embeddings
    r   FNc
           
         sR   t    || _|| _|| _|| _|| _|| _|| _|| _	|r$|	| _
d S d | _
d S )N)super__init__encencoder_embembdeclinout	bos_tokenuse_word_embword_emb_enc)
selfr   r   r   r   r   r   r   r   r   	__class__ `/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/speechbrain/lobes/models/g2p/model.pyr
   .   s   
zAttentionSeq2Seq.__init__c                 C   s   |\}}|du rt |d|j}n|\}}| |}| jr%t| j||}| |\}	}| |}
| 	|
|	|\}}| 
|}| |}|||	|fS )aZ  Computes the forward pass

        Arguments
        ---------
        grapheme_encoded: torch.Tensor
            graphemes encoded as a Torch tensor
        phn_encoded: torch.Tensor
            the encoded phonemes
        word_emb: torch.Tensor
            word embeddings (optional)

        Returns
        -------
        p_seq: torch.Tensor
            a (batch x position x token) tensor of token probabilities in each
            position
        char_lens: torch.Tensor
            a tensor of character sequence lengths
        encoder_out:
            the raw output of the encoder
        Nr   )get_dummy_phonemessizedevicer   r   _apply_word_embr   r   r   r   r   r   )r   grapheme_encodedphn_encodedword_embchars	char_lensphn_bos_emb_charencoder_oute_inhwlogitsp_seqr   r   r   forwardE   s   



zAttentionSeq2Seq.forwardc                 C   s*   | j dur
|  |n|}tj||gddS )a  Concatenate character embeddings with word embeddings,
        possibly encoding the word embeddings if an encoder
        is provided

        Arguments
        ---------
        emb_char: torch.Tensor
            the character embedding tensor
        word_emb: torch.Tensor
            the word embedding tensor

        Returns
        -------
        result: torch.Tensor
            the concatenation of the tensorNdim)r   torchcat)r   r$   r   r   r   r   r   r   n   s
   

z AttentionSeq2Seq._apply_word_emb)r   FNNN)__name__
__module____qualname____doc__r
   r+   r   __classcell__r   r   r   r   r      s    !
)r   c                       sF   e Zd ZdZ	d
 fdd	Zdd Zdd Zejej	ej
d	Z  ZS )WordEmbeddingEncodera  A small encoder module that reduces the dimensionality
    and normalizes word embeddings

    Arguments
    ---------
    word_emb_dim: int
        the dimension of the original word embeddings
    word_emb_enc_dim: int
        the dimension of the encoded word embeddings
    norm: torch.nn.Module
        the normalization to be used (
            e.g. speechbrain.nnet.normalization.LayerNorm)
    norm_type: str
        the type of normalization to be used
    Nc                    sL   t    || _|| _|r| ||| _n|| _t||d| _t	 | _
d S )N)	n_neurons
input_size)r	   r
   word_emb_dimword_emb_enc_dim	_get_normnormr   r   r   Tanh
activation)r   r:   r;   r=   	norm_typer   r   r   r
      s   
zWordEmbeddingEncoder.__init__c                 C   s(   | j |}|std| ||dS )a"  Determines the type of normalizer

        Arguments
        ---------
        norm: str
            the normalization type: "batch", "layer" or "instance
        dim: int
            the dimensionality of the inputs

        Returns
        -------
        The normalized outputs.
        zInvalid norm: )r9   )NORMSget
ValueError)r   r=   r.   norm_clsr   r   r   r<      s   
zWordEmbeddingEncoder._get_normc                 C   s,   | j dur
|  |}| |}| |}|S )zComputes the forward pass of the embedding

        Arguments
        ---------
        emb: torch.Tensor
            the original word embeddings

        Returns
        -------
        emb_enc: torch.Tensor
            encoded word embeddings
        N)r=   r   r?   )r   r   xr   r   r   r+      s
   



zWordEmbeddingEncoder.forward)batchlayerinstancer1   )r2   r3   r4   r5   r
   r<   r+   r   BatchNorm1d	LayerNormInstanceNorm1drA   r6   r   r   r   r   r7      s    r7   c                       sx   e Zd ZdZddddddejdddd	d
d	dddddddddddf fdd	ZdddZdd ZdddZ	dd Z
  ZS )TransformerG2Pu]  
    A Transformer-based Grapheme-to-Phoneme model

    Arguments
    ----------
    emb: torch.nn.Module
        the embedding module
    encoder_emb: torch.nn.Module
        the encoder embedding module
    char_lin: torch.nn.Module
        a linear module connecting the inputs
        to the transformer
    phn_lin: torch.nn.Module
        a linear module connecting the outputs to
        the transformer
    out: torch.nn.Module
        the decoder module (usually Softmax)
    lin: torch.nn.Module
        the linear module for outputs
    d_model: int
        The number of expected features in the encoder/decoder inputs (default=512).
    nhead: int
        The number of heads in the multi-head attention models (default=8).
    num_encoder_layers: int, optional
        The number of encoder layers in1ì the encoder.
    num_decoder_layers: int, optional
        The number of decoder layers in the decoder.
    dim_ffn: int, optional
        The dimension of the feedforward network model hidden layer.
    dropout: int, optional
        The dropout value.
    activation: torch.nn.Module, optional
        The activation function for Feed-Forward Network layer,
        e.g., relu or gelu or swish.
    custom_src_module: torch.nn.Module, optional
        Module that processes the src features to expected feature dim.
    custom_tgt_module: torch.nn.Module, optional
        Module that processes the src features to expected feature dim.
    positional_encoding: str, optional
        Type of positional encoding used. e.g. 'fixed_abs_sine' for fixed absolute positional encodings.
    normalize_before: bool, optional
        Whether normalization should be applied before or after MHA or FFN in Transformer layers.
        Defaults to True as this was shown to lead to better performance and training stability.
    kernel_size: int, optional
        Kernel size in convolutional layers when Conformer is used.
    bias: bool, optional
        Whether to use bias in Conformer convolutional layers.
    encoder_module: str, optional
        Choose between Conformer and Transformer for the encoder. The decoder is fixed to be a Transformer.
    conformer_activation: torch.nn.Module, optional
        Activation module used after Conformer convolutional layers. E.g. Swish, ReLU etc. it has to be a torch Module.
    attention_type: str, optional
        Type of attention layer used in all Transformer or Conformer layers.
        e.g. regularMHA or RelPosMHA.
    max_length: int, optional
        Max length for the target and source sequence in input.
        Used for positional encodings.
    causal: bool, optional
        Whether the encoder should be causal or not (the decoder is always causal).
        If causal the Conformer convolutional layer is causal.
    pad_idx: int
        the padding index (for masks)
    encoder_kdim: int, optional
        Dimension of the key for the encoder.
    encoder_vdim: int, optional
        Dimension of the value for the encoder.
    decoder_kdim: int, optional
        Dimension of the key for the decoder.
    decoder_vdim: int, optional
        Dimension of the value for the decoder.
    i         i   g?Nfixed_abs_sineT   transformer
regularMHAi	  Fr   c                    s   t  jdi d|d|d|	d|
d|d|d|d|d	|d
|d|d|d|d|d|d|d|d|d|d|d| || _|| _|| _|| _|| _|| _|| _|| _	|| _
|   d S )Nd_modelnheadnum_encoder_layersnum_decoder_layersd_ffndropoutr?   custom_src_modulecustom_tgt_modulepositional_encodingnormalize_beforekernel_sizebiasencoder_moduleattention_type
max_lengthcausalencoder_kdimencoder_vdimdecoder_kdimdecoder_vdimr   )r	   r
   r   r   char_linphn_linr   r   pad_idxr   r   _reset_params)r   r   r   rg   rh   r   r   rS   rT   rU   rV   rW   rX   r?   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   ri   rc   rd   re   rf   r   r   r   r   r   r
     sj   !	
zTransformerG2P.__init__c              
   C   s`  |\}}|du rt |d|j}n|\}}| |}| jr%t| j||}| |}	| |}
| 	|
}
| j
|	|
|| jd\}}}}d}| jdkrO| |	}n| jdkr]|	| |	 }	d}| j|	|||d\}}| jdkr|
| |
 }
|	| |	 }	d}d}n| jdkr|
| |
 }
d}d}| j|
|||||||d\}}}| |}| |}||||fS )au  Computes the forward pass

        Arguments
        ---------
        grapheme_encoded: torch.Tensor
            graphemes encoded as a Torch tensor
        phn_encoded: torch.Tensor
            the encoded phonemes
        word_emb: torch.Tensor
            word embeddings (if applicable)

        Returns
        -------
        p_seq: torch.Tensor
            the log-probabilities of individual tokens i a sequence
        char_lens: torch.Tensor
            the character length syntax
        encoder_out: torch.Tensor
            the encoder state
        attention: torch.Tensor
            the attention state
        Nr   ri   RelPosMHAXLrO   )srcsrc_masksrc_key_padding_maskpos_embs)tgtmemorymemory_masktgt_masktgt_key_padding_maskmemory_key_padding_maskpos_embs_tgtpos_embs_src)r   r   r   r   r   r   r   rg   r   rh   
make_masksri   r`   r[   positional_encoding_typeencoderpositional_encoding_decoderdecoderr   r   )r   r   r   r   r    r!   phnr#   r$   rm   rq   ro   ru   rn   rt   pos_embs_encoderr%   pos_embs_targetdecoder_out	attentionr)   r*   r   r   r   r+   ^  sd   











zTransformerG2P.forwardc                 C   s,   |   D ]}| dkrtjj| qdS )z"Resets the parameters of the model   N)
parametersr.   r/   r   initxavier_normal_)r   pr   r   r   rj     s
   zTransformerG2P._reset_paramsc           
      C   sr   |dur't ||jd  }t |jd dddf ||dddf k}t||d}d}t|}	||||	fS )a  This method generates the masks for training the transformer model.

        Arguments
        ---------
        src : torch.Tensor
            The sequence to the encoder (required).
        tgt : torch.Tensor
            The sequence to the decoder (required).
        src_len : torch.Tensor
            Lengths corresponding to the src tensor.
        pad_idx : int
            The index for <pad> token (default=0).

        Returns
        -------
        src_key_padding_mask: torch.Tensor
            the source key padding mask
        tgt_key_padding_mask: torch.Tensor
            the target key padding masks
        src_mask: torch.Tensor
            the source mask
        tgt_mask: torch.Tensor
            the target mask
        Nr   rk   )r/   roundshapearangetor   r   )
r   rm   rq   src_lenri   abs_lenro   ru   rn   rt   r   r   r   ry     s    zTransformerG2P.make_masksc           	      C   s   t |}| |}| |}| jdkr"|| | }|| | }n| jdkr.|| | }| j|||ddd\}}}|d }||fS )aX  This method implements a decoding step for the transformer model.

        Arguments
        ---------
        tgt : torch.Tensor
            The sequence to the decoder.
        encoder_out : torch.Tensor
            Hidden output of the encoder.
        enc_lens : torch.Tensor
            The corresponding lengths of the encoder inputs.

        Returns
        -------
        prediction: torch.Tensor
            the predicted sequence
        attention: torch.Tensor
            the attention matrix corresponding to the last attention head
            (useful for plotting attention)
        rl   rO   N)rt   rw   rx   r,   )r   r   rh   r`   r|   rz   r[   r}   )	r   rq   r%   enc_lensrt   
prediction
self_attnsmultihead_attnsr   r   r   r   decode  s&   



zTransformerG2P.decoder1   )Nr   )r2   r3   r4   r5   r   ReLUr
   r+   rj   ry   r   r6   r   r   r   r   rL      s>    P
DU
&rL   c                 C   s   || |  S )ab  Computes the input dimension (intended for hparam files)

    Arguments
    ---------
    use_word_emb: bool
        whether to use word embeddings
    embedding_dim: int
        the embedding dimension
    word_emb_enc_dim: int
        the dimension of encoded word embeddings

    Returns
    -------
    input_dim: int
        the input dimension
    r   )r   embedding_dimr;   r   r   r   	input_dim	  s   r   c                 C   s*   | dur	| |j n|j } tj|| gddS )a  
    Concatenates character and word embeddings together, possibly
    applying a custom encoding/transformation

    Arguments
    ---------
    word_emb_enc: callable
        an encoder to apply (typically, speechbrain.lobes.models.g2p.model.WordEmbeddingEncoder)
    emb_char: torch.Tensor
        character embeddings
    word_emb: char
        word embeddings

    Returns
    -------
    result: torch.Tensor
        the resulting (concatenated) tensor
    Nr,   r-   )datar/   r0   )r   r$   r   r   r   r   r     s
   
r   c                 C   s   t jdg|d| dS )z
    Creates a dummy phoneme sequence

    Arguments
    ---------
    batch_size: int
        the batch size
    device: str
        the target device

    Returns
    -------
    result: torch.Tensor
    r   )r   r   )r/   tensorexpand)
batch_sizer   r   r   r   r   8  s   r   )r5   r/   r   0speechbrain.lobes.models.transformer.Transformerr   r   r   speechbrain.nnetr   speechbrain.nnet.linearr   Moduler   r7   rL   r   r   r   r   r   r   r   <module>   s    rK  :