o
    %ݫi
                     @   sh   d Z ddlZddlmZ ddlmZmZmZmZ ddlm	Z	 ddl
mZ ddlmZ G dd	 d	eZdS )
z]An implementation of Transformer Language model.

Authors
* Jianyuan Zhong
* Samuele Cornell
    N)nn)NormalizedEmbeddingTransformerInterfaceget_key_padding_maskget_lookahead_mask)
ModuleList)Linear)	LayerNormc                       s\   e Zd ZdZddddddejdd	d
dddd	f fdd	Zdd Zdd Z	dddZ	  Z
S )TransformerLMa  This is an implementation of transformer language model.

    The architecture is based on the paper "Attention Is All You Need": https://arxiv.org/pdf/1706.03762.pdf

    Arguments
    ---------
    vocab : int
        Embedding vocabulary size
    d_model : int
        The number of expected features in the encoder/decoder inputs (default=512).
    nhead : int
        The number of heads in the multiheadattention models (default=8).
    num_encoder_layers : int
        The number of sub-encoder-layers in the encoder (default=12).
    num_decoder_layers : int
        The number of sub-decoder-layers in the decoder (default=0).
    d_ffn : int
        The dimension of the feedforward network model (default=2048).
    dropout : float
        The dropout value (default=0.1).
    activation: torch class
        The activation function of encoder/decoder intermediate layer, relu or gelu (default=relu).
    positional_encoding : str
        Type of positional encoding, default "fixed_abs_sine"
    normalize_before : bool
        Whether to normalize before each layer.
    d_embedding : int
        Size of embedding, if None use d_model.
    max_length : int
        Maximum sequence length, default 2500 tokens.
    causal : bool
        Whether to incorporate future information in decoding, default True.
    attention_type : str
        Type of attention to use, one of "regularMHA" or "RelPosMHAXL"
    decoder_use_memory: bool
        whether to use the hidden state in the decoder

    Example
    -------
    >>> src = torch.randint(0, 720, [8, 120])
    >>> net = TransformerLM(720, 512, 8, 1, 0, 1024, activation=torch.nn.GELU)
    >>> enc_out = net.forward(src)
    >>> print(enc_out.shape)
    torch.Size([8, 120, 720])
    i         r   i   g?fixed_abs_sineFNi	  T
regularMHAc                    s   t  j||||||||	|
|||d || _|d u r|| _t| j|| _d | _|d ur2t| j|d| _tt||dt|ddt||d| _	|| _
|| _|| _|   d S )N)d_modelnheadnum_encoder_layersnum_decoder_layersd_ffndropout
activationpositional_encodingnormalize_before
max_lengthcausalattention_type)
input_size	n_neuronsgư>)eps)super__init__d_embeddingr   custom_src_moduleembedding_projr   r   r	   output_projr   r   decoder_use_memory_reset_params)selfvocabr   r   r   r   r   r   r   r   r   r    r   r   r   r$   	__class__ f/home/ubuntu/.local/lib/python3.10/site-packages/speechbrain/lobes/models/transformer/TransformerLM.pyr   E   s@   


zTransformerLM.__init__c                 C   s   |  |\}}| |}| jdur| |}|| | }| jdkr,| j|||d\}}| jdkrL| jrA| j||||d\}}}n| j||||d\}}| 	|}|S )z
        Arguments
        ---------
        src : torch.Tensor
            The sequence to the encoder (required).

        Returns
        -------
        pred : torch.Tensor
            Output of the transformer.
        Nr   )srcsrc_masksrc_key_padding_mask)tgtmemorytgt_masktgt_key_padding_mask)r,   r/   r1   r2   )

make_masksr!   r"   r   r   encoderr   r$   decoderr#   )r&   r,   r-   r.   encoder_out_predr*   r*   r+   forward   s6   







zTransformerLM.forwardc                 C   s,   |   D ]}| dkrtjj| qd S )N   )
parametersdimtorchr   initxavier_normal_)r&   pr*   r*   r+   r%      s
   zTransformerLM._reset_paramsc                 C   s*   d }|rt |}d }|rt||}||fS )N)r   r   )r&   r,   pad_idxlook_ahead_maskpadding_maskr-   r.   r*   r*   r+   r3      s   
zTransformerLM.make_masks)r   TT)__name__
__module____qualname____doc__r   ReLUr   r9   r%   r3   __classcell__r*   r*   r(   r+   r
      s*    1:,r
   )rG   r=   r   0speechbrain.lobes.models.transformer.Transformerr   r   r   r   speechbrain.nnet.containersr   speechbrain.nnet.linearr   speechbrain.nnet.normalizationr	   r
   r*   r*   r*   r+   <module>   s    