o
    %ݫi9                     @   s  d Z ddlZddlmZ ddlZddlmZ ddlZddl	m
Z
 ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ G dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZdd Zdd ZdddZdS ) zTransformer implementation in the SpeechBrain style.
Authors
* Jianyuan Zhong 2020
* Samuele Cornell 2021
* Shucong Zhang 2024
    N)Optional)Swish)RelPosEncXL)Conv1d)map_old_state_dict_weights   )BranchformerEncoder)ConformerEncoderc                        s   e Zd ZdZddddddejdddd	d
d	deejddddddddejdddfde	e
 de	e de	e de	ej de	ej de	e de	e
 de	e de	e
 de	e
 de	e
 de	e
 de	e
 de	ej de	e f fd d!Zd"d# Z  ZS )$TransformerInterfaceuD  This is an interface for transformer model.
    Users can modify the attributes and define the forward function as
    needed according to their own tasks.
    The architecture is based on the paper "Attention Is All You Need":
    https://arxiv.org/pdf/1706.03762.pdf

    Arguments
    ---------
    d_model: int
        The number of expected features in the encoder/decoder inputs (default=512).
    nhead: int
        The number of heads in the multi-head attention models (default=8).
    num_encoder_layers: int, optional
        The number of encoder layers in1ì the encoder.
    num_decoder_layers: int, optional
        The number of decoder layers in the decoder.
    d_ffn: int, optional
        The dimension of the feedforward network model hidden layer.
    dropout: int, optional
        The dropout value.
    activation: torch.nn.Module, optional
        The activation function for Feed-Forward Network layer,
        e.g., relu or gelu or swish.
    custom_src_module: torch.nn.Module, optional
        Module that processes the src features to expected feature dim.
    custom_tgt_module: torch.nn.Module, optional
        Module that processes the src features to expected feature dim.
    positional_encoding: str, optional
        Type of positional encoding used. e.g. 'fixed_abs_sine' for fixed absolute positional encodings.
    normalize_before: bool, optional
        Whether normalization should be applied before or after MHA or FFN in Transformer layers.
        Defaults to True as this was shown to lead to better performance and training stability.
    kernel_size: int, optional
        Kernel size in convolutional layers when Conformer is used.
    bias: bool, optional
        Whether to use bias in Conformer convolutional layers.
    encoder_module: str, optional
        Choose between Branchformer, Conformer and Transformer for the encoder. The decoder is fixed to be a Transformer.
    conformer_activation: torch.nn.Module, optional
        Activation module used after Conformer convolutional layers. E.g. Swish, ReLU etc. it has to be a torch Module.
    branchformer_activation: torch.nn.Module, optional
        Activation module used within the Branchformer Encoder. E.g. Swish, ReLU etc. it has to be a torch Module.
    attention_type: str, optional
        Type of attention layer used in all Transformer or Conformer layers.
        e.g. regularMHA or RelPosMHA.
    max_length: int, optional
        Max length for the target and source sequence in input.
        Used for positional encodings.
    causal: bool, optional
        Whether the encoder should be causal or not (the decoder is always causal).
        If causal the Conformer convolutional layer is causal.
    encoder_kdim: int, optional
        Dimension of the key for the encoder.
    encoder_vdim: int, optional
        Dimension of the value for the encoder.
    decoder_kdim: int, optional
        Dimension of the key for the decoder.
    decoder_vdim: int, optional
        Dimension of the value for the decoder.
    csgu_linear_units: int, optional
        Number of neurons in the hidden linear units of the CSGU Module.
        -> Branchformer
    gate_activation: torch.nn.Module, optional
        Activation function used at the gate of the CSGU module.
        -> Branchformer
    use_linear_after_conv: bool, optional
        If True, will apply a linear transformation of size input_size//2.
        -> Branchformer
    output_hidden_states: bool, optional
        Whether the model should output the hidden states as a list of tensor.
    layerdrop_prob: float
        The probability to drop an entire layer.
    i         i   g?Nfixed_abs_sineT   transformer
regularMHA	  Fi           kernel_sizebiasencoder_moduleconformer_activationbranchformer_activationattention_type
max_lengthcausalencoder_kdimencoder_vdimdecoder_kdimdecoder_vdimcsgu_linear_unitsgate_activationuse_linear_after_convc                    s  t    || _|| _|
| _|| _|| _|| _|| _|| _	|| _
|dv s&J |
dv s,J || dks6J d|
dkrAt||| _n|
d u rF	 |dkrUt|| _t||| _|dkr_t||| _|dkr|d url||| _|dkrt|||||||| j| j| j| j| j	| j
d	| _n@|d
krt||||||||| j| j| j	| j
d| _|sJ d|d usJ dn|dkrt||||||| j|||| j	| j
d| _|dkr|	d ur|	|| _t|||||||dd| j| jd| _d S d S )N)r   RelPosMHAXLhypermixingRoPEMHA)r   Nr   zGnumber of encoder layers and number of decoder layers cannot both be 0!r   r"   r$   r   )nhead
num_layersd_ffnd_modeldropout
activationnormalize_beforer   r   kdimvdimoutput_hidden_stateslayerdrop_prob	conformer)r%   r&   r'   r(   r)   r*   r   r   r   r   r.   r/   z+normalize_before must be True for Conformerz%conformer_activation must not be Nonebranchformer)r%   r&   r(   r)   r*   r   r   r   r    r!   r.   r/   Tr   )r&   r%   r'   r(   r)   r*   r+   r   r   r,   r-   )super__init__r   r   positional_encoding_typer   r   r   r   r.   r/   PositionalEncodingpositional_encodingr   positional_encoding_decodercustom_src_moduleTransformerEncoderencoderr	   r   custom_tgt_moduleTransformerDecoderdecoder)selfr(   r%   num_encoder_layersnum_decoder_layersr'   r)   r*   r8   r;   r6   r+   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r.   r/   	__class__ d/home/ubuntu/.local/lib/python3.10/site-packages/speechbrain/lobes/models/transformer/Transformer.pyr3   c   s   





zTransformerInterface.__init__c                 K   s   t )z?Users should modify this function according to their own tasks.)NotImplementedError)r>   kwagsrC   rC   rD   forward   s   zTransformerInterface.forward)__name__
__module____qualname____doc__nnReLUr   GELUIdentityr   intboolstrModuler3   rG   __classcell__rC   rC   rA   rD   r
      s~    L r
   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )r5   a  This class implements the absolute sinusoidal positional encoding function.
    PE(pos, 2i)   = sin(pos/(10000^(2i/dmodel)))
    PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel)))

    Arguments
    ---------
    input_size: int
        Embedding dimension.
    max_len : int, optional
        Max length of the input sequences (default 2500).

    Example
    -------
    >>> a = torch.rand((8, 120, 512))
    >>> enc = PositionalEncoding(input_size=a.shape[-1])
    >>> b = enc(a)
    >>> b.shape
    torch.Size([1, 120, 512])
    r   c                    s   t    |d dkrtd| d|| _tj| j|dd}td| jd }t	td|d t
d|   }t|| |d d dd df< t|| |d d dd df< |d}| d	| d S )
N   r   zGCannot use sin/cos positional encoding with odd channels (got channels=)F)requires_gradr   g     @pe)r2   r3   
ValueErrormax_lentorchzerosarange	unsqueezefloatexpmathlogsincosregister_buffer)r>   
input_sizerZ   rX   	positionsdenominatorrA   rC   rD   r3     s"   

  
zPositionalEncoding.__init__c                 C   s$   | j ddd|df   S )z
        Arguments
        ---------
        x : torch.Tensor
            Input feature shape (batch, time, fea)

        Returns
        -------
        The positional encoding.
        Nr   )rX   sizeclonedetachr>   xrC   rC   rD   rG   $  s   $zPositionalEncoding.forward)r   rH   rI   rJ   rK   r3   rG   rT   rC   rC   rA   rD   r5      s    r5   c                	       sj   e Zd ZdZdddejdddddgdf	 fdd		Z			dd
eej	 deej	 deej	 fddZ
  ZS )TransformerEncoderLayeraM  This is an implementation of self-attention encoder layer.

    Arguments
    ---------
    d_ffn: int, optional
        The dimension of the feedforward network model hidden layer.
    nhead: int
        The number of heads in the multi-head attention models (default=8).
    d_model: int
        The number of expected features in the encoder/decoder inputs (default=512).
    kdim: int, optional
        Dimension of the key.
    vdim: int, optional
        Dimension of the value.
    dropout: int, optional
        The dropout value.
    activation: torch.nn.Module, optional
        The activation function for Feed-Forward Network layer,
        e.g., relu or gelu or swish.
    normalize_before: bool, optional
        Whether normalization should be applied before or after MHA or FFN in Transformer layers.
        Defaults to True as this was shown to lead to better performance and training stability.
    attention_type: str, optional
        Type of attention layer used in all Transformer or Conformer layers.
        e.g. regularMHA or RelPosMHA.
    ffn_type: str
        type of ffn: regularFFN/1dcnn
    ffn_cnn_kernel_size_list: list of int
        kernel size of 2 1d-convs if ffn_type is 1dcnn
    causal: bool, optional
        Whether the encoder should be causal or not (the decoder is always causal).
        If causal the Conformer convolutional layer is causal.

    Example
    -------
    >>> import torch
    >>> x = torch.rand((8, 60, 512))
    >>> net = TransformerEncoderLayer(512, 8, d_model=512)
    >>> output = net(x)
    >>> output[0].shape
    torch.Size([8, 60, 512])
    Nr   Fr   
regularFFN   c              
      sR  t    |	dkrtjjj|||||d| _n1|	dkr(tjjj||||d| _n |	dkr:tjjj	||d|dd| _n|	dkrHtjj
|||| _|
d	krYtjjj||||d
| _n&|
dkrtt|||d |ridnddt t|||d |rydndd| _tjjj|dd| _tjjj|dd| _tj|| _tj|| _|| _|
| _d S )Nr   )r%   r(   r)   r,   r-   r"   mask_pos_futurer#   F)input_output_dimhypernet_sizetied	num_headsfix_tm_hidden_sizer$   rp   r'   rf   r)   r*   1dcnnr   r   same)in_channelsout_channelsr   paddingr   ư>eps)r2   r3   sbnnet	attentionMultiheadAttentionself_attr"   r#   HyperMixingr$   PositionalwiseFeedForwardpos_ffnrL   
Sequentialr   rM   normalization	LayerNormnorm1norm2r[   Dropoutdropout1dropout2r+   pos_ffn_type)r>   r'   r%   r(   r,   r-   r)   r*   r+   r   ffn_typeffn_cnn_kernel_size_listr   rA   rC   rD   r3   ^  sp   







z TransformerEncoderLayer.__init__src_masksrc_key_padding_maskpos_embsc                 C   s   | j r	| |}n|}| j||||||d\}}|| | }| j s'| |}| j r0| |}n|}| |}|| | }| j sF| |}||fS )a'  
        Arguments
        ---------
        src : torch.Tensor
            The sequence to the encoder layer.
        src_mask : torch.Tensor
            The mask for the src query for each example in the batch.
        src_key_padding_mask : torch.Tensor, optional
            The mask for the src keys for each example in the batch.
        pos_embs: torch.Tensor, optional
            The positional embeddings tensor.

        Returns
        -------
        output : torch.Tensor
            The output of the transformer encoder layer.
        )	attn_maskkey_padding_maskr   )r+   r   r   r   r   r   r   )r>   srcr   r   r   src1output	self_attnrC   rC   rD   rG     s,   




zTransformerEncoderLayer.forward)NNNrH   rI   rJ   rK   rL   rM   r3   r   r[   TensorrG   rT   rC   rC   rA   rD   ro   2  s,    0Pro   c                       st   e Zd ZdZdddddejdddddddgdf fdd		Z				dd
eej	 deej	 deej	 fddZ
  ZS )r9   aG  This class implements the transformer encoder.

    Arguments
    ---------
    num_layers : int
        Number of transformer layers to include.
    nhead : int
        Number of attention heads.
    d_ffn : int
        Hidden size of self-attention Feed Forward layer.
    input_shape : tuple
        Expected shape of the input.
    d_model : int
        The dimension of the input embedding.
    kdim : int
        Dimension for key (Optional).
    vdim : int
        Dimension for value (Optional).
    dropout : float
        Dropout for the encoder (Optional).
    activation: torch.nn.Module, optional
        The activation function for Feed-Forward Network layer,
        e.g., relu or gelu or swish.
    normalize_before: bool, optional
        Whether normalization should be applied before or after MHA or FFN in Transformer layers.
        Defaults to True as this was shown to lead to better performance and training stability.
    causal: bool, optional
        Whether the encoder should be causal or not (the decoder is always causal).
        If causal the Conformer convolutional layer is causal.
    layerdrop_prob: float
        The probability to drop an entire layer
    attention_type: str, optional
        Type of attention layer used in all Transformer or Conformer layers.
        e.g. regularMHA or RelPosMHA.
    ffn_type: str
        type of ffn: regularFFN/1dcnn
    ffn_cnn_kernel_size_list: list of int
        conv kernel size of 2 1d-convs if ffn_type is 1dcnn
    output_hidden_states: bool, optional
        Whether the model should output the hidden states as a list of tensor.

    Example
    -------
    >>> import torch
    >>> x = torch.rand((8, 60, 512))
    >>> net = TransformerEncoder(1, 8, 512, d_model=512)
    >>> output, _ = net(x)
    >>> output.shape
    torch.Size([8, 60, 512])

    >>> import torch
    >>> x = torch.rand((8, 60, 512))
    >>> net = TransformerEncoder(1, 8, 512, d_model=512, output_hidden_states=True)
    >>> output, attn_list, hidden_list = net(x)
    >>> hidden_list[0].shape
    torch.Size([8, 60, 512])
    >>> len(hidden_list)
    2
    Nr   Fr   rp   rq   c                    sd   t    tj 	
fddt|D | _tjj	j
dd| _|| _|| _d S )Nc                    s,   g | ]}t 	 
d qS ))r'   r%   r(   r,   r-   r)   r*   r+   r   r   r   r   )ro   ).0ir*   r   r   r'   r(   r)   r   r   r,   r%   r+   r-   rC   rD   
<listcomp>7  s"    z/TransformerEncoder.__init__.<locals>.<listcomp>r   r   )r2   r3   r[   rL   
ModuleListrangelayersr   r   r   r   normr/   r.   )r>   r&   r%   r'   input_shaper(   r,   r-   r)   r*   r+   r   r/   r   r   r   r.   rA   r   rD   r3   !  s   
 
zTransformerEncoder.__init__r   r   r   c                 C   s   |du sJ d|}| j dkrtt| j}g }| jr|g}	t| jD ]*\}
}| jr7| j dks7||
 | j krN|||||d\}}|| | jrN|	| q$| 	|}| jr\|||	fS ||fS )a,  
        Arguments
        ---------
        src : torch.Tensor
            The sequence to the encoder layer (required).
        src_mask : torch.Tensor
            The mask for the src sequence (optional).
        src_key_padding_mask : torch.Tensor
            The mask for the src keys per batch (optional).
        pos_embs : torch.Tensor
            The positional embedding tensor
        dynchunktrain_config : config
            Not supported for this encoder.

        Returns
        -------
        output : torch.Tensor
            The output of the transformer.
        attention_lst : list
            The attention values.
        hidden_state_lst : list, optional
            The output of the hidden layers of the encoder.
            Only works if output_hidden_states is set to true.
        Nz3Dynamic Chunk Training unsupported for this encoderr   )r   r   r   )
r/   r[   randlenr   r.   	enumeratetrainingappendr   )r>   r   r   r   r   dynchunktrain_configr   
keep_probsattention_lsthidden_state_lstr   	enc_layerr   rC   rC   rD   rG   M  s8   
!






zTransformerEncoder.forward)NNNNr   rC   rC   rA   rD   r9     s6    A/r9   c                       sT   e Zd ZdZdddejdddf fdd	Z						ddd	Z fd
dZ  Z	S )TransformerDecoderLayera  This class implements the self-attention decoder layer.

    Arguments
    ---------
    d_ffn : int
        Hidden size of self-attention Feed Forward layer.
    nhead : int
        Number of attention heads.
    d_model : int
        Dimension of the model.
    kdim : int
        Dimension for key (optional).
    vdim : int
        Dimension for value (optional).
    dropout : float
        Dropout for the decoder (optional).
    activation : Callable
        Function to use between layers, default nn.ReLU
    normalize_before : bool
        Whether to normalize before layers.
    attention_type : str
        Type of attention to use, "regularMHA" or "RelPosMHAXL"
    causal : bool
        Whether to mask future positions.

    Example
    -------
    >>> src = torch.rand((8, 60, 512))
    >>> tgt = torch.rand((8, 60, 512))
    >>> net = TransformerDecoderLayer(1024, 8, d_model=512)
    >>> output, self_attn, multihead_attn = net(src, tgt)
    >>> output.shape
    torch.Size([8, 60, 512])
    Nr   Fr   c                    s  t    || _|	dkr'tjjj|||||d| _tjjj|||||d| _n|	dkrCtjjj	||||
d| _tjjj	||||
d| _tjjj
||||d| _tjjj|dd| _tjjj|dd| _tjjj|dd| _tj|| _tj|| _tj|| _|| _d S )Nr   )r%   r(   r,   r-   r)   r"   rr   ry   r   r   )r2   r3   r%   r   r   r   r   r   multihead_attnr"   r   r   r   r   r   r   norm3r[   rL   r   r   r   dropout3r+   )r>   r'   r%   r(   r,   r-   r)   r*   r+   r   r   rA   rC   rD   r3     sJ   


z TransformerDecoderLayer.__init__c	                 C   s   | j r	| |}	n|}	| j|	|	|	|||d\}
}|| |
 }| j s'| |}| j r0| |}	n|}	| j|	|||||d\}
}|| |
 }| j sN| |}| j rW| |}	n|}	| |	}
|| 	|
 }| j sm| |}|||fS )a3  
        Arguments
        ----------
        tgt: torch.Tensor
            The sequence to the decoder layer (required).
        memory: torch.Tensor
            The sequence from the last layer of the encoder (required).
        tgt_mask: torch.Tensor
            The mask for the tgt sequence (optional).
        memory_mask: torch.Tensor
            The mask for the memory sequence (optional).
        tgt_key_padding_mask: torch.Tensor
            The mask for the tgt keys per batch (optional).
        memory_key_padding_mask: torch.Tensor
            The mask for the memory keys per batch (optional).
        pos_embs_tgt: torch.Tensor
            The positional embeddings for the target (optional).
        pos_embs_src: torch.Tensor
            The positional embeddings for the source (optional).
        )querykeyvaluer   r   r   )
r+   r   r   r   r   r   r   r   r   r   )r>   tgtmemorytgt_maskmemory_masktgt_key_padding_maskmemory_key_padding_maskpos_embs_tgtpos_embs_srctgt1tgt2r   multihead_attentionrC   rC   rD   rG     sH   








zTransformerDecoderLayer.forwardc                    s2   ddi}t ||}t j||g|R i | dS )zFLoad the model from a state_dict and map the old keys to the new keys.mutihead_attentionr   N)r   r2   _load_from_state_dict)r>   
state_dictprefixargskwargsmappingrA   rC   rD   r   B  s   
 z-TransformerDecoderLayer._load_from_state_dictNNNNNN)
rH   rI   rJ   rK   rL   rM   r3   rG   r   rT   rC   rC   rA   rD   r     s$    (<
Ur   c                       sH   e Zd ZdZdddejdddf fdd	Z						d
dd	Z  ZS )r<   aP  This class implements the Transformer decoder.

    Arguments
    ---------
    num_layers : int
        Number of transformer layers for the decoder.
    nhead : int
        Number of attention heads.
    d_ffn : int
        Hidden size of self-attention Feed Forward layer.
    d_model : int
        Dimension of the model.
    kdim : int, optional
        Dimension for key (Optional).
    vdim : int, optional
        Dimension for value (Optional).
    dropout : float, optional
        Dropout for the decoder (Optional).
    activation : Callable
        The function to apply between layers, default nn.ReLU
    normalize_before : bool
        Whether to normalize before layers.
    causal : bool
        Whether to allow future information in decoding.
    attention_type : str
        Type of attention to use, "regularMHA" or "RelPosMHAXL"

    Example
    -------
    >>> src = torch.rand((8, 60, 512))
    >>> tgt = torch.rand((8, 60, 512))
    >>> net = TransformerDecoder(1, 8, 1024, d_model=512)
    >>> output, _, _ = net(src, tgt)
    >>> output.shape
    torch.Size([8, 60, 512])
    Nr   Fr   c                    sT   t    tj 	f
ddt|D | _tjj	j
dd| _d S )Nc                    s(   g | ]}t 	 d 
qS ))
r'   r%   r(   r,   r-   r)   r*   r+   r   r   )r   )r   _
r*   r   r   r'   r(   r)   r,   r%   r+   r-   rC   rD   r     s    z/TransformerDecoder.__init__.<locals>.<listcomp>r   r   )r2   r3   r[   rL   r   r   r   r   r   r   r   r   )r>   r&   r%   r'   r(   r,   r-   r)   r*   r+   r   r   rA   r   rD   r3   o  s   
zTransformerDecoder.__init__c	                 C   s`   |}	g g }
}| j D ]}||	|||||||d\}	}}|
| || q
| |	}	|	|
|fS )a;  
        Arguments
        ----------
        tgt : torch.Tensor
            The sequence to the decoder layer (required).
        memory : torch.Tensor
            The sequence from the last layer of the encoder (required).
        tgt_mask : torch.Tensor
            The mask for the tgt sequence (optional).
        memory_mask : torch.Tensor
            The mask for the memory sequence (optional).
        tgt_key_padding_mask : torch.Tensor
            The mask for the tgt keys per batch (optional).
        memory_key_padding_mask : torch.Tensor
            The mask for the memory keys per batch (optional).
        pos_embs_tgt : torch.Tensor
            The positional embeddings for the target (optional).
        pos_embs_src : torch.Tensor
            The positional embeddings for the source (optional).
        )r   r   r   r   r   r   )r   r   r   )r>   r   r   r   r   r   r   r   r   r   
self_attnsmultihead_attns	dec_layerr   r   rC   rC   rD   rG     s"   





zTransformerDecoder.forwardr   )	rH   rI   rJ   rK   rL   rM   r3   rG   rT   rC   rC   rA   rD   r<   I  s"    +&r<   c                       s(   e Zd ZdZ fddZdd Z  ZS )NormalizedEmbeddingau  This class implements the normalized embedding layer for the transformer.
    Since the dot product of the self-attention is always normalized by sqrt(d_model)
    and the final linear projection for prediction shares weight with the embedding layer,
    we multiply the output of the embedding by sqrt(d_model).

    Arguments
    ---------
    d_model: int
        The number of expected features in the encoder/decoder inputs (default=512).
    vocab: int
        The vocab size.

    Example
    -------
    >>> emb = NormalizedEmbedding(512, 1000)
    >>> trg = torch.randint(0, 999, (8, 50))
    >>> emb_fea = emb(trg)
    c                    s*   t    tjjj||dd| _|| _d S )Nr   )num_embeddingsembedding_dimblank_id)r2   r3   r   r   	embedding	Embeddingembr(   )r>   r(   vocabrA   rC   rD   r3     s
   

zNormalizedEmbedding.__init__c                 C   s   |  |t| j S )z:Processes the input tensor x and returns an output tensor.)r   ra   sqrtr(   rl   rC   rC   rD   rG     s   zNormalizedEmbedding.forwardrn   rC   rC   rA   rD   r     s    r   c                 C   sr   t | jdkr| j\}}}}| |||| } | || j}t | jdkr5| jdd }|	 S |	 S )ai  Creates a binary mask to prevent attention to padded locations.
    We suggest using ``get_mask_from_lengths`` instead of this function.

    Arguments
    ---------
    padded_input: torch.Tensor
        Padded input.
    pad_idx: int
        idx for padding element.

    Returns
    -------
    key_padded_mask: torch.Tensor
        Binary mask to prevent attention to padding.

    Example
    -------
    >>> a = torch.LongTensor([[1,1,0], [2,3,0], [4,5,0]])
    >>> get_key_padding_mask(a, pad_idx=0)
    tensor([[False, False,  True],
            [False, False,  True],
            [False, False,  True]])
       rU   )dim)
r   shapereshapeeqtodevicer_   prodrQ   rk   )padded_inputpad_idxbztimech1ch2key_padded_maskrC   rC   rD   get_key_padding_mask  s   r   c                 C   sh   | j d }ttj||f| jddkdd}| |dktd|dktd}| 	| jS )a  Creates a binary mask for each sequence which masks future frames.

    Arguments
    ---------
    padded_input: torch.Tensor
        Padded input tensor.

    Returns
    -------
    mask : torch.Tensor
        Binary mask for masking future frames.

    Example
    -------
    >>> a = torch.LongTensor([[1,1,0], [2,3,0], [4,5,0]])
    >>> get_lookahead_mask(a)
    tensor([[0., -inf, -inf],
            [0., 0., -inf],
            [0., 0., 0.]])
    r   )r   r   z-infr   )
r   r[   triuonesr   	transposer_   masked_fillrk   r   )r   seq_lenmaskrC   rC   rD   get_lookahead_mask  s   
r   c                 C   s@   |du rt |  }t j|| j| jd}|d| dk  S )as  Creates a binary mask from sequence lengths

    Arguments
    ---------
    lengths: torch.Tensor
        A tensor of sequence lengths
    max_len: int (Optional)
        Maximum sequence length, defaults to None.

    Returns
    -------
    mask: torch.Tensor
        the mask where padded elements are set to True.
        Then one can use tensor.masked_fill_(mask, 0) for the masking.

    Example
    -------
    >>> lengths = torch.tensor([3, 2, 4])
    >>> get_mask_from_lengths(lengths)
    tensor([[False, False, False,  True],
            [False, False,  True,  True],
            [False, False, False, False]])
    N)r   dtyper   r   )r[   maxitemr]   r   r   r^   )lengthsrZ   	seq_rangerC   rC   rD   get_mask_from_lengths-  s   
r   )N) rK   ra   typingr   r[   torch.nnrL   speechbrainr   speechbrain.nnet.activationsr   speechbrain.nnet.attentionr   speechbrain.nnet.CNNr   speechbrain.utils.checkpointsr   Branchformerr   	Conformerr	   rS   r
   r5   ro   r9   r   r<   r   r   r   r   rC   rC   rC   rD   <module>   s4     e6 3 . 9{ '"