o
    }oip                     @   s   d dl Z d dlmZ d dlmZmZmZmZmZm	Z	 d dl
Z
d dlm  mZ d dlmZ G dd de
jjZG dd de
jjZG d	d
 d
e
jjZG dd deZG dd deZG dd de
jjZG dd de
jjZdS )    N)abstractmethod)CallableDictListOptionalTupleUnion)loggingc                       sX   e Zd Z						ddedededed	ee d
ededef fddZdd Z  ZS )ConvolutionLayer   NTFin_channelsout_channelskernel_sizestridepaddingdilationbias	is_causalc	           	   	      s   t    d| _|r)|d | df| _|dur&t|  d| d| j d d}n|du rA|d dkr7tdt||d  d }|| _|| _|| _	t
jj|||||||d	| _dS )
a  
        A convolutional layer that supports causal convolutions with padding. Replaces the standard MLP layer used in
        the original transformer.

        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            kernel_size (int): Size of the convolving kernel.
            stride (int): Stride of the convolution.
            padding (Optional[int]): Padding added to both sides of the input. If None, it's calculated automatically.
            dilation (int): Spacing between kernel elements.
            bias (bool): If True, adds a learnable bias to the output.
            is_causal (bool): If True, uses causal convolution.
        Nr   r   z@ was initialized with is_causal set to True, and padding set to z8. The provided padding value will be ignored and set to .   z1`kernel_size` must be odd when `padding` is None.)r   r   r   r   r   )super__init__causal_paddingr	   warning
ValueErrorintr   r   r   torchnnConv1dconv)	selfr   r   r   r   r   r   r   r   	__class__ a/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/tts/modules/transformer_2501.pyr      s6   
zConvolutionLayer.__init__c                 C   s"   | j r
t|| j}| |}|S N)r   Fpadr   r   )r    signalconv_signalr#   r#   r$   forwardU   s   
zConvolutionLayer.forward)r   r   Nr   TF)	__name__
__module____qualname__r   r   boolr   r*   __classcell__r#   r#   r!   r$   r
      s2    	:r
   c                       sV   e Zd Zdddejjddfdededed	ed
edede	f fddZ
dd Z  ZS )PositionwiseConvFFr   FTtanhapproximated_modeld_ffn	p_dropoutr   r   r   non_linearityc                    sP   t    || _|| _t|||||d| _t|||||d| _tj	|| _
dS )aV  
        Positionwise Convolutional Feed-Forward layer to replace the MLP layer in transformers.

        Module will take the input with d_model hidden state, project it to d_ffn hidden dimension, perform nonlinear
        transformation, and project the state back into d_model hidden dimension. Finally, it applied dropout.

        Args:
            d_model (int): Input and output dimension of the model.
            d_ffn (int): Hidden dimension of the feed-forward network (usually 4 * d_model).
            p_dropout (float): Dropout probability.
            kernel_size (int): Size of the convolving kernel.
            bias (bool): If True, adds a learnable bias to the convolution layers.
            is_causal (bool): If True, uses causal convolution.
            non_linearity (Callable): Activation function to use (default: GELU).
        )r   r   r   N)r   r   r4   r7   r
   projo_netr   r   Dropoutdropout)r    r4   r5   r6   r   r   r   r7   r!   r#   r$   r   _   s   
zPositionwiseConvFF.__init__c                 C   s4   |  | |dd}| | |dd}|S )z
        x (B, T, C)
        r   r   )r7   r8   	transposer;   r9   )r    xr#   r#   r$   r*      s   zPositionwiseConvFF.forward)r+   r,   r-   r   r   GELUr   floatr.   r   r   r*   r/   r#   r#   r!   r$   r0   ^   s*    "r0   c                       sP  e Zd Z	ddedededef fddZe			dd	ej	d
e
ej	 de
ej	 de
ej	 fddZedeee
eeej	f  f fddZddefddZ				dd	ej	d
e
ej	 de
ej	 de
ej	 de
ej	 deej	eej	 f fddZ				dd	ej	d
e
ej	 de
ej	 de
ej	 de
ej	 deej	eej	 f fddZ  ZS )	AttentionTn_headsr4   r6   r   c                    s   t    || dksJ d|| | _|| _|| _| jd | _|| _tjj	|| j |dd| _
tj|| _d| _|  | _dS )aN  
        Base Attention parent class. Users should not be instantiating this class, but rather use SelfAttention or
        CrossAttention classes as appropriate.
        Does DotProductionAttention and additionally dropout inside the module. The class does not currently support
        RoPE nor ALiBi.

        Args:
            n_heads (int): Number of attention heads.
            d_model (int): Dimension of the model.
            p_dropout (float): Dropout probability.
            is_causal (bool): Whether to use causal attention. Only supported when used in SelfAttention.
        r   zd_model % n_head != 0g      Fr   N)r   r   d_headrA   r4   scaler   r   r   Linearr9   r:   r;   	use_cache_init_cachecache)r    rA   r4   r6   r   r!   r#   r$   r      s   

zAttention.__init__Nquery
query_maskmemorymemory_maskc                 C   s   d S r%   r#   )r    rI   rJ   rK   rL   r#   r#   r$   compute_qkv_and_mask   s   zAttention.compute_qkv_and_maskreturnc                   C   s   dd d d d d dS )NF)is_initializedself_kself_vcross_kvcross_kcross_vr#   r#   r#   r#   r$   rG      s   zAttention._init_cacheFrF   c                 C   s   || _ |  | _d S r%   )rF   rG   rH   r    rF   r#   r#   r$   reset_cache   s   zAttention.reset_cache
attn_priorc                 C   s  | j r+| jd r&|d d dd d d f }|d ur#|d d dd f nd }nd| jd< | j||||d\}}}}	|dd}|dd}|dd}|j\}
}}t||dd| j }|	d urk||	dkt	d	 | j
r|| jd
d |d |f dkt	d	 |d urd}|d d d |f }t|| }|d d d f d| jdd}tj|dd| }tj|dd}ntj|dd}|	d ur||	dkd}| |}t||}|dd |
|d}|||gfS )NrO   T)rI   rJ   rK   rL   r   r      r   z-inf.g:0yE>dim        )rF   rH   rM   r<   shaper   matmulrD   masked_fill_r?   r   causal_masklogrepeatrA   r&   log_softmaxsoftmaxmasked_fillr;   
contiguousview)r    rI   rJ   rK   rL   rW   qkvmaskBT_
attn_scoreepsattn_score_log	attn_probyr#   r#   r$   
attn_naive   s>   
"
(
zAttention.attn_naivec                 C   s.   |  |||||\}}| | |}||fS )a  
        Forward pass of the Attention module.

        Args:
            query (torch.Tensor): Input tensor of shape (B, T1, C).
            query_mask (Optional[torch.Tensor]): Mask for query tensor of shape (B, T1).
            memory (Optional[torch.Tensor]): Memory tensor for cross-attention of shape (B, T2, C).
            memory_mask (Optional[torch.Tensor]): Mask for memory tensor of shape (B, T2).
            attn_prior (Optional[torch.Tensor]): Prior attention weights of shape (B, T1, T2).

        Returns:
            Tuple[torch.Tensor, List[torch.Tensor]]:
                - y: Attention module tensor output of shape (B, T1, C).
                - attn_prob: List containing attention probabilities and scores. returned only in attn_naive.
                    [0]: Attention probabilities used for logging during validation.
                    [1]: Attention scores used for CTC loss (only in naive attention).
        )rt   r;   r9   )r    rI   rJ   rK   rL   rW   rs   rr   r#   r#   r$   r*      s   zAttention.forward)TNNNFNNNN)r+   r,   r-   r   r?   r.   r   r   r   Tensorr   rM   staticmethodr   strr   rG   rV   r   r   rt   r*   r/   r#   r#   r!   r$   r@      sz    	&

8r@   c                       sn   e Zd Z		ddededededef
 fdd	Z	
	
	
ddejde	ej de	ej de	ej fddZ
  ZS )SelfAttentionT   rA   r4   r6   r   max_length_causal_maskc              	      s   t  j||||d |r/|du s|dk rtd| d| dtt||dd|| tjj	|d| | j
 d	d
| _dS )a  
        Implements SelfAttention. See parent class for forward implementation.

        Args:
            n_heads (int): Number of attention heads.
            d_model (int): Dimension of the model.
            p_dropout (float): Dropout probability.
            is_causal (bool): Whether to use causal attention. Only supported when used in SelfAttention.
            max_length_causal_mask (int): Maximum sequence length for Attention module.
        rA   r4   r6   r   Nr   zUSelf Attention was called with is_causal True, but received an inappropriate valueof z for max_length_causal_maskr`   r   rY   FrB   )r   r   r   register_bufferr   trilonesrg   r   rE   rC   qkv_net)r    rA   r4   r6   r   r}   r!   r#   r$   r     s(   "zSelfAttention.__init__NrI   rJ   rK   rL   c                 C   s   |j \}}}| |||d| j| j}|jddd\}	}
}|	d|
d|d}	}
}| jr[| jd d urQt	j
| jd |
gdd}
t	j
| jd |gdd}|
| jd< || jd< |d urk|d d d d d d f nd }|	|
||fS )NrY   r   rZ   rP   r   rQ   )r]   r   reshaperA   rC   chunksqueezerF   rH   r   cat)r    rI   rJ   rK   rL   rl   rm   rn   qkvrh   ri   rj   rk   r#   r#   r$   rM   ?  s   "

$z"SelfAttention.compute_qkv_and_mask)Tr|   ru   )r+   r,   r-   r   r?   r.   r   r   rx   r   rM   r/   r#   r#   r!   r$   r{     s4    )r{   c                
       sd   e Zd Zdedededef fddZ			ddejd	eej d
eej deej fddZ	  Z
S )CrossAttentionrA   r4   d_memoryr6   c                    s`   t  j|||dd |du rtdtjj||| j dd| _tjj|d| | j dd| _dS )al  
        Implements CrossAttention. See parent class for forward implementation. Must be non-causal.

        Args:
            n_heads (int): Number of attention heads.
            d_model (int): Dimension of the model.
            d_memory (int): Dimension of the conditioning / cross-attention input.
            p_dropout (float): Dropout probability.
        Fr~   Nz-d_memory must be provided for cross-attentionrB   r   )	r   r   r   r   r   rE   rC   q_netkv_net)r    rA   r4   r   r6   r!   r#   r$   r   U  s   "zCrossAttention.__init__NrI   rJ   rK   rL   c                 C   s  |j \}}}|j \}}	}| |||| j| j}
| jr)| jd d ur)| jd }n| |||	d| j| j}| jrL| jd d urL| jd }| jd }n&|jddd\}}|	d|	d}}| jrr|| jd< || jd< || jd< |d ur|d d d d f nd }|
|||fS )NrR   r   rS   rT   rZ   )
r]   r   r   rA   rC   rF   rH   r   r   r   )r    rI   rJ   rK   rL   BqTqrn   BkvTkvrh   kvri   rj   rk   r#   r#   r$   rM   p  s"   



z#CrossAttention.compute_qkv_and_maskru   )r+   r,   r-   r   r?   r   r   rx   r   rM   r/   r#   r#   r!   r$   r   T  s,    r   c                       s   e Zd Zdddddejjddfdededed	ed
edede	e de	e dededede
f fddZedefddZd!ddZ			d"dejdejde	ej de	ej de	ej defdd Z  ZS )#TransformerLayerNTr|   r1   r2   r4   r5   
sa_n_headsr   r6   	has_xattnxa_d_memory
xa_n_headsr   apply_norm_to_condr}   conv_non_linearityc                    s   t    || _tjj|dd| _t|||||	d| _| jr?|
| _	tjj|dd| _
t||||d| _| j	r?tjj|dd| _tjj|dd| _t|||||	|d| _d| _|  | _dS )af  
        One layer of the Transformer.
        Args:
            d_model <int>: Model dimension
            d_ffn <int>: Feed forward dimension (usually 4*d_model)
            sa_n_heads <int>: Number of attention heads used in self-attention
            kernel_size <int>: Convolution kernel size for FFN
            p_dropout <float>: Dropout probability
            has_xattn <bool>: Whether to use cross attention
            xa_d_memory <int>: Hidden dimension for cross attention
            xa_n_heads <int>: Number of attention heads used in cross attention
            is_causal <bool>: Whether to use causal attention
            apply_norm_to_cond <bool>: Whether to apply normalization to conditioning tensor
            max_length_causal_mask <int>: Maximum length of causal mask
            conv_non_linearity <Callable>: Convolution non-linearity
        FrB   )rA   r4   r6   r}   r   )rA   r4   r   r6   )r   r   r7   N)r   r   r   r   r   	LayerNorm	norm_selfr{   self_attentionr   norm_xattn_queryr   cross_attentionnorm_xattn_memorynorm_pos_ffr0   pos_ffrF   rG   rH   )r    r4   r5   r   r   r6   r   r   r   r   r   r}   r   r!   r#   r$   r     s6   
zTransformerLayer.__init__rN   c                   C   s   d d d dS )N)self_attn_outputcross_attn_outputrK   r#   r#   r#   r#   r$   rG     s   zTransformerLayer._init_cacheFc                 C   s6   || _ |  | _| j| | jr| j| d S d S r%   )rF   rG   rH   r   rV   r   r   rU   r#   r#   r$   rV     s   
zTransformerLayer.reset_cacher=   x_maskcond	cond_maskrW   c                 C   sL  || d }| j| ||d\}}| jr.| jd dur)tj| jd |gdd}|| jd< || }d}| jr|dur| |}	| jrP| jd durP| jd }
n| j	rX| 
|n|}
| jrb|
| jd< | j|	||
||d\}}| jr| jd	 durtj| jd	 |gdd}|| jd	< || }|| | | }|| d }|||d
dS )a  
        Args:
            x <torch tensor> (B, T1, C): Input tensor
            x_mask <bool mask> (B, T1): Multiplicative mask where True means we keep the input, False we zero it out.
                Mask for self attention input.
            cond <torch tensor> (B, T2, C): Conditioning tensor
            cond_mask <bool mask> (B, T2): Multiplicative mask where True means we keep the input, False we zero
                it out. Mask for cross attention input if it exists.

        Returns dict with keys
            output <torch tensor> (B, T1, C): Output tensor
            attn_probabilities <dict>: Attention probabilities
        rX   )rI   rJ   r   Nr   rZ   rK   )rI   rJ   rK   rL   rW   r   )self_attn_probabilitiescross_attn_probabilitiesoutputattn_probabilities)	unsqueezer   r   rF   rH   r   r   r   r   r   r   r   r   r   )r    r=   r   r   r   rW   x_s_attn_probx_attn_probx_normedrK   x_resr#   r#   r$   r*     s8   





zTransformerLayer.forwardrv   ru   )r+   r,   r-   r   r   r>   r   r?   r.   r   r   r   ry   r   rG   rV   rx   r*   r/   r#   r#   r!   r$   r     sf    		
@
r   c                !       s  e Zd Zddddddddddejjddfded	ed
edededededede	e de	e dededededede
f  fddZd*ddZedd Zedede	eejeej f  d e	eejeej f  d!e	eejeej f  d"e	ee	e   d#ee	ej e	ej e	ej f fd$d%Z				d+d&ejd'ejde	eejeej f  d e	eejeej f  d!e	eejeej f  d"e	ee	e   d#eeeejef f fd(d)Z  ZS ),Transformerr\   FNTr|   r1   r2   n_layersr4   r5   r   r   r6   p_dropout_outr   r   r   r   r   apply_norm_outr}   use_learnable_pos_embr   c                    s<  |r|	du s
|
du rt dt   tj|| _|| _| jdkr+tj| j| _nd| _|| _	| j	r>tjj
|dd| _nd| _tj | _t|D ]}| jt|||||||	|
||||d qK|| _d| _| jrstj||| _| | j |  D ]\}}d|v r|drtjjj|dd	td
|  d q}dS )a  
        Initializes a stack of transformer layers. Can be used for both encoder and decoder.
        Set is_causal is True for autoregressive models. Equivalent to TransformerBlock from Megatron-LM
        Args:
            n_layers <int>: Number of transformer layers
            d_model <int>: Model dimension
            d_ffn <int>: Feed forward dimension (usually 4*d_model)
            sa_n_heads <int>: Number of attention heads used in self-attention
            kernel_size <int>: Convolution kernel size for FFN
            p_dropout <float>: Dropout probability
            p_dropout_out <float>: Dropout probability for output
            has_xattn <bool>: Whether to use cross attention
            xa_d_memory <int>: Hidden dimension for cross attention; required if has_xattn is True
            xa_n_heads <int>: Number of attention heads used in cross attention; required if has_xattn is True
            is_causal <bool>: Whether to make attention and the convolution feedforward networks causal.
            apply_norm_to_cond <bool>: Whether to apply normalization to conditioning tensor; conditioning tensor being
                the input to the memory part of cross-attention.
            apply_norm_out <bool>: Whether to apply normalization to output
            max_length_causal_mask <int>: Maximum length of causal mask
            use_learnable_pos_emb <bool>: Whether to add a learnable positionable embedding inside the class
            conv_non_linearity <Callable>: Convolution non-linearity
        NzWIt requires that `xa_d_memory` and `xa_n_heads` are specified when `has_xattn` is True!r\   FrB   )r4   r5   r   r   r6   r   r   r   r   r   r}   r   r9   weight{Gz?r   meanstd)r   r   r   r   r   r:   r;   r   dropout_outr   r   norm_out
ModuleListlayersrangeappendr   r   position_embeddings	Embeddingapply_init_weights_gpt2named_parametersendswithinitnormal_mathsqrt)r    r   r4   r5   r   r   r6   r   r   r   r   r   r   r   r}   r   r   rn   nameparamr!   r#   r$   r     sP   )

"zTransformer.__init__c                 C   s   | j D ]}|| qd S r%   )r   rV   )r    rF   layerr#   r#   r$   rV   s  s   
zTransformer.reset_cachec                 C   sf   t | tjjtjjtjjfrtjjj| jddd t | tjjr/| j	d ur1tjj
| j	 d S d S d S )Nr\   r   r   )
isinstancer   r   rE   r   r   r   r   r   r   zeros_)moduler#   r#   r$   r   w  s
   zTransformer._init_weights_gpt2idxr   r   rW   multi_encoder_mappingrN   c                 C   s^   |d ur*||  d u rdS |||   |d ur|||   nd |d ur'|||   fS d fS |||fS )Nru   r#   )r   r   r   rW   r   r#   r#   r$   _get_layer_inputs  s   

zTransformer._get_layer_inputsr=   r   c                 C   s   t |trt| jt|k rtdt| j dt| d| jr6tj|d|j	d
d}|| | }g }| |}t| jD ]$\}	}
| |	||||\}}}|
|||||d}|d }||d	  qB| jd
urq| |}| jd
ur{| |}||dS )a  
        Args:
            x <torch tensor> (B, T1, C):
            x_mask <bool mask> (B, T1): Multiplicative mask where True means we keep the input, False we zero it out.
                Mostly used in non-causal self-attention to zero out padding values. In causal self-attention, the
                causal mask will be used in place of this.
            cond <torch tensor> (B, T2, C) or list of such tensors (from different encoders)
            cond_mask <bool mask> (B, T2): Multiplicative mask where True means we keep the input, False we zero it
                out or list of such tensors (from different encoders) output <torch tensor> (B, T1, C)
            multi_encoder_mapping <list> <int>: None or Same size as n_layers, value indicates which cond input to use
                for this layer

        Returns dict with keys:
            output <torch tensor> (B, T1, C): Output tensor
            attn_probabilities <list>: Attention probabilities of each layer
        znInsufficient Transformer layers for multiple conditionals. Each layer must cross-attend one conditional.Found z layers for z conditionals.r   )devicer   )rW   r   r   Nr   )r   listlenr   r   r   r   arangesizer   r   r   r;   	enumerater   r   r   r   )r    r=   r   r   r   rW   r   	positionsr   r   r   _cond
_cond_mask_attn_priorout_dictr#   r#   r$   r*     s2   







zTransformer.forwardrv   rw   )r+   r,   r-   r   r   r>   r   r?   r.   r   r   r   rV   ry   r   r   rx   r   r   r   r   rz   r*   r/   r#   r#   r!   r$   r     s    	

Z
r   )r   abcr   typingr   r   r   r   r   r   r   torch.nn.functionalr   
functionalr&   
nemo.utilsr	   Moduler
   r0   r@   r{   r   r   r   r#   r#   r#   r$   <module>   s    D, <: 