o
    پi~!                     @   s   d dl mZmZmZ d dlZd dlmZ d dlmZ ddl	m
Z
 ddlmZ ddejd	eej fd
dZG dd dejZG dd dejZdS )    )FinalOptionalTypeN)nn)
functional   )use_fused_attn)apply_rot_embed_catscores	attn_maskc                 C   s   |d u r| S | | S N )r
   r   r   r   I/home/ubuntu/.local/lib/python3.10/site-packages/timm/layers/attention.pymaybe_add_mask   s   r   c                       s   e Zd ZU dZee ed< 								dded	ed
edededededede	e
ej  ddf fddZ	ddejde	ej dejfddZ  ZS )	Attentiona  Standard Multi-head Self Attention module with QKV projection.

    This module implements the standard multi-head attention mechanism used in transformers.
    It supports both the fused attention implementation (scaled_dot_product_attention) for
    efficiency when available, and a manual implementation otherwise. The module includes
    options for QK normalization, attention dropout, and projection dropout.
    
fused_attn   FT        Ndim	num_headsqkv_biasqk_norm
scale_norm	proj_bias	attn_drop	proj_drop
norm_layerreturnc
           
         s   t    || dksJ d|s|r|	dusJ d|| _|| | _| jd | _t | _tj||d |d| _	|r?|	| jnt
 | _|rK|	| jnt
 | _t|| _|r\|	|nt
 | _tj|||d| _t|| _dS )ag  Initialize the Attention module.

        Args:
            dim: Input dimension of the token embeddings
            num_heads: Number of attention heads
            qkv_bias: Whether to use bias in the query, key, value projections
            qk_norm: Whether to apply normalization to query and key vectors
            proj_bias: Whether to use bias in the output projection
            attn_drop: Dropout rate applied to the attention weights
            proj_drop: Dropout rate applied after the output projection
            norm_layer: Normalization layer constructor for QK normalization if enabled
        r   z$dim should be divisible by num_headsN<norm_layer must be provided if qk_norm or scale_norm is True         bias)super__init__r   head_dimscaler   r   r   LinearqkvIdentityq_normk_normDropoutr   normprojr   )
selfr   r   r   r   r   r   r   r   r   	__class__r   r   r$      s   

zAttention.__init__xr   c                 C   s   |j \}}}| |||d| j| jddddd}|d\}}}	| || |}}| j	rCt
j|||	|| jr>| jjndd}n!|| j }||dd	 }
t|
|}
|
jd	d
}
| |
}
|
|	 }|dd|||}| |}| |}| |}|S )Nr       r   r      r   r   	dropout_pr   )shaper(   reshaper   r%   permuteunbindr*   r+   r   Fscaled_dot_product_attentiontrainingr   pr&   	transposer   softmaxr-   r.   r   )r/   r2   r   BNCr(   qkvattnr   r   r   forwardB   s*   *





zAttention.forward)r   FFFTr   r   Nr   )__name__
__module____qualname____doc__r   bool__annotations__intfloatr   r   r   Moduler$   torchTensorrK   __classcell__r   r   r0   r   r      sP   
 	
,r   c                       s   e Zd ZU dZejje ed< 										dd	e	d
e	dedede	de
de
dee	 deej dedef fddZ		ddeej deej fddZ  ZS )AttentionRopez A Self Attention module with ROPE support.

    Includes options for:
     * QK normalization option
     * Attention output (scale) normalization
     * Fused or unfused QKV projection support
    r   r   Tr   r   NFr   r   r   	qkv_fusednum_prefix_tokensr   r   attn_head_dimr   r   r   c                    s4  t    |s	|
r|	dusJ d|| _|| }|dur|}|| j }|d | _|| _t | _|rFtj||d |d| _	d | _
 | _| _nd| _	tj|||d| _
tj|||d| _tj|||d| _|
rj|	|nt | _|
ru|	|nt | _t|| _|r|	|nt | _t||| _t|| _dS )a  Initialize the Attention module.

        Args:
            dim: Input dimension of the token embeddings
            num_heads: Number of attention heads
            qkv_bias: Whether to add a bias term to the query, key, and value projections
            num_prefix_tokens: Number of reg/cls tokens at the beginning of the sequence that
                should not have position embeddings applied
            attn_drop: Dropout rate for attention weights
            proj_drop: Dropout rate for the output projection
            attn_head_dim: Dimension of each attention head (if None, computed as dim // num_heads)
            norm_layer: Normalization layer constructor to use for QK and scale normalization
            qk_norm: Enable normalization of query (Q) and key (K) vectors with norm_layer
            scale_norm: Enable normalization (scaling) of attention output with norm_layer
        Nr   r   r    r!   )r#   r$   r   r&   rZ   r   r   r   r'   r(   q_projk_projv_projr)   r*   r+   r,   r   r-   r.   r   )r/   r   r   r   rY   rZ   r   r   r[   r   r   r   r%   attn_dimr0   r   r   r$   k   s0   


zAttentionRope.__init__roper   c              	   C   s(  |j \}}}| jdur*| |}|||d| jdddddd}|d\}}	}
n0| |||| jddd}| |||| jddd}	| 	|||| jddd}
| 
|| |	}}	|dur| j}tj|ddddd|ddf t|dddd|dddf |gdd|
}tj|	ddddd|ddf t|	dddd|dddf |gdd|
}	| jrtj||	|
|| jr| jjnd	d
}n!|| j }||	dd }t||}|jdd}| |}||
 }|dd|||}| |}| |}| |}|S )a  Forward pass for the attention module.

        Args:
            x: Input tensor of shape (batch_size, sequence_length, embedding_dim)
            rope: Rotary position embeddings tensor for position-aware attention
            attn_mask: Optional attention mask to apply during attention computation

        Returns:
            Tensor of shape (batch_size, sequence_length, embedding_dim)
        Nr    r8   r3   r   r   r4   r9   r   r5   r7   )r:   r(   r;   r   r<   r=   r\   rB   r]   r^   r*   r+   rZ   rU   catr	   type_asr   r>   r?   r@   r   rA   r&   r   rC   r-   r.   r   )r/   r2   r`   r   rD   rE   rF   r(   rG   rH   rI   nptrJ   r   r   r   rK      s<   

"   VV





zAttentionRope.forward)
r   TTr   r   r   NNFF)NN)rL   rM   rN   rO   rU   jitr   rP   rQ   rR   rS   r   r   r   rT   r$   rV   rK   rW   r   r   r0   r   rX   a   sV   
 	
<rX   r   )typingr   r   r   rU   r   torch.nnr   r>   configr   pos_embed_sincosr	   rV   r   rT   r   rX   r   r   r   r   <module>   s    R