""" Halo Self Attention

Paper: `Scaling Local Self-Attention for Parameter Efficient Visual Backbones`
    - https://arxiv.org/abs/2103.12731

@misc{2103.12731,
Author = {Ashish Vaswani and Prajit Ramachandran and Aravind Srinivas and Niki Parmar and Blake Hechtman and
    Jonathon Shlens},
Title = {Scaling Local Self-Attention for Parameter Efficient Visual Backbones},
Year = {2021},
}

Status:
This impl is a WIP, there is no official ref impl and some details in the paper weren't clear to me.
The attention mechanism works but it's slow as implemented.

Hacked together by / Copyright 2021 Ross Wightman
"""
from typing import List

import torch
from torch import nn
import torch.nn.functional as F

from .helpers import make_divisible
from .weight_init import trunc_normal_
from .trace_utils import _assert


def rel_logits_1d(q, rel_k, permute_mask: List[int]):
    """ Compute relative logits along one dimension

    As per: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2
    Originally from: `Attention Augmented Convolutional Networks` - https://arxiv.org/abs/1904.09925

    Args:
        q: (batch, height, width, dim)
        rel_k: (2 * window - 1, dim)
        permute_mask: permute output dim according to this
    """
    B, H, W, dim = q.shape
    rel_size = rel_k.shape[0]
    win_size = (rel_size + 1) // 2

    x = (q @ rel_k.transpose(-1, -2))
    x = x.reshape(-1, W, rel_size)

    # pad to shift from relative to absolute indexing
    x_pad = F.pad(x, [0, 1]).flatten(1)
    x_pad = F.pad(x_pad, [0, rel_size - W])

    # reshape and slice out the padded elements
    x_pad = x_pad.reshape(-1, W + 1, rel_size)
    x = x_pad[:, :W, win_size - 1:]

    # reshape and tile
    x = x.reshape(B, H, 1, W, win_size).expand(-1, -1, win_size, -1, -1)
    return x.permute(permute_mask)
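
# Editor's note on the skew trick above (illustrative comment, not in the original
# source): after the matmul, each of the B * H rows holds W * rel_size relative
# logits. Padding one column, flattening, padding rel_size - W more values, and
# reshaping to (W + 1, rel_size) shifts row i by i positions, so the final
# [:, :W, win_size - 1:] slice reads off win_size absolute-position logits per query.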
 

class PosEmbedRel(nn.Module):
    """ Relative Position Embedding
    As per: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2
    Originally from: `Attention Augmented Convolutional Networks` - https://arxiv.org/abs/1904.09925

    """
    def __init__(self, block_size, win_size, dim_head, scale):
        """
        Args:
            block_size (int): block size
            win_size (int): neighbourhood window size
            dim_head (int): attention head dim
            scale (float): scale factor (for init)
        """
        super().__init__()
        self.block_size = block_size
        self.dim_head = dim_head
        self.height_rel = nn.Parameter(torch.randn(win_size * 2 - 1, dim_head) * scale)
        self.width_rel = nn.Parameter(torch.randn(win_size * 2 - 1, dim_head) * scale)

    def forward(self, q):
        B, BB, HW, _ = q.shape

        # relative logits in width dimension
        q = q.reshape(-1, self.block_size, self.block_size, self.dim_head)
        rel_logits_w = rel_logits_1d(q, self.width_rel, permute_mask=(0, 1, 3, 2, 4))

        # relative logits in height dimension
        q = q.transpose(1, 2)
        rel_logits_h = rel_logits_1d(q, self.height_rel, permute_mask=(0, 3, 1, 4, 2))

        rel_logits = rel_logits_h + rel_logits_w
        rel_logits = rel_logits.reshape(B, BB, HW, -1)
        return rel_logits
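
# Editor's shape note (illustrative comment, not in the original source): as used by
# HaloAttn below, forward() receives q of shape (B * num_heads, num_blocks,
# block_size ** 2, dim_head) and returns relative position logits of shape
# (B * num_heads, num_blocks, block_size ** 2, win_size ** 2), one bias per
# (query position, neighbourhood position) pair, added to the q @ k logits.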


class HaloAttn(nn.Module):
    """ Halo Attention

    Paper: `Scaling Local Self-Attention for Parameter Efficient Visual Backbones`
        - https://arxiv.org/abs/2103.12731

    The internal dimensions of the attention module are controlled by the interaction of several arguments.
      * the output dimension of the module is specified by dim_out, which falls back to input dim if not set
      * the value (v) dimension is set to dim_out // num_heads, the v projection determines the output dim
      * the query and key (qk) dimensions are determined by
        * num_heads * dim_head if dim_head is not None
        * num_heads * (dim_out * qk_ratio // num_heads) if dim_head is None
      * as seen above, qk_ratio determines the ratio of q and k relative to the output if dim_head not used

    Args:
        dim (int): input dimension to the module
        dim_out (int): output dimension of the module, same as dim if not set
        feat_size (Tuple[int, int]): size of input feature_map (not used, for arg compat with bottle/lambda)
        stride: output stride of the module, query downscaled if > 1 (default: 1).
        num_heads: parallel attention heads (default: 8).
        dim_head: dimension of query and key heads, calculated from dim_out * qk_ratio // num_heads if not set
        block_size (int): size of blocks. (default: 8)
        halo_size (int): size of halo overlap. (default: 3)
        qk_ratio (float): ratio of q and k dimensions to output dimension when dim_head not set. (default: 1.0)
        qkv_bias (bool): add bias to q, k, and v projections
        avg_down (bool): use average pool downsample instead of strided query blocks
        scale_pos_embed (bool): scale the position embedding as well as Q @ K
    """
    def __init__(
            self, dim, dim_out=None, feat_size=None, stride=1, num_heads=8, dim_head=None,
            block_size=8, halo_size=3, qk_ratio=1.0, qkv_bias=False, avg_down=False, scale_pos_embed=False):
        super().__init__()
        dim_out = dim_out or dim
        assert dim_out % num_heads == 0
        assert stride in (1, 2)
        self.num_heads = num_heads
        self.dim_head_qk = dim_head or make_divisible(dim_out * qk_ratio, divisor=8) // num_heads
        self.dim_head_v = dim_out // self.num_heads
        self.dim_out_qk = num_heads * self.dim_head_qk
        self.dim_out_v = dim_out
        self.scale = self.dim_head_qk ** -0.5
        self.scale_pos_embed = scale_pos_embed
        self.block_size = self.block_size_ds = block_size
        self.halo_size = halo_size
        self.win_size = block_size + halo_size * 2  # neighbourhood window size
        self.block_stride = 1
        use_avg_pool = False
        if stride > 1:
            use_avg_pool = avg_down or block_size % stride != 0
            self.block_stride = 1 if use_avg_pool else stride
            self.block_size_ds = self.block_size // self.block_stride

        self.q = nn.Conv2d(dim, self.dim_out_qk, 1, stride=self.block_stride, bias=qkv_bias)
        self.kv = nn.Conv2d(dim, self.dim_out_qk + self.dim_out_v, 1, bias=qkv_bias)

        self.pos_embed = PosEmbedRel(
            block_size=self.block_size_ds, win_size=self.win_size, dim_head=self.dim_head_qk, scale=self.scale)

        self.pool = nn.AvgPool2d(2, 2) if use_avg_pool else nn.Identity()

        self.reset_parameters()

    def reset_parameters(self):
        std = self.q.weight.shape[1] ** -0.5  # fan-in
        trunc_normal_(self.q.weight, std=std)
        trunc_normal_(self.kv.weight, std=std)
        trunc_normal_(self.pos_embed.height_rel, std=self.scale)
        trunc_normal_(self.pos_embed.width_rel, std=self.scale)

    def forward(self, x):
        B, C, H, W = x.shape
        _assert(H % self.block_size == 0, '')
        _assert(W % self.block_size == 0, '')
        num_h_blocks = H // self.block_size
        num_w_blocks = W // self.block_size
        num_blocks = num_h_blocks * num_w_blocks

        q = self.q(x)
        # unfold queries into non-overlapping blocks
        q = q.reshape(
            -1, self.dim_head_qk, num_h_blocks, self.block_size_ds,
            num_w_blocks, self.block_size_ds).permute(0, 1, 3, 5, 2, 4)
        q = q.reshape(B * self.num_heads, self.dim_head_qk, -1, num_blocks).transpose(1, 3)
        # B * num_heads, num_blocks, block_size ** 2, dim_head_qk

        kv = self.kv(x)
        # generate overlapping, padded blocks of k and v (the halo); this is the slow part
        kv = F.pad(kv, [self.halo_size, self.halo_size, self.halo_size, self.halo_size])
        kv = kv.unfold(2, self.win_size, self.block_size).unfold(3, self.win_size, self.block_size).reshape(
            B * self.num_heads, self.dim_head_qk + self.dim_head_v, num_blocks, -1).permute(0, 2, 3, 1)
        k, v = torch.split(kv, [self.dim_head_qk, self.dim_head_v], dim=-1)
        # B * num_heads, num_blocks, win_size ** 2, dim_head_qk or dim_head_v

        if self.scale_pos_embed:
            attn = (q @ k.transpose(-1, -2) + self.pos_embed(q)) * self.scale
        else:
            attn = (q @ k.transpose(-1, -2)) * self.scale + self.pos_embed(q)
        # B * num_heads, num_blocks, block_size ** 2, win_size ** 2
        attn = attn.softmax(dim=-1)

        out = (attn @ v).transpose(1, 3)  # B * num_heads, dim_head_v, block_size ** 2, num_blocks
        # fold blocks back into a spatial feature map
        out = out.reshape(-1, self.block_size_ds, self.block_size_ds, num_h_blocks, num_w_blocks)
        out = out.permute(0, 3, 1, 4, 2).contiguous().view(
            B, self.dim_out_v, H // self.block_stride, W // self.block_stride)
        out = self.pool(out)
        return out
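

if __name__ == '__main__':
    # Editor's smoke-test sketch, not part of the original module. Shapes and
    # hyperparameters are illustrative only. Because of the relative imports above,
    # run it with package context, e.g. `python -m timm.layers.halo_attn`.
    x = torch.randn(2, 64, 32, 32)

    # stride 1: output keeps the input resolution, channels become dim_out
    attn = HaloAttn(dim=64, dim_out=128, num_heads=8, block_size=8, halo_size=3)
    print(attn(x).shape)  # torch.Size([2, 128, 32, 32])

    # stride 2: the query path is downscaled, halving the output resolution
    attn_s2 = HaloAttn(dim=64, dim_out=128, num_heads=8, block_size=8, halo_size=3, stride=2)
    print(attn_s2(x).shape)  # torch.Size([2, 128, 16, 16])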