o
    پi                     @   s   d Z ddlmZ ddlZddlmZ ddlm  mZ ddl	m
Z
mZ ddlmZ ddlmZ dee fd	d
ZG dd dejZG dd dejZdS )aP   Bottleneck Self Attention (Bottleneck Transformers)

Paper: `Bottleneck Transformers for Visual Recognition` - https://arxiv.org/abs/2101.11605

@misc{2101.11605,
Author = {Aravind Srinivas and Tsung-Yi Lin and Niki Parmar and Jonathon Shlens and Pieter Abbeel and Ashish Vaswani},
Title = {Bottleneck Transformers for Visual Recognition},
Year = {2021},
}

Based on ref gist at: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2

This impl is a WIP but given that it is based on the ref gist likely not too far off.

Hacked together by / Copyright 2021 Ross Wightman
    )ListN   )	to_2tuplemake_divisible)trunc_normal_)_assertpermute_maskc           	      C   s   | j \}}}}| |dd }|d|d| d }t|ddgd}t|d|d g}|d|d d| d }|ddd||d df }|||d||dd|dd}||S )a   Compute relative logits along one dimension

    As per: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2
    Originally from: `Attention Augmented Convolutional Networks` - https://arxiv.org/abs/1904.09925

    Args:
        q: (batch, heads, height, width, dim)
        rel_k: (2 * width - 1, dim)
        permute_mask: permute output dim according to this
       r   r   N)shape	transposereshapeFpadflattenexpandpermute)	qrel_kr   BHWdimxx_pad r   O/home/ubuntu/.local/lib/python3.10/site-packages/timm/layers/bottleneck_attn.pyrel_logits_1d   s    
r   c                       s(   e Zd ZdZ fddZdd Z  ZS )PosEmbedRelz Relative Position Embedding
    As per: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2
    Originally from: `Attention Augmented Convolutional Networks` - https://arxiv.org/abs/1904.09925
    c                    sh   t    t|\| _| _|| _tt	| jd d || | _
tt	| jd d || | _d S )Nr   r   )super__init__r   heightwidthdim_headnn	Parametertorchrandn
height_rel	width_rel)self	feat_sizer$   scale	__class__r   r   r!   =   s
   
"&zPosEmbedRel.__init__c                 C   sf   |j \}}}||| j| jd}t|| jdd}|dd}t|| jdd}|| }||||}|S )Nr	   )r   r      r      )r   r   r   )r   r0   r   r1   r   )r   r   r"   r#   r   r*   r   r)   )r+   r   r   HW_rel_logits_wrel_logits_h
rel_logitsr   r   r   forwardD   s   zPosEmbedRel.forward)__name__
__module____qualname____doc__r!   r7   __classcell__r   r   r.   r   r   8   s    r   c                       s6   e Zd ZdZ		d fdd	Zd	d
 Zdd Z  ZS )BottleneckAttna   Bottleneck Attention
    Paper: `Bottleneck Transformers for Visual Recognition` - https://arxiv.org/abs/2101.11605

    The internal dimensions of the attention module are controlled by the interaction of several arguments.
      * the output dimension of the module is specified by dim_out, which falls back to input dim if not set
      * the value (v) dimension is set to dim_out // num_heads, the v projection determines the output dim
      * the query and key (qk) dimensions are determined by
        * num_heads * dim_head if dim_head is not None
        * num_heads * (dim_out * attn_ratio // num_heads) if dim_head is None
      * as seen above, attn_ratio determines the ratio of q and k relative to the output if dim_head not used

    Args:
        dim (int): input dimension to the module
        dim_out (int): output dimension of the module, same as dim if not set
        stride (int): output stride of the module, avg pool used if stride == 2 (default: 1).
        num_heads (int): parallel attention heads (default: 4)
        dim_head (int): dimension of query and key heads, calculated from dim_out * attn_ratio // num_heads if not set
        qk_ratio (float): ratio of q and k dimensions to output dimension when dim_head not set. (default: 1.0)
        qkv_bias (bool): add bias to q, k, and v projections
        scale_pos_embed (bool): scale the position embedding as well as Q @ K
    Nr   r1         ?Fc
           
         s   t    |d usJ d|p|}|| dksJ || _|p't|| dd| | _|| j | _|| j | _|| j | _| jd | _|	| _	t
j|| jd | j d|d| _t|| j| jd	| _|dkrht
ddnt
 | _|   d S )
NzBA concrete feature size matching expected input (H, W) is requiredr      )divisor      r   r   )bias)r$   r-   )r    r!   	num_headsr   dim_head_qk
dim_head_v
dim_out_qk	dim_out_vr-   scale_pos_embedr%   Conv2dqkvr   	pos_embed	AvgPool2dIdentitypoolreset_parameters)
r+   r   dim_outr,   striderC   r$   qk_ratioqkv_biasrH   r.   r   r   r!   j   s   
 zBottleneckAttn.__init__c                 C   sF   t | jj| jjjd d d t | jj| jd t | jj| jd d S )Nr   rA   )std)r   rJ   weightr   rK   r)   r-   r*   )r+   r   r   r   rO      s   zBottleneckAttn.reset_parametersc                 C   s   |j \}}}}t|| jjkd t|| jjkd | |}tj|| j| j| j	gdd\}}}|
|| j | jddd}|
|| j | jd}|
|| j | jddd}| jrh|| | | | j }	n|| | j | | }	|	jdd}	|	| dd
|| j	||}
| |
}
|
S )N r   )r   r	   r
   )r   r   rK   r"   r#   rJ   r'   splitrF   rG   r   rC   rD   r   rE   rH   r-   softmaxrN   )r+   r   r   Cr   r   r   kvattnoutr   r   r   r7      s   
"
zBottleneckAttn.forward)NNr   r1   Nr>   FF)r8   r9   r:   r;   r!   rO   r7   r<   r   r   r.   r   r=   T   s    r=   )r;   typingr   r'   torch.nnr%   torch.nn.functional
functionalr   helpersr   r   weight_initr   trace_utilsr   intr   Moduler   r=   r   r   r   r   <module>   s    