o
    ÓÙ¾i12  ã                   @   s¢   d dl mZmZmZmZ d dlZd dlmZ d dlmZ	 ddl
mZ ddlmZ ddlmZ dd	lmZ G d
d„ dejƒZG dd„ dejƒZG dd„ dejƒZdS )é    )ÚListÚOptionalÚTypeÚUnionN)Únn)Ú
functionalé   )Úuse_fused_attn)Úcreate_conv2d)Ú	to_2tuple)Úcreate_pool2dc                       sn   e Zd ZdZ						ddedee ded	ed
ededef‡ fdd„Zdd„ Zddee	j
 fdd„Z‡  ZS )ÚMultiQueryAttentionV2aÀ  Multi Query Attention.

    Fast Transformer Decoding: One Write-Head is All You Need
    https://arxiv.org/pdf/1911.02150.pdf

    This is an acceletor optimized version - removing multiple unnecessary
    tensor transpose by re-arranging indices according to the following rules: 1)
    contracted indices are at the end, 2) other indices have the same order in the
    input and output tensores.

    Compared to V1, this gives 3x speed up.
    Né   é@   ç        ÚdimÚdim_outÚ	num_headsÚkey_dimÚ	value_dimÚ	attn_dropÚ	proj_dropc                    s²   t ƒ  ¡  |p|}|| _|| _|| _|d | _t t 	| j| j|g¡¡| _
t t 	|| jg¡¡| _t t 	|| jg¡¡| _t |¡| _t t 	|| j| jg¡¡| _t |¡| _dS )zInitializer.ç      à¿N)ÚsuperÚ__init__r   r   r   Úscaler   Ú	ParameterÚtorchÚrandnÚ
query_projÚkey_projÚ
value_projÚDropoutr   Úout_projr   )Úselfr   r   r   r   r   r   r   ©Ú	__class__© úK/home/ubuntu/.local/lib/python3.10/site-packages/timm/layers/attention2d.pyr      s   

zMultiQueryAttentionV2.__init__c                 C   s$   |j }| |d |d d¡ dd¡S )zBReshapes a tensor to three dimensions, keeping the first and last.r   r   éÿÿÿÿé   )ÚshapeÚreshapeÚ	transpose©r$   ÚtÚsr'   r'   r(   Ú_reshape_input4   s   z$MultiQueryAttentionV2._reshape_inputÚmc                 C   sÄ   |j \}}}}|dur|n|}|  |¡}|  |¡}t d|| j¡}	t d|| j¡}
t d|	|
¡| j }|jdd}|  |¡}t d|| j	¡}t d||¡}t d	|| j
¡}|  |¡}| |d||¡S )
úRun layer computation.Nzbnd,hkd->bnhkzbmd,dk->bmkzbnhk,bmk->bnhmr)   ©r   zbmd,dv->bmvzbnhm,bmv->bnhvzbnhv,dhv->bdn)r+   r1   r   Úeinsumr   r    r   Úsoftmaxr   r!   r#   r   r,   )r$   Úxr2   ÚbÚ_ÚhÚwÚ
reshaped_xÚ
reshaped_mÚqÚkÚattnÚvÚoÚresultr'   r'   r(   Úforward<   s   



zMultiQueryAttentionV2.forward)Nr   r   r   r   r   ©N)Ú__name__Ú
__module__Ú__qualname__Ú__doc__Úintr   Úfloatr   r1   r   ÚTensorrD   Ú__classcell__r'   r'   r%   r(   r      s2    øþýüûúùør   c                       s  e Zd ZU dZejje ed< ddddddddddde	j
d	fd
edee dedee dee dededededeeeee f dededee	j def‡ fdd„Zdd„ Zdejfdd„Zdejdedefdd „Zdejded!ed"efd#d$„Zd(d%eej fd&d'„Z‡  ZS ))ÚMultiQueryAttention2daª  Multi Query Attention with spatial downsampling.

     3 parameters are introduced for the spatial downsampling:
     1. kv_stride: downsampling factor on Key and Values only.
     2. query_strides: horizontal & vertical strides on Query only.

    This is an optimized version.
    1. Projections in Attention is explicit written out as 1x1 Conv2D.
    2. Additional reshapes are introduced to bring a up to 3x speed up.
    Ú
fused_attnNr   r   é   Ú r   Fr   r   r   r   r   Úquery_stridesÚ	kv_strideÚdw_kernel_sizeÚdilationÚpaddingr   r   Ú
norm_layerÚuse_biasc                    s,  t ƒ  ¡  |p|}|| _|p|| | _|p|| | _t|ƒ| _|| _tdd„ | jD ƒƒ| _	| jd | _
tƒ | _|| _t ¡ | _| j	rh|
dkrT| j dtd| jdd¡ n| j dtj|d¡ | j d	||ƒ¡ | j d
t|| j| j d|d¡ t ¡ | _|dkr›| j dt|||||	|
dd¡ | j d	||ƒ¡ | j d
t|| jd|
|d¡ t ¡ | _|dkrÌ| j dt|||||	|
dd¡ | j d	||ƒ¡ | j d
t|| jd|d¡ t |¡| _t ¡ | _| j	rö| j dtj| jddd¡ | j d
t| j| j |d|d¡ | j dt |¡¡ d| _dS )a{  Initializer.

        Args:
          num_heads: Number of attention heads.
          key_dim: Size of the attention key dimension.
          value_dim: Size of the attention value dimension.
          query_strides: Vertical stride size for query only.
          kv_stride: Key and value stride size.
          dw_kernel_size: Spatial dimension of the depthwise kernel.
        c                 S   s   g | ]}|d k‘qS )r   r'   )Ú.0r0   r'   r'   r(   Ú
<listcomp>   s    z2MultiQueryAttention2d.__init__.<locals>.<listcomp>r   ÚsameÚ	down_poolÚavg)Úkernel_sizerV   )r^   ÚnormÚprojr   )r^   ÚbiasÚ	down_convT)r^   ÚstriderU   rV   Ú	depthwise)r^   rV   ra   ÚupsampleÚbilinearF)Úscale_factorÚmodeÚalign_cornersÚdropN)r   r   r   r   r   r   rR   rS   ÚanyÚhas_query_stridesr   r	   rO   rj   r   Ú
SequentialÚqueryÚ
add_moduler   Ú	AvgPool2dr
   ÚkeyÚvaluer"   r   ÚoutputÚUpsampler5   )r$   r   r   r   r   r   rR   rS   rT   rU   rV   r   r   rW   rX   r%   r'   r(   r   _   s˜   




ý

ü

ù	
û

ù	
ü


ü
zMultiQueryAttention2d.__init__c                 C   sz   t j | jjj¡ t j | jjj¡ t j | jjj¡ | jdkr2t j | jj	j¡ t j | jj	j¡ t j | j
jj¡ d S )Nr   )r   ÚinitÚxavier_uniform_rn   r`   Úweightrq   rr   rS   rb   rs   )r$   r'   r'   r(   Úinit_weightsÐ   s   
z"MultiQueryAttention2d.init_weightsr/   c                 C   s<   |j }| |d |d d¡ dd¡}| jr|S | d¡ ¡ S )zFReshapes a tensor to three dimensions, keeping the batch and channels.r   r   r)   r*   )r+   r,   r-   r5   Ú	unsqueezeÚ
contiguousr.   r'   r'   r(   r1   Ú   s
   z$MultiQueryAttention2d._reshape_inputc                 C   sD   |j }| |d ||d¡}| jr| dddd¡ ¡ S | dd¡ ¡ S )z?Reshapes projected query: [b, n, n, h x k] -> [b, n x n, h, k].r   r)   rP   r   r*   éþÿÿÿ)r+   r,   r5   Úpermuterz   r-   )r$   r/   r   r   r0   r'   r'   r(   Ú_reshape_projected_queryã   s
   z.MultiQueryAttention2d._reshape_projected_queryÚh_pxÚw_pxc                 C   sH   |j }|d | }| js| dd¡}| |d |||¡ dddd¡ ¡ S )z2Reshape output:[b, n x n x h, k] -> [b, n, n, hk].r)   r   r*   r   rP   )r+   r5   r-   r,   r|   rz   )r$   r/   r   r~   r   r0   Úfeat_dimr'   r'   r(   Ú_reshape_outputì   s
   $z%MultiQueryAttention2d._reshape_outputÚ	attn_maskc                 C   sL  |j  \}}}}}|  |¡}|  || j| j¡}|  |¡}	|  |	¡}	|  |¡}
|  |
¡}
| jrSt	 d||	¡| j
 }|dur@|| }|jdd}|  |¡}t	 d||
¡}n9| jrhtj||	|
|| jrc| jjndd}n$|| j
 }||	 dd¡ }|dur}|| }|jdd}|  |¡}||
 }|  || j|| jd	  || jd
  ¡}|  |¡}|S )r3   zblhk,bpk->blhpNr)   r4   zblhp,bpk->blhkr   ©r‚   Ú	dropout_pr{   r   r   )r+   rn   r}   r   r   rq   r1   rr   r5   r   r   r6   r   rO   ÚFÚscaled_dot_product_attentionÚtrainingÚpr-   r   rR   rs   )r$   r7   r‚   ÚBÚCÚHÚWr0   r>   r?   rA   r@   rB   r'   r'   r(   rD   ô   s<   





ý

&
zMultiQueryAttention2d.forwardrE   )rF   rG   rH   rI   r   ÚjitÚFinalÚboolÚ__annotations__r   ÚBatchNorm2drJ   r   r   Ústrr   rK   r   ÚModuler   rx   rL   r1   r}   r   rD   rM   r'   r'   r%   r(   rN   R   sd   
 
ñþýüûúùø	÷
öõôóòñq
		rN   c                       s|   e Zd ZU ejje ed< 	 							ddede	e d	ed
ededede
de
f‡ fdd„Zdde	ej fdd„Z‡  ZS )ÚAttention2drO   Né    TFr   r   r   r   ra   Úexpand_firstÚ
head_firstr   r   c	           
         s„   t ƒ  ¡  |p|}|r|n|}	|| _|	| | _|| _tƒ | _tj||	d d|d| _	t 
|¡| _tj|	|d|d| _t 
|¡| _d S )NrP   r   )ra   )r   r   r   Údim_headr—   r	   rO   r   ÚConv2dÚqkvr"   r   r`   r   )
r$   r   r   r   ra   r–   r—   r   r   Údim_attnr%   r'   r(   r   *  s   

zAttention2d.__init__r‚   c                 C   sZ  |j \}}}}| jr"|  |¡ || j| jd d¡jddd\}}}	n|  |¡ |d| j| jd¡ d¡\}}}	| j	rit
jjj| dd¡ ¡ | dd¡ ¡ |	 dd¡ ¡ || jrZ| jjndd dd¡ |d||¡}n8| dd¡}|	 dd¡}	|| | d¡d	  }
|d urˆ|
| }
|
jdd}
|  |
¡}
|
|	  dd¡ |d||¡}|  |¡}|  |¡}|S )
NrP   r)   r*   r4   r   r{   r   rƒ   r   )r+   r—   rš   Úviewr   r˜   Úchunkr,   ÚunbindrO   r   r   r   r†   r-   rz   r‡   r   rˆ   Úsizer6   r`   r   )r$   r7   r‚   r‰   rŠ   r‹   rŒ   r>   r?   rA   r@   r'   r'   r(   rD   B  s2   0(ûú


zAttention2d.forward)Nr•   TFFr   r   rE   )rF   rG   rH   r   r   rŽ   r   r   rJ   r   rK   r   rL   rD   rM   r'   r'   r%   r(   r”   &  s8   
 ÷þýüûúùø	÷r”   )Útypingr   r   r   r   r   r   Útorch.nnr   r…   Úconfigr	   r
   Úhelpersr   Úpool2d_samer   r“   r   rN   r”   r'   r'   r'   r(   Ú<module>   s    E U