o
    ÓÙ¾i-  ã                   @   s”   d Z ddlmZmZmZ ddlZddlmZ ddlm	Z	 ddl
mZ ddlmZ ddlmZmZ dd	lmZ G d
d„ dejƒZG dd„ dejƒZdS )a:   Attention Pool 2D

Implementations of 2D spatial feature pooling using multi-head attention instead of average pool.

Based on idea in CLIP by OpenAI, licensed Apache 2.0
https://github.com/openai/CLIP/blob/3b473b0e682c091a9e53623eebc1ca1657385717/clip/model.py

Hacked together by / Copyright 2021 Ross Wightman
é    )ÚOptionalÚUnionÚTupleNé   )Úuse_fused_attn)Ú	to_2tuple)Úresample_abs_pos_embed)Úapply_rot_embedÚRotaryEmbedding)Útrunc_normal_c                       sð   e Zd ZU dZejje ed< 											d&d
e	de
e	 dee	ee	e	f f de
e	 de
e	 de
e	 dededededef‡ fdd„Zd'defdd„Zd(de
e	 de
e fdd„Zdejde	de	d ejfd!d"„Zd'd#efd$d%„Z‡  ZS ))ÚRotAttentionPool2daB   Attention based 2D feature pooling w/ rotary (relative) pos embedding.
    This is a multi-head attention based replacement for (spatial) average pooling in NN architectures.

    Adapted from the AttentionPool2d in CLIP w/ rotary embedding instead of learned embed.
    https://github.com/openai/CLIP/blob/3b473b0e682c091a9e53623eebc1ca1657385717/clip/model.py

    NOTE: While this impl does not require a fixed feature size, performance at differeing resolutions from
    train varies widely and falls off dramatically. I'm not sure if there is a way around this... -RW
    Ú
fused_attnNé   é@   TFÚtokenç        Úin_featuresÚout_featuresÚref_feat_sizeÚ	embed_dimÚhead_dimÚ	num_headsÚqkv_biasÚqkv_separateÚ	pool_typeÚclass_tokenÚ	drop_ratec                    sJ  t ƒ  ¡  |	dv sJ ‚|p| | _}|| _|p|| _t|ƒ}|d ur/|| dks*J ‚|| }n|| dks7J ‚|| }|| _|| _|	 ¡ | _	| jd | _
tƒ | _|
r]t t d|¡¡| _nd | _|rtj|||d| _tj|||d| _tj|||d| _d | _ntj||d |d| _t |¡| _t || j¡| _t| jd|d| _d S )	N©Ú r   r   ç      à¿r   ©Úbiasé   F)Ú	in_pixelsÚref_feat_shape)ÚsuperÚ__init__r   r   r   r   r   r   Úlowerr   Úscaler   r   ÚnnÚ	ParameterÚtorchÚzerosÚ	cls_tokenÚLinearÚqÚkÚvÚqkvÚDropoutÚdropÚprojr
   Ú	pos_embed)Úselfr   r   r   r   r   r   r   r   r   r   r   ©Ú	__class__© úP/home/ubuntu/.local/lib/python3.10/site-packages/timm/layers/attention_pool2d.pyr&   "   s8   



zRotAttentionPool2d.__init__Úzero_init_lastc                 C   s²   | j d u rA| jj}t| jj|d d tj | jj¡ t| j	j|d d tj | j	j¡ t| j
j|d d tj | j
j¡ d S | j j}t| j j|d d tj | j j¡ d S ©Nr   )Ústd)r2   r/   r   r   Úweightr)   ÚinitÚzeros_r!   r0   r1   ©r7   r<   r   r:   r:   r;   Úinit_weightsR   s   
zRotAttentionPool2d.init_weightsÚnum_classesc                 C   ó^   |d ur|dv s
J ‚|| _ |d ur-|dkrt | j|¡nt ¡ | _|dkr'|n| j| _d S d S ©Nr   r   ©r   r)   r.   r   ÚIdentityr5   r   r   ©r7   rD   r   r:   r:   r;   Úreset`   ó    þzRotAttentionPool2d.resetÚxÚHÚWÚreturnc                 C   óT   | j dkr|d d …df }|S |d d …dd …f  |jd ||d¡ dddd¡}|S ©Nr   r   r   éÿÿÿÿr"   é   ©r   ÚreshapeÚshapeÚpermute©r7   rL   rM   rN   r:   r:   r;   Ú_pooli   ó
   
2ÿzRotAttentionPool2d._poolÚ
pre_logitsc              	   C   s”  |j \}}}}|| }| d¡ dd¡}| jd u r(tj|jddd|gdd}ntj| j |j d dd¡|gdd}| jd u ry|  	|¡ 
||d | j| j¡ dd¡}|  |¡ 
||d | j| j¡ dd¡}	|  |¡ 
||d | j| j¡ dd¡}
n|  |¡ 
||d d| j| j¡ ddddd	¡}| d¡\}}	}
| j ||f¡\}}tj|d d …d d …d d…d d …f t|d d …d d …dd …d d …f ||ƒgdd |
¡}tj|	d d …d d …d d…d d …f t|	d d …d d …dd …d d …f ||ƒgdd |
¡}	| jrtj ||	|
¡}n|| j }||	 d
d¡ }|jdd}||
 }| dd¡ 
||d d¡}|  |¡}|r<|  |||¡}|S |  |¡}|  |||¡}|S )NrS   r   T©Úkeepdim©Údimr   rR   r"   é   éþÿÿÿ)rV   ÚflattenÚ	transposer-   r+   ÚcatÚmeanÚexpandr2   r/   rU   r   r   r0   r1   rW   Úunbindr6   Ú	get_embedr	   Útype_asr   r)   Ú
functionalÚscaled_dot_product_attentionr(   Úsoftmaxr4   rY   r5   )r7   rL   r[   ÚBÚ_rM   rN   ÚNr/   r0   r1   ÚrseÚrceÚattnr:   r:   r;   Úforwardq   s:   
$
&&(.XX


zRotAttentionPool2d.forward)
Nr   Nr   NTFr   Fr   ©F©NN)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r+   ÚjitÚFinalÚboolÚ__annotations__Úintr   r   r   ÚstrÚfloatr&   rC   rJ   ÚTensorrY   rs   Ú__classcell__r:   r:   r8   r;   r      sP   
 	ôþýüûúùø	÷
öõô0	r   c                       sð   e Zd ZU dZejje ed< 											d&d
e	de
e	ee	e	f f dee	 dee	 dee	 dee	 dededededef‡ fdd„Zd'defdd„Zd(dee	 dee fdd„Zdejde	de	d ejfd!d"„Zd'd#efd$d%„Z‡  ZS ))ÚAttentionPool2da©   Attention based 2D feature pooling w/ learned (absolute) pos embedding.
    This is a multi-head attention based replacement for (spatial) average pooling in NN architectures.

    It was based on impl in CLIP by OpenAI
    https://github.com/openai/CLIP/blob/3b473b0e682c091a9e53623eebc1ca1657385717/clip/model.py

    NOTE: This requires feature size upon construction and well prevent adaptive sizing of the network.
    r   r   Nr   TFr   r   r   Ú	feat_sizer   r   r   r   r   r   r   r   r   c                    s€  t ƒ  ¡  |	dv sJ ‚|p| | _}|| _|p|| _|d ur+|| dks&J ‚|| }n|| dks3J ‚|| }t|ƒ| _| jd | jd  | _|| _|| _	|	| _
| j	d | _tƒ | _|
rgt t d|¡¡| _nd | _|r‹tj|||d| _tj|||d| _tj|||d| _d | _nd  | _ | _| _tj||d |d| _t |¡| _t || j¡| _t t | jd |¡¡| _|  ¡  d S )Nr   r   r   r   r    r"   )r%   r&   r   r   r   r   r„   Úseq_lenr   r   r   r(   r   r   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   rC   )r7   r   r„   r   r   r   r   r   r   r   r   r   r8   r:   r;   r&   ¡   s>   



zAttentionPool2d.__init__r<   c                 C   sÂ   | j d u r@| jj}t| jj|d d tj | jj¡ t| j	j|d d tj | j	j¡ t| j
j|d d tj | j
j¡ n| j j}t| j j|d d tj | j j¡ t| j|d d d S r=   )r2   r/   r   r   r?   r)   r@   rA   r!   r0   r1   r6   rB   r:   r:   r;   rC   Õ   s   
zAttentionPool2d.init_weightsrD   c                 C   rE   rF   rG   rI   r:   r:   r;   rJ   ä   rK   zAttentionPool2d.resetrL   rM   rN   rO   c                 C   rP   rQ   rT   rX   r:   r:   r;   rY   í   rZ   zAttentionPool2d._poolr[   c                 C   sê  |j \}}}}|| }| d¡ dd¡}| jd u r(tj|jddd|gdd}ntj| j |j d dd¡|gdd}t| j	 
d¡||fdd}|| }| jd u rŠ|  |¡ ||d | j| j¡ dd¡}	|  |¡ ||d | j| j¡ dd¡}
|  |¡ ||d | j| j¡ dd¡}n|  |¡ |dd	| j| j¡ ddd	dd
¡}| d¡\}	}
}| jr³tj |	|
|¡}n|	| j }	|	|
 dd¡ }|jdd}|| }| dd¡ ||d d¡}|  |¡}|rç|  |||¡}|S |  |¡}|  |||¡}|S )NrS   r   Tr\   r^   r   rR   )Únum_prefix_tokensr"   r`   ra   )rV   rb   rc   r-   r+   rd   re   rf   r   r6   Ú	unsqueezer2   r/   rU   r   r   r0   r1   rW   rg   r   r)   rj   rk   r(   rl   r4   rY   r5   )r7   rL   r[   rm   rn   rM   rN   ro   r6   r/   r0   r1   rr   r:   r:   r;   rs   õ   s8   
$
&&(*


zAttentionPool2d.forward)
r   NNr   NTFr   Fr   rt   ru   )rv   rw   rx   ry   r+   rz   r{   r|   r}   r~   r   r   r   r   r€   r&   rC   rJ   r   rY   rs   r‚   r:   r:   r8   r;   rƒ   –   sP   
 ôþýüûúùø	÷
öõô4	rƒ   )ry   Útypingr   r   r   r+   Útorch.nnr)   Úconfigr   Úhelpersr   r6   r   Úpos_embed_sincosr	   r
   Úweight_initr   ÚModuler   rƒ   r:   r:   r:   r;   Ú<module>   s    	 