o
    oi-                     @  s   d Z ddlmZ ddlmZmZ ddlZddlmZ ddlm	Z	m
Z
mZ ddlmZ dgZG d	d
 d
e	ZG dd dejZG dd de	ZG dd dejZG dd de	ZG dd de	ZG dd de	Zg dZdddZdS )ae  Module that implement Vision Transformer (ViT).

Paper: https://paperswithcode.com/paper/an-image-is-worth-16x16-words-transformers-1

Based on: `https://towardsdatascience.com/implementing-visualttransformer-in-pytorch-184f9f16f632`

Added some tricks from: `https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py`
    )annotations)AnyCallableN)nn)ModuleTensorconcatenate)KORNIA_CHECKVisionTransformerc                      s(   e Zd Zd fddZdddZ  ZS )ResidualAddfnCallable[..., Tensor]returnNonec                   s   t    || _d S N)super__init__r   )selfr   	__class__ F/home/ubuntu/.local/lib/python3.10/site-packages/kornia/contrib/vit.pyr   )   s   

zResidualAdd.__init__xr   kwargsr   c                 K  s"   |}| j |fi |}||7 }|S r   )r   )r   r   r   resr   r   r   forward-   s   zResidualAdd.forward)r   r   r   r   )r   r   r   r   r   r   __name__
__module____qualname__r   r   __classcell__r   r   r   r   r   (   s    r   c                      s    e Zd Zdd fd
dZ  ZS )FeedForward        in_featuresinthidden_featuresout_featuresdropout_ratefloatr   r   c              	     s8   t  t||t t|t||t| d S r   )r   r   r   LinearGELUDropout)r   r#   r%   r&   r'   r   r   r   r   5   s   

zFeedForward.__init__)r"   )
r#   r$   r%   r$   r&   r$   r'   r(   r   r   r   r   r   r   r    r   r   r   r   r!   4   s    r!   c                      s(   e Zd Zd fd	d
ZdddZ  ZS )MultiHeadAttentionemb_sizer$   	num_headsatt_dropr(   	proj_dropr   r   c                   s   t    || _|| _|| }|d | _| j| j r'td| j d| j dt||d | _t	|| _
t||| _t	|| _d S )Ng      zySize of embedding inside the transformer decoder must be visible by number of headsfor correct multi-head attention Got: z embedding size and z numbers of heads   )r   r   r.   r/   scale
ValueErrorr   r)   qkvr+   r0   
projectionprojection_drop)r   r.   r/   r0   r1   	head_sizer   r   r   r   @   s"   

zMultiHeadAttention.__init__r   r   c                 C  s   |j \}}}| |||d| j|| j ddddd}|d |d |d }}}td||| j }	|	jdd}	| 	|	}	td	|	|}
|
dddd
 ||d}
| |
}
| |
}
|
S )
Nr2      r         zbhqd, bhkd -> bhqkdimzbhal, bhlv -> bhav )shaper5   reshaper/   permutetorcheinsumr3   softmaxr0   
contiguousviewr6   r7   )r   r   BNCr5   qkvattoutr   r   r   r   T   s   .


zMultiHeadAttention.forward)
r.   r$   r/   r$   r0   r(   r1   r(   r   r   r   r   r   r   r   r   r   r   r   r-   ?   s    r-   c                      s   e Zd Zd fd	d
Z  ZS )TransformerEncoderBlock	embed_dimr$   r/   r'   r(   dropout_attnr   r   c                   sd   t  ttt|dt||||t|ttt|dt||d ||dt| d S )Nư>r;   )r'   )	r   r   r   r   
Sequential	LayerNormr-   r+   r!   )r   rQ   r/   r'   rR   r   r   r   r   j   s    

z TransformerEncoderBlock.__init__)
rQ   r$   r/   r$   r'   r(   rR   r(   r   r   r,   r   r   r   r   rP   i   s    rP   c                      s4   e Zd Z					dd fddZdddZ  ZS )TransformerEncoder      r"   rQ   r$   depthr/   r'   r(   rR   r   r   c                   s8   t    tj fddt|D  | _g | _d S )Nc                 3  s    | ]
}t  V  qd S r   )rP   ).0_rR   r'   rQ   r/   r   r   	<genexpr>   s    z.TransformerEncoder.__init__.<locals>.<genexpr>)r   r   r   rT   rangeblocksresults)r   rQ   rY   r/   r'   rR   r   r\   r   r   ~   s
   

zTransformerEncoder.__init__r   r   c                 C  s2   g | _ |}| j D ]}||}| j | q
|S r   )r`   r_   childrenappend)r   r   rN   mr   r   r   r      s   zTransformerEncoder.forward)rW   rX   rX   r"   r"   )rQ   r$   rY   r$   r/   r$   r'   r(   rR   r(   r   r   rO   r   r   r   r   r   rV   }   s    rV   c                      sB   e Zd ZdZ					dd fddZdddZdddZ  ZS )PatchEmbeddingzJCompute the 2d image patch embedding ready to pass to transformer encoder.r2   rW         Nin_channelsr$   out_channels
patch_size
image_sizebackboneModule | Noner   r   c                   s   t    || _|| _|| _|ptj||||d| _|d ur,| |||f\}}|| _n|| d }t	t
dd|| _t	t
|d || _d S )N)kernel_sizestrider9   r:   )r   r   rg   rh   ri   r   Conv2drk   _compute_feats_dims	ParameterrB   randn	cls_token	positions)r   rg   rh   ri   rj   rk   	feat_sizer   r   r   r      s   
zPatchEmbedding.__init__tuple[int, int, int]tuple[int, int]c                 C  s:   |  tjdg|R   }|jd |jd |jd  fS )Nr:   r<   )rk   rB   zerosdetachr?   )r   rj   rN   r   r   r   rp      s   z"PatchEmbedding._compute_feats_dimsr   r   c                 C  s^   |  |}|j\}}}}|||dddd}| j|dd}t||gdd}|| j7 }|S )Nr<   r   r9   r:   r=   )rk   r?   rF   rA   rs   repeatr   rt   )r   r   rG   rH   r[   
cls_tokensr   r   r   r      s   

zPatchEmbedding.forward)r2   rW   re   rf   N)rg   r$   rh   r$   ri   r$   rj   r$   rk   rl   r   r   )rj   rv   r   rw   rO   )r   r   r   __doc__r   rp   r   r    r   r   r   r   rd      s    
rd   c                      s^   e Zd ZdZ									d)d* fddZed+ddZd,ddZed-d.d'd(Z	  Z
S )/r
   a  Vision transformer (ViT) module.

    The module is expected to be used as operator for different vision tasks.

    The method is inspired from existing implementations of the paper :cite:`dosovitskiy2020vit`.

    .. warning::
        This is an experimental API subject to changes in favor of flexibility.

    Args:
        image_size: the size of the input image.
        patch_size: the size of the patch to compute the embedding.
        in_channels: the number of channels for the input.
        embed_dim: the embedding dimension inside the transformer encoder.
        depth: the depth of the transformer.
        num_heads: the number of attention heads.
        dropout_rate: dropout rate.
        dropout_attn: attention dropout rate.
        backbone: an nn.Module to compute the image patches embeddings.

    Example:
        >>> img = torch.rand(1, 3, 224, 224)
        >>> vit = VisionTransformer(image_size=224, patch_size=16)
        >>> vit(img).shape
        torch.Size([1, 197, 768])

    rf   re   r2   rW   rX   r"   Nrj   r$   ri   rg   rQ   rY   r/   r'   r(   rR   rk   rl   r   r   c
                   s`   t    || _|| _|| _|| _t|||||	| _| jj}
t	|
||||| _
t|
d| _d S )NrS   )r   r   rj   ri   rg   
embed_sizerd   patch_embeddingrh   rV   encoderr   rU   norm)r   rj   ri   rg   rQ   rY   r/   r'   rR   rk   
hidden_dimr   r   r   r      s   
zVisionTransformer.__init__list[Tensor]c                 C  s   | j jS r   )r   r`   )r   r   r   r   encoder_results   s   z!VisionTransformer.encoder_resultsr   r   c              	   C  s   t |tstdt| | jg |jdd  R vr7|jd | jkr7td| j d| j d| j d|j | |}| 	|}| 
|}|S )Nz#Input x type is not a Tensor. Got: ry   rx   zInput image shape must be Bxr   z. Got: )
isinstancer   	TypeErrortyperj   r?   rg   r4   r   r   r   )r   r   rN   r   r   r   r      s   
*"


zVisionTransformer.forwardFvariantstr
pretrainedboolr   r   c           
      K  s   |  d\}}t|}ddddddddddddd	d
ddddddd| }|j||d tdi |}|rHt| }tj|}	||	 |S )a  Build ViT model based on the given config string.

        The format is ``vit_{size}/{patch_size}``.
        E.g. ``vit_b/16`` means ViT-Base, patch size 16x16. If ``pretrained=True``, AugReg weights are loaded.
        The weights are hosted on HuggingFace's model hub: https://huggingface.co/kornia.

        .. note::
            The available weights are: ``vit_l/16``, ``vit_b/16``, ``vit_s/16``, ``vit_ti/16``,
            ``vit_b/32``, ``vit_s/32``.

        Args:
            variant: ViT model variant e.g. ``vit_b/16``.
            pretrained: whether to load pre-trained AugReg weights.
            kwargs: other keyword arguments that will be passed to :func:`kornia.contrib.vit.VisionTransformer`.

        Returns:
            The respective ViT model

        Example:
            >>> from kornia.contrib import VisionTransformer
            >>> vit_model = VisionTransformer.from_config("vit_b/16", pretrained=True)

        /   rX   r2   )rQ   rY   r/   i     rW   i      re   i       )vit_tivit_svit_bvit_lvit_h)ri   Nr   )	splitr$   updater
   _get_weight_urlrB   hubload_state_dict_from_urlload_state_dict)
r   r   r   
model_typepatch_size_strri   model_configmodelurl
state_dictr   r   r   from_config  s"   





zVisionTransformer.from_config)	rf   re   r2   rW   rX   rX   r"   r"   N)rj   r$   ri   r$   rg   r$   rQ   r$   rY   r$   r/   r$   r'   r(   rR   r(   rk   rl   r   r   )r   r   rO   )F)r   r   r   r   r   r   r   r
   )r   r   r   r~   r   propertyr   r   staticmethodr   r    r   r   r   r   r
      s"    
)zvit_l/16zvit_b/16zvit_s/16z	vit_ti/16zvit_b/32zvit_s/32r   r   r   c                 C  s@   t | tv d|  d | d\}}d| | d| d| dS )z$Return the URL of the model weights.zVariant z% does not have pre-trained checkpointr   zhttps://huggingface.co/kornia/z_augreg_i21k_r224/resolve/main/-z.pth)r	   _AVAILABLE_WEIGHTSr   )r   r   ri   r   r   r   r   8  s   r   )r   r   r   r   )r~   
__future__r   typingr   r   rB   r   kornia.corer   r   r   kornia.core.checkr	   __all__r   rT   r!   r-   rP   rV   rd   r
   r   r   r   r   r   r   <module>   s"   	*+u