o
    €o™i>)  ã                   @   sâ   d dl mZmZmZ d dlZd dlmZ d dlmZmZ de	de	defdd	„Z
dde	de	de	de	def
dd„ZG dd„ deƒZG dd„ deƒZG dd„ deƒZG dd„ deƒZG dd„ deƒZG dd„ deƒZG dd„ deƒZdS )é    )ÚAnyÚDictÚTupleN)Únn)ÚModuleÚTensorÚinpÚoupÚreturnc              
   C   s*   t  t j| |dddddt  |¡t  ¡ ¡S )z&Apply 1x1 Convolution with Batch Norm.é   r   F©Úbias©r   Ú
SequentialÚConv2dÚBatchNorm2dÚSiLU)r   r	   © r   úM/home/ubuntu/.local/lib/python3.10/site-packages/kornia/contrib/vit_mobile.pyÚconv_1x1_bn   ó   *r   é   r   Úkernal_sizeÚstridec              
   C   s*   t  t j| |||dddt  |¡t  ¡ ¡S )z&Apply NxN Convolution with Batch Norm.r   Fr   r   )r   r	   r   r   r   r   r   Úconv_nxn_bn   r   r   c                       sH   e Zd Zdededdf‡ fdd„Zdedeee	f defd	d
„Z
‡  ZS )ÚPreNormÚdimÚfnr
   Nc                    s    t ƒ  ¡  t |¡| _|| _d S ©N)ÚsuperÚ__init__r   Ú	LayerNormÚnormr   )Úselfr   r   ©Ú	__class__r   r   r    %   s   

zPreNorm.__init__ÚxÚkwargsc                 K   s   | j |  |¡fi |¤ŽS r   )r   r"   )r#   r&   r'   r   r   r   Úforward*   s   zPreNorm.forward)Ú__name__Ú
__module__Ú__qualname__Úintr   r    r   r   Ústrr   r(   Ú__classcell__r   r   r$   r   r   $   s    &r   c                	       sB   e Zd Zddedededdf‡ fdd„Zd	edefd
d„Z‡  ZS )ÚFeedForwardç        r   Ú
hidden_dimÚdropoutr
   Nc              	      sB   t ƒ  ¡  t t ||¡t ¡ t |¡t ||¡t |¡¡| _d S r   )r   r    r   r   ÚLinearr   ÚDropoutÚnet)r#   r   r1   r2   r$   r   r   r    /   s   
*
ÿzFeedForward.__init__r&   c                 C   s
   |   |¡S r   )r5   ©r#   r&   r   r   r   r(   5   s   
zFeedForward.forward©r0   ©	r)   r*   r+   r,   Úfloatr    r   r(   r.   r   r   r$   r   r/   .   s     r/   c                       sF   e Zd Zddededededd	f
‡ fd
d„Zdedefdd„Z‡  ZS )Ú	Attentioné   é@   r0   r   ÚheadsÚdim_headr2   r
   Nc                    sŠ   t ƒ  ¡  || }|dko||k }|| _|d | _tjdd| _tj||d dd| _|r>t 	t ||¡t 
|¡¡| _d S t ¡ | _d S )Nr   g      à¿éÿÿÿÿ©r   r   Fr   )r   r    r=   Úscaler   ÚSoftmaxÚattendr3   Úto_qkvr   r4   ÚIdentityÚto_out)r#   r   r=   r>   r2   Ú	inner_dimÚproject_outr$   r   r   r    :   s   

2zAttention.__init__r&   c           	         s˜   ˆ  |¡jddd}|d j\‰ ‰‰‰‡ ‡‡‡‡fdd„|D ƒ\}}}t || dd¡¡ˆj }ˆ |¡}t ||¡}| dd¡ ˆ ˆˆˆ¡}ˆ 	|¡S )	Nr   r?   r@   r   c              	   3   s2    | ]}|  ˆ ˆˆˆjˆˆj ¡ d d¡V  qdS )é   r   N)Úreshaper=   Ú	transpose)Ú.0Út©ÚbÚhdÚnÚpr#   r   r   Ú	<genexpr>K   s   €0 z$Attention.forward.<locals>.<genexpr>éþÿÿÿrI   )
rD   ÚchunkÚshapeÚtorchÚmatmulrK   rA   rC   rJ   rF   )	r#   r&   ÚqkvÚqÚkÚvÚdotsÚattnÚoutr   rN   r   r(   G   s    

zAttention.forward)r;   r<   r0   r8   r   r   r$   r   r:   9   s    $r:   c                       sR   e Zd ZdZddedededededed	d
f‡ fdd„Zded	efdd„Z‡  Z	S )ÚTransformera¢  Transformer block described in ViT.

    Paper: https://arxiv.org/abs/2010.11929
    Based on: https://github.com/lucidrains/vit-pytorch

    Args:
        dim: input dimension.
        depth: depth for transformer block.
        heads: number of heads in multi-head attention layer.
        dim_head: head size.
        mlp_dim: dimension of the FeedForward layer.
        dropout: dropout ratio, defaults to 0.

    r0   r   Údepthr=   r>   Úmlp_dimr2   r
   Nc                    s\   t ƒ  ¡  t g ¡| _t|ƒD ]}| j t t|t||||ƒƒt|t	|||ƒƒg¡¡ qd S r   )
r   r    r   Ú
ModuleListÚlayersÚrangeÚappendr   r:   r/   )r#   r   ra   r=   r>   rb   r2   Ú_r$   r   r   r    d   s   
þÿÿÿzTransformer.__init__r&   c                 C   s,   | j D ]\}}||ƒ| }||ƒ| }q|S r   )rd   )r#   r&   r^   Úffr   r   r   r(   q   s   zTransformer.forwardr7   )
r)   r*   r+   Ú__doc__r,   r9   r    r   r(   r.   r   r   r$   r   r`   T   s    ,r`   c                       sJ   e Zd ZdZddededededd	f
‡ fd
d„Zdedefdd„Z‡  ZS )ÚMV2Blockaq  MV2 block described in MobileNetV2.

    Paper: https://arxiv.org/pdf/1801.04381
    Based on: https://github.com/tonylins/pytorch-mobilenet-v2

    Args:
        inp: input channel.
        oup: output channel.
        stride: stride for convolution, defaults to 1, set to 2 if down-sample.
        expansion: expansion ratio for hidden dimension, defaults to 4.

    r   é   r   r	   r   Ú	expansionr
   Nc                    sö   t ƒ  ¡  || _t|| ƒ}| jdko||k| _|dkrCt tj||d|d|ddt |¡t 	¡ tj||dddddt |¡¡| _
d S t tj||dddddt |¡t 	¡ tj||d|d|ddt |¡t 	¡ tj||dddddt |¡¡| _
d S )Nr   r   F)Úgroupsr   r   r   )r   r    r   r,   Úuse_res_connectr   r   r   r   r   Úconv)r#   r   r	   r   rl   r1   r$   r   r   r    †   s,   

ù

õzMV2Block.__init__r&   c                 C   s   | j r
||  |¡ S |  |¡S r   )rn   ro   r6   r   r   r   r(   ¦   s   
zMV2Block.forward)r   rk   )	r)   r*   r+   ri   r,   r    r   r(   r.   r   r   r$   r   rj   x   s    $ rj   c                       s`   e Zd ZdZ	ddededededeeef ded	ed
df‡ fdd„Zded
efdd„Z	‡  Z
S )ÚMobileViTBlockay  MobileViT block mentioned in MobileViT.

    Args:
        dim: input dimension of Transformer.
        depth: depth of Transformer.
        channel: input channel.
        kernel_size: kernel size.
        patch_size: patch size for folding and unfloding.
        mlp_dim: dimension of the FeedForward layer in Transformer.
        dropout: dropout ratio, defaults to 0.

    r0   r   ra   ÚchannelÚkernel_sizeÚ
patch_sizerb   r2   r
   Nc                    sf   t ƒ  ¡  |\| _| _t|||ƒ| _t||ƒ| _t||dd||ƒ| _	t||ƒ| _
td| ||ƒ| _d S )Nrk   r;   rI   )r   r    ÚphÚpwr   Úconv1r   Úconv2r`   ÚtransformerÚconv3Úconv4)r#   r   ra   rq   rr   rs   rb   r2   r$   r   r   r    »   s   

zMobileViTBlock.__init__r&   c           	      C   sö   |  ¡ }|  |¡}|  |¡}|j\}}}}|| j || j }}| || | | j|| j¡ dd¡}| |||| | j| j ¡ dd¡}|  |¡}| dd¡ || | || j| j¡}| dd¡ ||||¡}|  	|¡}t
 ||fd¡}|  |¡}|S )Nr   rI   r   )Úclonerv   rw   rV   rt   ru   rJ   rK   rx   ry   rW   Úcatrz   )	r#   r&   ÚyrO   ÚdÚhÚwÚnhÚnwr   r   r   r(   Ð   s   

$$
$

zMobileViTBlock.forwardr7   )r)   r*   r+   ri   r,   r   r9   r    r   r(   r.   r   r   r$   r   rp   ­   s*    øþýüû
úùø	÷rp   c                       sT   e Zd ZdZ	ddededeeef d	ed
df
‡ fdd„Zde	d
e	fdd„Z
‡  ZS )Ú	MobileViTa>  Module MobileViT. Default arguments is for MobileViT XXS.

    Paper: https://arxiv.org/abs/2110.02178
    Based on: https://github.com/chinhsuanwu/mobilevit-pytorch

    Args:
        mode: 'xxs', 'xs' or 's', defaults to 'xxs'.
        in_channels: the number of channels for the input image.
        patch_size: image_size must be divisible by patch_size.
        dropout: dropout ratio in Transformer.

    Example:
        >>> img = torch.rand(1, 3, 256, 256)
        >>> mvit = MobileViT(mode='xxs')
        >>> mvit(img).shape
        torch.Size([1, 320, 8, 8])

    Úxxsr   ©rI   rI   r0   ÚmodeÚin_channelsrs   r2   r
   Nc           
         s&  t ƒ  ¡  |dkrd}g d¢}g d¢}n|dkr#d}g d¢}g d¢}n|d	kr1d}g d
¢}g d¢}d}g d¢}	t||d dd| _t g ¡| _| j t|d |d d|ƒ¡ | j t|d |d d|ƒ¡ | j t|d |d d|ƒ¡ | j t|d |d d|ƒ¡ | j t|d |d d|ƒ¡ | j t|d |d d|ƒ¡ | j t|d |d d|ƒ¡ t g ¡| _	| j	 t
|d |	d |d ||t|d d ƒ|d¡ | j	 t
|d |	d |d ||t|d d ƒ|d¡ | j	 t
|d |	d |d ||t|d d ƒ|d¡ t|d |d ƒ| _d S )Nr„   rI   )r<   éP   é`   )é   rŠ   é   r‹   é0   rŒ   r<   r<   rˆ   rˆ   i@  Úxsrk   )r‰   éx   é   )rŠ   é    rŒ   rŒ   r<   r<   rˆ   rˆ   r‰   r‰   i€  Ús)r   éÀ   éð   )rŠ   r   r<   r<   r‰   r‰   é€   r”   é    r•   i€  r   )rI   rk   r   r   )r   r   é   é   é   r;   )r2   é	   rT   r?   )r   r    r   rv   r   rc   Úmv2rf   rj   Úmvitrp   r,   r   rw   )
r#   r†   r‡   rs   r2   rl   ÚdimsÚchannelsrr   ra   r$   r   r   r      sF   


,ÿ,ÿ,ÿzMobileViT.__init__r&   c                 C   s¤   |   |¡}| jd |ƒ}| jd |ƒ}| jd |ƒ}| jd |ƒ}| jd |ƒ}| jd |ƒ}| jd |ƒ}| jd |ƒ}| jd |ƒ}| jd |ƒ}|  |¡}|S )Nr   r   rI   r   rk   r–   r—   )rv   rš   r›   rw   r6   r   r   r   r(   .  s   

zMobileViT.forward)r„   r   r…   r0   )r)   r*   r+   ri   r-   r,   r   r9   r    r   r(   r.   r   r   r$   r   rƒ   î   s    ÿÿÿ
ÿÿþ,rƒ   )r   r   )Útypingr   r   r   rW   r   Úkornia.corer   r   r,   r   r   r   r/   r:   r`   rj   rp   rƒ   r   r   r   r   Ú<module>   s    
$5A