o
    oiS                     @  sp  d dl mZ d dlZd dlmZmZmZ d dlZd dlm	  m
Z d dlm	Z	 d dlmZ d dlmZmZmZmZ d dlmZmZ d dlmZ d3ddZG dd de	jZG dd de	jZG dd deZG dd deZG dd deZG dd de	jZG dd deZ G dd deZ!G dd  d eZ"G d!d" d"eZ#d4d&d'Z$d5d6d-d.Z%d5d6d/d0Z&d5d6d1d2Z'dS )7    )annotationsN)AnyOptionalSequence)nn)
checkpoint)DropPathLayerNorm2dwindow_partitionwindow_unpartition)ModuleTensor)KORNIA_CHECKxint | tuple[int, int]returntuple[int, int]c                 C  s   t | tr	| | fS | S N)
isinstanceint)r    r   R/home/ubuntu/.local/lib/python3.10/site-packages/kornia/contrib/models/tiny_vit.py
_make_pair$   s   r   c                      s*   e Zd Zdddejfd fddZ  ZS )ConvBN   r   in_channelsr   out_channelskernel_sizestridepaddinggroups
activationtype[Module]r   Nonec              	     s<   t    tj||||||dd| _t|| _| | _d S )NF)r    bias)super__init__r   Conv2dcBatchNorm2dbnact)selfr   r   r   r   r   r    r!   	__class__r   r   r&   )   s   

zConvBN.__init__)r   r   r   r   r   r   r   r   r   r   r    r   r!   r"   r   r#   )__name__
__module____qualname__r   Identityr&   __classcell__r   r   r-   r   r   (   s    r   c                      s$   e Zd Zejfd
 fdd	Z  ZS )
PatchEmbedr   r   	embed_dimr!   r"   r   r#   c              
     s@   t    tt||d ddd| t|d |ddd| _d S )N      r   )r%   r&   r   
Sequentialr   seq)r,   r   r5   r!   r-   r   r   r&   :   s   
(
zPatchEmbed.__init__)r   r   r5   r   r!   r"   r   r#   r/   r0   r1   r   GELUr&   r3   r   r   r-   r   r4   9   s    r4   c                      s0   e Zd Zejdfd fddZdddZ  ZS )MBConv        r   r   r   expansion_ratiofloatr!   r"   	drop_pathr   r#   c                   sb   t    t|| }t||d|d| _t||ddd||| _t||d| _t|| _| | _	d S )Nr   r!   r7   )
r%   r&   r   r   conv1conv2conv3r   r@   r+   )r,   r   r   r>   r!   r@   hidden_channelsr-   r   r   r&   B   s   

zMBConv.__init__r   r   c                 C  s&   |  || | | | | S r   )r+   r@   rD   rC   rB   r,   r   r   r   r   forwardR   s   &zMBConv.forward)r   r   r   r   r>   r?   r!   r"   r@   r?   r   r#   r   r   r   r   r/   r0   r1   r   r;   r&   rG   r3   r   r   r-   r   r<   A   s
    r<   c                      s.   e Zd Zejfd fddZdddZ  ZS )PatchMerginginput_resolutionr   dimr   out_dimr   r!   r"   r   r#   c              	     s^   t |dv d t   t|| _t||d|d| _t||d|d||d| _t||d| _d S )N)r   r6   zstride must be either 1 or 2r   rA   r7   )r    r!   )	r   r%   r&   r   rK   r   rB   rC   rD   )r,   rK   rL   rM   r   r!   r-   r   r   r&   W   s   

zPatchMerging.__init__r   r   c                 C  sL   |j dkr|ddd| j}| | | |}|ddd}|S )Nr7   r   r6   )ndim	transpose	unflattenrK   rD   rC   rB   flattenrF   r   r   r   rG   f   s
   
zPatchMerging.forward)rK   r   rL   r   rM   r   r   r   r!   r"   r   r#   rH   rI   r   r   r-   r   rJ   V   s    rJ   c                      s6   e Zd Zejddddfd fddZdddZ  ZS )	ConvLayerr=   NF      @rL   r   depthr!   r"   r@   float | list[float]
downsampleOptional[Module]use_checkpointboolconv_expand_ratior?   r   r#   c                   sR   t    || _ttsg| t fddt|D | _|| _	d S )Nc              	     s    g | ]}t  | qS r   )r<   .0ir!   rZ   rL   r@   r   r   
<listcomp>   s     z&ConvLayer.__init__.<locals>.<listcomp>)
r%   r&   rX   r   listr   
ModuleListrangeblocksrV   )r,   rL   rT   r!   r@   rV   rX   rZ   r-   r^   r   r&   o   s   




zConvLayer.__init__r   r   c                 C  >   | j D ]}| jrt||n||}q| jd ur| |}|S r   rc   rX   r   rV   r,   r   blkr   r   r   rG      
   


zConvLayer.forward)rL   r   rT   r   r!   r"   r@   rU   rV   rW   rX   rY   rZ   r?   r   r#   rH   rI   r   r   r-   r   rR   n   s    rR   c                      s&   e Zd Zejdfd fddZ  ZS )MLPr=   in_featuresr   hidden_featuresout_featuresr!   r"   dropr?   r   r#   c                   sV   t    t|| _t||| _| | _t|| _	t||| _
t|| _d S r   )r%   r&   r   	LayerNormnormLinearfc1act1Dropoutdrop1fc2drop2)r,   rj   rk   rl   r!   rm   r-   r   r   r&      s   
zMLP.__init__)rj   r   rk   r   rl   r   r!   r"   rm   r?   r   r#   r:   r   r   r-   r   ri      s    ri   c                      sV   e Zd Z			dd fddZedddZe dd  fddZd!ddZ	  Z
S )"	Attention   rS      rz   rL   r   key_dim	num_heads
attn_ratior?   
resolutionr   r   r#   c           	        s   t    || _|d | _|| _|| | _t|| | _t|| | | _|| _	| j| jd  }t
|| _t
||| _t
| j|| _| |\}}t
t||| _| jd|dd |  d | _d S )Ng      r6   attention_bias_idxsF)
persistent)r%   r&   r|   scaler{   nh_kdr   ddhr}   r   rn   ro   rp   qkvprojbuild_attention_bias	Parametertorchzerosattention_biasesregister_bufferab)	r,   rL   r{   r|   r}   r~   hindicesattn_offset_sizer-   r   r   r&      s"   



zAttention.__init__tuple[Tensor, int]c                 C  s   | \}}t |}t |}||}||}|d d d f |d d d f   }|d d d f |d d d f   }|| | }	t j|	dd\}
}||| || }|
 }||fS )NT)return_inverse)r   arangerepeat_interleaverepeatabsuniqueviewnumel)r~   HWrowscolsrrccdrdckeysunique_keysinverser   r   r   r   r   r      s   



$$zAttention.build_attention_biasTmoderY   c                   s>   t  | |r| jd urd | _| S | jd d | jf | _| S r   )r%   trainr   r   r   )r,   r   r-   r   r   r      s
   zAttention.trainr   r   c                 C  s   |j \}}}| |}| |}|||| jddddd}|j| j| j| jgdd\}}}| j	r<| j
d d | jf n| j}	||dd | j |	 }
|
jdd}
|
| dd||| j}| |}|S )Nr   r6   r   r7   )rL   )shapero   r   r   r|   permutesplitr{   r   trainingr   r   r   rO   r   softmaxreshaper   r   )r,   r   BN_r   qkvr$   attnr   r   r   rG      s   

  
zAttention.forward)rx   rS   ry   )rL   r   r{   r   r|   r   r}   r?   r~   r   r   r#   )r~   r   r   r   )T)r   rY   r   rw   rH   )r/   r0   r1   r&   staticmethodr   r   no_gradr   rG   r3   r   r   r-   r   rw      s    rw   c                      s8   e Zd Zdddddejfd fddZdddZ  ZS )TinyViTBlock   rS   r=   r7   rL   r   rK   r   r|   window_size	mlp_ratior?   rm   r@   local_conv_sizer!   r"   r   r#   c
                   s   t || dkd t   t|| _|| _|| }
t||
|d||f| _t|| _	t
|||d|d || _t|t|| ||	|| _t|| _d S )Nr   z!dim must be divislbe by num_headsg      ?r   r6   )r   r%   r&   r   rK   r   rw   r   r   
drop_path1r   
local_convri   r   mlp
drop_path2)r,   rL   rK   r|   r   r   rm   r@   r   r!   head_dimr-   r   r   r&      s   


zTinyViTBlock.__init__r   r   c           	      C  s   | j \}}|j\}}}|}|||||}t|| j\}}| |dd}t|| j|||f}||||}|| | }|	dd
||||}| |}||||	dd}|| | | }|S )Nr   r6   )rK   r   r   r
   r   r   rQ   r   r   rO   r   r   r   r   )	r,   r   r   r   r   LCres_xpad_hwr   r   r   rG      s   

zTinyViTBlock.forward)rL   r   rK   r   r|   r   r   r   r   r?   rm   r?   r@   r?   r   r   r!   r"   r   r#   rH   rI   r   r   r-   r   r      s    r   c                      s:   e Zd Zddddddejfd! fddZd"dd Z  ZS )#
BasicLayerrS   r=   NFr7   rL   r   rK   r   rT   r|   r   r   r?   rm   r@   rU   rV   rW   rX   rY   r   r!   r"   r   r#   c                   sH   t    |
| _t f	ddt|D | _|	| _d S )Nc                   s6   g | ]}t ttr| n 	qS r   )r   r   r`   r[   	r!   rL   rm   r@   rK   r   r   r|   r   r   r   r_   (  s    z'BasicLayer.__init__.<locals>.<listcomp>)r%   r&   rX   r   ra   rb   rc   rV   )r,   rL   rK   rT   r|   r   r   rm   r@   rV   rX   r   r!   r-   r   r   r&     s   

zBasicLayer.__init__r   r   c                 C  rd   r   re   rf   r   r   r   rG   ;  rh   zBasicLayer.forward)rL   r   rK   r   rT   r   r|   r   r   r   r   r?   rm   r?   r@   rU   rV   rW   rX   rY   r   r   r!   r"   r   r#   rH   rI   r   r   r-   r   r     s    &r   c                      s^   e Zd ZdZdddddddd	d
d
dd	dejdfd0 fd"d#Zd1d&d'Zed2d3d.d/Z	  Z
S )4TinyViTa  TinyViT model, as described in https://arxiv.org/abs/2207.10666.

    Args:
        img_size: Size of input image.
        in_chans: Number of input image's channels.
        num_classes: Number of output classes.
        embed_dims: List of embedding dimensions.
        depths: List of block count for each downsampling stage
        num_heads: List of attention heads used in self-attention for each downsampling stage.
        window_sizes: List of self-attention's window size for each downsampling stage.
        mlp_ratio: Ratio of MLP dimension to embedding dimension in self-attention.
        drop_rate: Dropout rate.
        drop_path_rate: Stochastic depth rate.
        use_checkpoint: Whether to use activation checkpointing to trade compute for memory.
        mbconv_expand_ratio: Expansion ratio used in MBConv block.
        local_conv_size: Kernel size of convolution used in TinyViTBlock
        activation: activation function.
        mobile_same: Whether to use modifications for MobileSAM.

       r7   i  )`        i   r6   r6      r6   )r7   r         r   r   rz   r   rS   r=   Fimg_sizer   in_chansnum_classes
embed_dimsSequence[int]depthsr|   window_sizesr   r?   	drop_ratedrop_path_raterX   rY   mbconv_expand_ratior   r!   r"   
mobile_samr   r#   c                   s  t    || _|| _|  |r3g d}ttj|d ddddtdtjdddddddtd| _ng d}d | _t	||d	 || _
|d
 }dd td	|
t|D }t|}g }tt|||||D ]f\}\}}}}}|t|d t|d  }||d k rt|||||nd }|||t|d | t|d |d   |||d}|d	krtdd|i|}ntd|||||	|d|}|| || }qetj| | _|| _t|d | _t|d || _d S )N)r6   r6   r   r   r      r   F)r$   r7   )r6   r6   r6   r   r      c                 S  s   g | ]}|  qS r   )item)r\   r   r   r   r   r_     s    z$TinyViT.__init__.<locals>.<listcomp>)rL   rT   r@   rV   rX   r!   rZ   )rK   r|   r   r   rm   r   r   )r%   r&   r   r   r   r8   r'   r	   neckr4   patch_embedr   linspacesumlen	enumeratezipminrJ   rR   r   appendlayers	feat_sizern   	norm_headrp   head)r,   r   r   r   r   r   r|   r   r   r   r   rX   r   r   r!   r   stridesrK   dprn_layersr   i_layerr5   rT   num_heads_ir   r   rM   rV   kwargslayerr-   r   r   r&   Y  sh   
&	
	
zTinyViT.__init__r   r   c                 C  sf   |  |}| |}| jr$|d| j| jfdddd}| |}|S |d}| | 	|}|S )zUClassify images if ``mobile_sam=False``, produce feature maps if ``mobile_sam=True``.r   r   r7   r6   )
r   r   r   rP   r   r   r   meanr   r   rF   r   r   r   rG     s   

 

zTinyViT.forwardvariantstr
pretrained
bool | strr   r   c                 K  s*   t | dv d tttd|  |fi |S )a1  Create a TinyViT model from pre-defined variants.

        Args:
            variant: TinyViT variant. Possible values: ``'5m'``, ``'11m'``, ``'21m'``.
            pretrained: whether to use pre-trained weights. Possible values: ``False``, ``True``, ``'in22k'``,
                ``'in1k'``. For TinyViT-21M (``variant='21m'``), ``'in1k_384'``, ``'in1k_512'`` are also available.
            **kwargs: other keyword arguments that will be passed to :class:`TinyViT`.

        .. note::
            When ``img_size`` is different from the pre-trained size, bicubic interpolation will be performed on
            attention biases. When using ``pretrained=True``, ImageNet-1k checkpoint (``'in1k'``) is used.
            For feature extraction or fine-tuning, ImageNet-22k checkpoint (``'in22k'``) is preferred.

        )5m11m21mz+Only variant 5m, 11m, and 21m are supported)r   _tiny_vit_5m_tiny_vit_11m_tiny_vit_21m)r   r   r   r   r   r   from_config  s   zTinyViT.from_config) r   r   r   r   r   r   r   r   r   r   r|   r   r   r   r   r?   r   r?   r   r?   rX   rY   r   r?   r   r   r!   r"   r   rY   r   r#   rH   F)r   r   r   r   r   r   r   r   )r/   r0   r1   __doc__r   r;   r&   rG   r   r   r3   r   r   r-   r   r   C  s*    
Yr   modelurlr   c                 C  s(  |   }tj|}d|v r|d }dd | D }|D ]I}|| j\}}|| j\}}	t||kd| d|  ||	krft|d }
t|	d }|| d||
|
}t	j
|||fdd	}|||	||< q|d
 jd | jjkrd}tj|dd t| jj|d
< t| jj|d< | | | S )Nr  c                 S  s   g | ]}d |v r|qS )r   r   )r\   r   r   r   r   r_     s    z$_load_pretrained.<locals>.<listcomp>zFail to load z/. Pre-trained checkpoint should have num_heads=g      ?r   bicubic)sizer   zhead.weightr   zaNumber of classes does not match pre-trained checkpoint's. Resetting classification head to zeros)
stacklevelz	head.bias)
state_dictr   hubload_state_dict_from_urlr   r   r   r   r   Finterpolater   rl   warningswarn
zeros_likeweightr$   load_state_dict)r  r  model_state_dictr  ab_keysr   n_heads1L1n_heads2L2S1S2r   msgr   r   r   _load_pretrained  s.   
r  Fr   r   r   r   c                 K  V   t dg dg dg dg ddd|}| r)| du rd} d	d
d|  }t||}|S )N)@         i@  r   )r6   r      
   r   r=   r   r   r|   r   r   Tin1kzchttps://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_5m_22k_distill.pthzghttps://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_5m_22kto1k_distill.pthin22kr"  r   r   r  r   r   r  r  r   r   r   r     &   	
r   c                 K  r  )N)r  r  r   i  r   )r6   r   rx   rz   r   g?r!  Tr"  zdhttps://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_11m_22k_distill.pthzhhttps://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_11m_22kto1k_distill.pthr#  r   r%  r&  r   r   r   r     r'  r   c                 K  s~   t dg dg dg dg ddd|}| r=| du r/d} |d	d
}|dkr)d} |dkr/d} ddddd|  }t||}|S )N)r   r   r   i@  r   )r7   r   r      r   g?r!  Tr"  r   r   r   in1k_384i   in1k_512zdhttps://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_21m_22k_distill.pthzhhttps://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_21m_22kto1k_distill.pthzlhttps://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_21m_22kto1k_384_distill.pthzlhttps://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_21m_22kto1k_512_distill.pth)r$  r"  r)  r*  r   )r   getr  )r   r   r  r   r  r   r   r   r   (  s4   	
r   )r   r   r   r   )r  r   r  r   r   r   r   )r   r   r   r   r   r   )(
__future__r   r  typingr   r   r   r   torch.nn.functionalr   
functionalr
  torch.utilsr   kornia.contrib.models.commonr   r	   r
   r   kornia.corer   r   kornia.core.checkr   r   r8   r   r4   r<   rJ   rR   ri   rw   r   r   r   r  r   r   r   r   r   r   r   <module>   s4   
 C./ 
!