""" MobileViT

Paper:
V1: `MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer` - https://arxiv.org/abs/2110.02178
V2: `Separable Self-attention for Mobile Vision Transformers` - https://arxiv.org/abs/2206.02680

MobileVitBlock and checkpoints adapted from https://github.com/apple/ml-cvnets (original copyright below)
License: https://github.com/apple/ml-cvnets/blob/main/LICENSE (Apple open source)

Rest of code, ByobNet, and Transformer block hacked together by / Copyright 2022, Ross Wightman
    N)CallableTupleOptional)nn)	to_2tuplemake_divisible
GroupNorm1ConvMlpDropPathis_exportable   )build_model_with_cfg)register_notrace_module)register_modelgenerate_default_cfgsregister_model_deprecations)register_blockByoBlockCfgByoModelCfgByobNetLayerFn
num_groups)Block      @c                 C   s   t d| ||d|tddddS )Nbottler   T)	bottle_in
linear_out)typedcsgsbrblock_kwargs)r   dictr   r   r    r"    r&   I/home/ubuntu/.local/lib/python3.10/site-packages/timm/models/mobilevit.py_inverted_residual_block!   s   
r(      c                 C   s,   t | |||dtdd|dt|||ddfS )Nr%   	mobilevitr   )transformer_dimtransformer_depth
patch_size)r   r   r   r    r#   r(   r   r$   r   r   r    r+   r,   r-   r"   r&   r&   r'   _mobilevit_block(   s   r0             @      ?c                 C   s.   t | |||dtdd|d|dt||ddfS )Nr%   
mobilevit2r   )r,   r-   )r   r   r   r    r"   r!   r#   r.   )r   r   r    r,   r-   r"   transformer_brr&   r&   r'   _mobilevitv2_block6   s   r6         ?c                    s   d} dkrt  fdd|D }ttd|d dddtd	|d d	ddtd|d	 d	d	d
td|d d	dd
td|d d	dd
ftd  ddddd}|S )N)@           i   r7   c                    s   g | ]}t |  qS r&   )int).0r   
multiplierr&   r'   
<listcomp>F   s    z$_mobilevitv2_cfg.<locals>.<listcomp>r   r   r2   r%   r1   )r   r   r    r,      r)       3x3 silu)blocksstem_chs	stem_type	stem_pool
downsample	act_layer)tupler   r(   r6   r<   )r?   chscfgr&   r>   r'   _mobilevitv2_cfgC   s"   
rO      r%   rA      0   r8   r/   P   `   rC   rD   rE   i@  )rF   rG   rH   rI   rJ   rK   num_featuresrB   )r   r   r    )r   r   r    r+   r,   r-   x      r;   r9            i  seg      ?)rd_ratio)rF   rG   rH   rI   rJ   
attn_layerattn_kwargsrU   g      ?g      ?g      ?g      ?)mobilevit_xxsmobilevit_xsmobilevit_ssemobilevit_smobilevitv2_050mobilevitv2_075mobilevitv2_125mobilevitv2_100mobilevitv2_150mobilevitv2_175mobilevitv2_200c                %       s   e Zd ZdZdddddddddd	d
dddddejfdedee dedededee de	eef dedee dededededede
dededef$ fdd Zd!ejd"ejfd#d$Z  ZS )%MobileVitBlockzS MobileViT block
        Paper: https://arxiv.org/abs/2110.02178?context=cs.LG
    """
    def __init__(
            self,
            in_chs: int,
            out_chs: Optional[int] = None,
            kernel_size: int = 3,
            stride: int = 1,
            bottle_ratio: float = 1.0,
            group_size: Optional[int] = None,
            dilation: Tuple[int, int] = (1, 1),
            mlp_ratio: float = 2.0,
            transformer_dim: Optional[int] = None,
            transformer_depth: int = 2,
            patch_size: int = 8,
            num_heads: int = 4,
            attn_drop: float = 0.,
            drop: float = 0.,
            no_fusion: bool = False,
            drop_path_rate: float = 0.,
            layers: LayerFn = None,
            transformer_norm_layer: Callable = nn.LayerNorm,
            **kwargs,  # eat unused args
    ):
        super(MobileVitBlock, self).__init__()

        layers = layers or LayerFn()
        groups = num_groups(group_size, in_chs)
        out_chs = out_chs or in_chs
        transformer_dim = transformer_dim or make_divisible(bottle_ratio * in_chs)

        # Local (convolutional) representation
        self.conv_kxk = layers.conv_norm_act(
            in_chs, in_chs, kernel_size=kernel_size,
            stride=stride, groups=groups, dilation=dilation[0])
        self.conv_1x1 = nn.Conv2d(in_chs, transformer_dim, kernel_size=1, bias=False)

        # Global (transformer) representation
        self.transformer = nn.Sequential(*[
            TransformerBlock(
                transformer_dim,
                mlp_ratio=mlp_ratio,
                num_heads=num_heads,
                qkv_bias=True,
                attn_drop=attn_drop,
                proj_drop=drop,
                drop_path=drop_path_rate,
                act_layer=layers.act,
                norm_layer=transformer_norm_layer,
            )
            for _ in range(transformer_depth)
        ])
        self.norm = transformer_norm_layer(transformer_dim)

        self.conv_proj = layers.conv_norm_act(transformer_dim, out_chs, kernel_size=1, stride=1)

        if no_fusion:
            self.conv_fusion = None
        else:
            self.conv_fusion = layers.conv_norm_act(in_chs + out_chs, out_chs, kernel_size=kernel_size, stride=1)

        self.patch_size = to_2tuple(patch_size)
        self.patch_area = self.patch_size[0] * self.patch_size[1]

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        shortcut = x

        # Local representation
        x = self.conv_kxk(x)
        x = self.conv_1x1(x)

        # Unfold (feature map -> patches)
        patch_h, patch_w = self.patch_size
        B, C, H, W = x.shape
        new_h, new_w = math.ceil(H / patch_h) * patch_h, math.ceil(W / patch_w) * patch_w
        num_patch_h, num_patch_w = new_h // patch_h, new_w // patch_w  # n_h, n_w
        num_patches = num_patch_h * num_patch_w  # N
        interpolate = False
        if new_h != H or new_w != W:
            # Note: Padding can be done, but then it needs to be handled in the attention function.
            x = F.interpolate(x, size=(new_h, new_w), mode="bilinear", align_corners=False)
            interpolate = True

        # [B, C, H, W] --> [B * C * n_h, p_h, n_w, p_w] --> [B * C * n_h, n_w, p_h, p_w]
        x = x.reshape(B * C * num_patch_h, patch_h, num_patch_w, patch_w).transpose(1, 2)
        # [B * C * n_h, n_w, p_h, p_w] --> [B * P, N, C] where P = p_h * p_w and N = n_h * n_w
        x = x.reshape(B, C, num_patches, self.patch_area).transpose(1, 3).reshape(B * self.patch_area, num_patches, -1)

        # Global representations
        x = self.transformer(x)
        x = self.norm(x)

        # Fold (patches -> feature map)
        # [B * P, N, C] --> [B, P, N, C] --> [B * C * n_h, n_w, p_h, p_w]
        x = x.contiguous().view(B, self.patch_area, num_patches, -1)
        x = x.transpose(1, 3).reshape(B * C * num_patch_h, num_patch_w, patch_h, patch_w)
        # [B * C * n_h, n_w, p_h, p_w] --> [B * C * n_h, p_h, n_w, p_w] --> [B, C, H, W]
        x = x.transpose(1, 2).reshape(B, C, num_patch_h * patch_h, num_patch_w * patch_w)
        if interpolate:
            x = F.interpolate(x, size=(H, W), mode="bilinear", align_corners=False)

        x = self.conv_proj(x)
        if self.conv_fusion is not None:
            x = self.conv_fusion(torch.cat((shortcut, x), dim=1))
        return x


class LinearSelfAttention(nn.Module):
    """
    This layer applies a self-attention with linear complexity, as described in `https://arxiv.org/abs/2206.02680`
    This layer can be used for self- as well as cross-attention.
    Args:
        embed_dim (int): :math:`C` from an expected input of size :math:`(N, C, H, W)`
        attn_drop (float): Dropout value for context scores. Default: 0.0
        bias (bool): Use bias in learnable layers. Default: True
    Shape:
        - Input: :math:`(B, C, P, N)` where :math:`B` is the batch size, :math:`C` is the input channels,
        :math:`P` is the number of pixels in the patch, and :math:`N` is the number of patches
        - Output: same as the input
    .. note::
        For MobileViTv2, we unfold the feature map [B, C, H, W] into [B, C, P, N] where P is the number of pixels
        in a patch and N is the number of patches. Because channel is the first dimension in this unfolded tensor,
        we use point-wise convolution (instead of a linear layer). This avoids a transpose operation (which may be
        expensive on resource-constrained devices) that may be required to convert the unfolded tensor from
        channel-first to channel-last format in case of a linear layer.
    """

    def __init__(
            self,
            embed_dim: int,
            attn_drop: float = 0.0,
            proj_drop: float = 0.0,
            bias: bool = True,
    ) -> None:
        super().__init__()
        self.embed_dim = embed_dim

        self.qkv_proj = nn.Conv2d(
            in_channels=embed_dim,
            out_channels=1 + (2 * embed_dim),
            bias=bias,
            kernel_size=1,
        )
        self.attn_drop = nn.Dropout(attn_drop)
        self.out_proj = nn.Conv2d(
            in_channels=embed_dim,
            out_channels=embed_dim,
            bias=bias,
            kernel_size=1,
        )
        self.out_drop = nn.Dropout(proj_drop)

    def _forward_self_attn(self, x: torch.Tensor) -> torch.Tensor:
        # [B, C, P, N] --> [B, 1 + 2d, P, N]
        qkv = self.qkv_proj(x)

        # Project x into query, key and value
        # query --> [B, 1, P, N], key, value --> [B, d, P, N]
        query, key, value = qkv.split([1, self.embed_dim, self.embed_dim], dim=1)

        # apply softmax along N dimension
        context_scores = F.softmax(query, dim=-1)
        context_scores = self.attn_drop(context_scores)

        # Compute context vector
        # [B, d, P, N] x [B, 1, P, N] --> [B, d, P, N] --> [B, d, P, 1]
        context_vector = (key * context_scores).sum(dim=-1, keepdim=True)

        # combine context vector with values
        # [B, d, P, N] * [B, d, P, 1] --> [B, d, P, N]
        out = F.relu(value) * context_vector.expand_as(value)
        out = self.out_proj(out)
        out = self.out_drop(out)
        return out

    @torch.jit.ignore()
    def _forward_cross_attn(self, x: torch.Tensor, x_prev: Optional[torch.Tensor] = None) -> torch.Tensor:
        # x --> [B, C, P, N], x_prev --> [B, C, P, M]
        batch_size, in_dim, kv_patch_area, kv_num_patches = x.shape
        q_patch_area, q_num_patches = x_prev.shape[-2:]

        assert (
            kv_patch_area == q_patch_area
        ), "The number of pixels in a patch for query and key_value should be the same"

        # compute query and key from x_prev
        # [B, C, P, M] --> [B, 1 + d, P, M]
        qk = F.conv2d(
            x_prev,
            weight=self.qkv_proj.weight[:self.embed_dim + 1],
            bias=self.qkv_proj.bias[:self.embed_dim + 1],
        )
        # [B, 1 + d, P, M] --> [B, 1, P, M], [B, d, P, M]
        query, key = qk.split([1, self.embed_dim], dim=1)

        # compute value from x
        # [B, C, P, N] --> [B, d, P, N]
        value = F.conv2d(
            x,
            weight=self.qkv_proj.weight[self.embed_dim + 1:],
            bias=self.qkv_proj.bias[self.embed_dim + 1:] if self.qkv_proj.bias is not None else None,
        )

        # apply softmax along M dimension
        context_scores = F.softmax(query, dim=-1)
        context_scores = self.attn_drop(context_scores)

        # compute context vector
        # [B, d, P, M] * [B, 1, P, M] --> [B, d, P, M] --> [B, d, P, 1]
        context_vector = (key * context_scores).sum(dim=-1, keepdim=True)

        # combine context vector with values
        # [B, d, P, N] * [B, d, P, 1] --> [B, d, P, N]
        out = F.relu(value) * context_vector.expand_as(value)
        out = self.out_proj(out)
        out = self.out_drop(out)
        return out

    def forward(self, x: torch.Tensor, x_prev: Optional[torch.Tensor] = None) -> torch.Tensor:
        if x_prev is None:
            return self._forward_self_attn(x)
        else:
            return self._forward_cross_attn(x, x_prev=x_prev)


class LinearTransformerBlock(nn.Module):
    """
    This class defines the pre-norm transformer encoder with linear self-attention in the
    `MobileViTv2 paper <https://arxiv.org/abs/2206.02680>`_
    Args:
        embed_dim (int): :math:`C_{in}` from an expected input of size :math:`(B, C_{in}, P, N)`
        mlp_ratio (float): Inner dimension ratio of the FFN relative to embed_dim
        drop (float): Dropout rate. Default: 0.0
        attn_drop (float): Dropout rate for attention in multi-head attention. Default: 0.0
        drop_path (float): Stochastic depth rate. Default: 0.0
        norm_layer (Callable): Normalization layer. Default: layer_norm_2d
    Shape:
        - Input: :math:`(B, C_{in}, P, N)` where :math:`B` is batch size, :math:`C_{in}` is input embedding dim,
            :math:`P` is number of pixels in a patch, and :math:`N` is number of patches,
        - Output: same shape as the input
    """

    def __init__(
            self,
            embed_dim: int,
            mlp_ratio: float = 2.0,
            drop: float = 0.0,
            attn_drop: float = 0.0,
            drop_path: float = 0.0,
            act_layer=None,
            norm_layer=None,
    ) -> None:
        super().__init__()
        act_layer = act_layer or nn.SiLU
        norm_layer = norm_layer or GroupNorm1

        self.norm1 = norm_layer(embed_dim)
        self.attn = LinearSelfAttention(embed_dim=embed_dim, attn_drop=attn_drop, proj_drop=drop)
        self.drop_path1 = DropPath(drop_path)

        self.norm2 = norm_layer(embed_dim)
        self.mlp = ConvMlp(
            in_features=embed_dim,
            hidden_features=int(embed_dim * mlp_ratio),
            act_layer=act_layer,
            drop=drop)
        self.drop_path2 = DropPath(drop_path)

    def forward(self, x: torch.Tensor, x_prev: Optional[torch.Tensor] = None) -> torch.Tensor:
        if x_prev is None:
            # self-attention
            x = x + self.drop_path1(self.attn(self.norm1(x)))
        else:
            # cross-attention
            res = x
            x = self.norm1(x)  # norm
            x = self.attn(x, x_prev)  # attn
            x = self.drop_path1(x) + res  # residual

        # Feed forward network
        x = x + self.drop_path2(self.mlp(self.norm2(x)))
        return x


@register_notrace_module
class MobileVitV2Block(nn.Module):
    """
    This class defines the `MobileViTv2 block <https://arxiv.org/abs/2206.02680>`_
    """

    def __init__(
            self,
            in_chs: int,
            out_chs: Optional[int] = None,
            kernel_size: int = 3,
            bottle_ratio: float = 1.0,
            group_size: Optional[int] = 1,
            dilation: Tuple[int, int] = (1, 1),
            mlp_ratio: float = 2.0,
            transformer_dim: Optional[int] = None,
            transformer_depth: int = 2,
            patch_size: int = 8,
            attn_drop: float = 0.,
            drop: float = 0.,
            drop_path_rate: float = 0.,
            layers: LayerFn = None,
            transformer_norm_layer: Callable = GroupNorm1,
            **kwargs,  # eat unused args
    ):
        super(MobileVitV2Block, self).__init__()
        layers = layers or LayerFn()
        groups = num_groups(group_size, in_chs)
        out_chs = out_chs or in_chs
        transformer_dim = transformer_dim or make_divisible(bottle_ratio * in_chs)

        # Local (convolutional) representation
        self.conv_kxk = layers.conv_norm_act(
            in_chs, in_chs, kernel_size=kernel_size,
            stride=1, groups=groups, dilation=dilation[0])
        self.conv_1x1 = nn.Conv2d(in_chs, transformer_dim, kernel_size=1, bias=False)

        # Global representation with separable (linear) self-attention
        self.transformer = nn.Sequential(*[
            LinearTransformerBlock(
                transformer_dim,
                mlp_ratio=mlp_ratio,
                attn_drop=attn_drop,
                drop=drop,
                drop_path=drop_path_rate,
                act_layer=layers.act,
                norm_layer=transformer_norm_layer,
            )
            for _ in range(transformer_depth)
        ])
        self.norm = transformer_norm_layer(transformer_dim)

        self.conv_proj = layers.conv_norm_act(transformer_dim, out_chs, kernel_size=1, stride=1, apply_act=False)

        self.patch_size = to_2tuple(patch_size)
        self.patch_area = self.patch_size[0] * self.patch_size[1]
        self.coreml_exportable = is_exportable()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, C, H, W = x.shape
        patch_h, patch_w = self.patch_size
        new_h, new_w = math.ceil(H / patch_h) * patch_h, math.ceil(W / patch_w) * patch_w
        num_patch_h, num_patch_w = new_h // patch_h, new_w // patch_w  # n_h, n_w
        num_patches = num_patch_h * num_patch_w  # N
        if new_h != H or new_w != W:
            x = F.interpolate(x, size=(new_h, new_w), mode="bilinear", align_corners=True)

        # Local representation
        x = self.conv_kxk(x)
        x = self.conv_1x1(x)

        # Unfold (feature map -> patches), [B, C, H, W] --> [B, C, P, N]
        C = x.shape[1]
        if self.coreml_exportable:
            x = F.unfold(x, kernel_size=(patch_h, patch_w), stride=(patch_h, patch_w))
        else:
            x = x.reshape(B, C, num_patch_h, patch_h, num_patch_w, patch_w).permute(0, 1, 3, 5, 2, 4)
        x = x.reshape(B, C, -1, num_patches)

        # Global representations
        x = self.transformer(x)
        x = self.norm(x)

        # Fold (patches -> feature map), [B, C, P, N] --> [B, C, H, W]
        if self.coreml_exportable:
            # adopted from https://github.com/apple/ml-cvnets/blob/main/cvnets/modules/mobilevit_block.py#L609-L624
            x = x.reshape(B, C * patch_h * patch_w, num_patch_h, num_patch_w)
            x = F.pixel_shuffle(x, upscale_factor=patch_h)
        else:
            x = x.reshape(B, C, patch_h, patch_w, num_patch_h, num_patch_w).permute(0, 1, 4, 2, 5, 3)
            x = x.reshape(B, C, num_patch_h * patch_h, num_patch_w * patch_w)

        x = self.conv_proj(x)
        return x


register_block('mobilevit', MobileVitBlock)
register_block('mobilevit2', MobileVitV2Block)


def _create_mobilevit(variant, cfg_variant=None, pretrained=False, **kwargs):
    return build_model_with_cfg(
        ByobNet, variant, pretrained,
        model_cfg=model_cfgs[variant] if not cfg_variant else model_cfgs[cfg_variant],
        feature_cfg=dict(flatten_sequential=True),
        **kwargs)


def _create_mobilevit2(variant, cfg_variant=None, pretrained=False, **kwargs):
    return build_model_with_cfg(
        ByobNet, variant, pretrained,
        model_cfg=model_cfgs[variant] if not cfg_variant else model_cfgs[cfg_variant],
        feature_cfg=dict(flatten_sequential=True),
        **kwargs)


def _cfg(url='', **kwargs):
    return {
        'url': url,
        'num_classes': 1000,
        'input_size': (3, 256, 256),
        'pool_size': (8, 8),
        'crop_pct': 0.9,
        'interpolation': 'bicubic',
        'mean': (0., 0., 0.),
        'std': (1., 1., 1.),
        'first_conv': 'stem.conv',
        'classifier': 'head.fc',
        'fixed_input_size': False,
        **kwargs
    }


default_cfgs = generate_default_cfgs({
    'mobilevit_xxs.cvnets_in1k': _cfg(hf_hub_id='timm/'),
    'mobilevit_xs.cvnets_in1k': _cfg(hf_hub_id='timm/'),
    'mobilevit_s.cvnets_in1k': _cfg(hf_hub_id='timm/'),

    'mobilevitv2_050.cvnets_in1k': _cfg(hf_hub_id='timm/', crop_pct=0.888),
    'mobilevitv2_075.cvnets_in1k': _cfg(hf_hub_id='timm/', crop_pct=0.888),
    'mobilevitv2_100.cvnets_in1k': _cfg(hf_hub_id='timm/', crop_pct=0.888),
    'mobilevitv2_125.cvnets_in1k': _cfg(hf_hub_id='timm/', crop_pct=0.888),
    'mobilevitv2_150.cvnets_in1k': _cfg(hf_hub_id='timm/', crop_pct=0.888),
    'mobilevitv2_175.cvnets_in1k': _cfg(hf_hub_id='timm/', crop_pct=0.888),
    'mobilevitv2_200.cvnets_in1k': _cfg(hf_hub_id='timm/', crop_pct=0.888),

    'mobilevitv2_150.cvnets_in22k_ft_in1k': _cfg(hf_hub_id='timm/', crop_pct=0.888),
    'mobilevitv2_175.cvnets_in22k_ft_in1k': _cfg(hf_hub_id='timm/', crop_pct=0.888),
    'mobilevitv2_200.cvnets_in22k_ft_in1k': _cfg(hf_hub_id='timm/', crop_pct=0.888),

    'mobilevitv2_150.cvnets_in22k_ft_in1k_384': _cfg(
        hf_hub_id='timm/', input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0),
    'mobilevitv2_175.cvnets_in22k_ft_in1k_384': _cfg(
        hf_hub_id='timm/', input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0),
    'mobilevitv2_200.cvnets_in22k_ft_in1k_384': _cfg(
        hf_hub_id='timm/', input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0),
})


@register_model
def mobilevit_xxs(pretrained=False, **kwargs) -> ByobNet:
    return _create_mobilevit('mobilevit_xxs', pretrained=pretrained, **kwargs)


@register_model
def mobilevit_xs(pretrained=False, **kwargs) -> ByobNet:
    return _create_mobilevit('mobilevit_xs', pretrained=pretrained, **kwargs)


@register_model
def mobilevit_s(pretrained=False, **kwargs) -> ByobNet:
    return _create_mobilevit('mobilevit_s', pretrained=pretrained, **kwargs)


@register_model
def mobilevitv2_050(pretrained=False, **kwargs) -> ByobNet:
    return _create_mobilevit('mobilevitv2_050', pretrained=pretrained, **kwargs)


@register_model
def mobilevitv2_075(pretrained=False, **kwargs) -> ByobNet:
    return _create_mobilevit('mobilevitv2_075', pretrained=pretrained, **kwargs)


@register_model
def mobilevitv2_100(pretrained=False, **kwargs) -> ByobNet:
    return _create_mobilevit('mobilevitv2_100', pretrained=pretrained, **kwargs)


@register_model
def mobilevitv2_125(pretrained=False, **kwargs) -> ByobNet:
    return _create_mobilevit('mobilevitv2_125', pretrained=pretrained, **kwargs)


@register_model
def mobilevitv2_150(pretrained=False, **kwargs) -> ByobNet:
    return _create_mobilevit('mobilevitv2_150', pretrained=pretrained, **kwargs)


@register_model
def mobilevitv2_175(pretrained=False, **kwargs) -> ByobNet:
    return _create_mobilevit('mobilevitv2_175', pretrained=pretrained, **kwargs)


@register_model
def mobilevitv2_200(pretrained=False, **kwargs) -> ByobNet:
    return _create_mobilevit('mobilevitv2_200', pretrained=pretrained, **kwargs)


register_model_deprecations(__name__, {
    'mobilevitv2_150_in22ft1k': 'mobilevitv2_150.cvnets_in22k_ft_in1k',
    'mobilevitv2_175_in22ft1k': 'mobilevitv2_175.cvnets_in22k_ft_in1k',
    'mobilevitv2_200_in22ft1k': 'mobilevitv2_200.cvnets_in22k_ft_in1k',
    'mobilevitv2_150_384_in22ft1k': 'mobilevitv2_150.cvnets_in22k_ft_in1k_384',
    'mobilevitv2_175_384_in22ft1k': 'mobilevitv2_175.cvnets_in22k_ft_in1k_384',
    'mobilevitv2_200_384_in22ft1k': 'mobilevitv2_200.cvnets_in22k_ft_in1k_384',
})