o
    پi%F                     @   s  d Z ddlmZ ddlmZmZmZmZ ddlZddl	m
Z
 ddlmZmZ ddlmZmZmZmZmZ ddlmZ dd	lmZmZmZ dd
lmZmZ ddlmZmZ ddl m!Z! G dd de
j"Z#dvddZ$dwddZ%		dxdee&ej'f de!de&de(dee&ej'f f
ddZ)dydd Z*dzd"d#Z+ei d$e+d%d&dd'd(d)e+d*d&d'd+d,dd-d.e+d/d&dd0d1e+d2d&d+d,dd3d4e+ d5e+d6d&d+d,d7d8e+d9d&dd0d:e+d;d&d+d,dd3d<e+d=d&d>d?d'dd@dAe+dBd&d>d?ddCdDe+d&dd?dEdFe+dGd&d>d?ddCdHe+eedIdJdKe+eedIdJdLe+eedIdJdMe+eedIdJdNe+dOdPdQdRdSdTdUdVe+dWdXdQdRdSdTdUiZ,ed{de!fdYdZZ-ed{de!fd[d\Z.ed{de!fd]d^Z/ed{de!fd_d`Z0ed{de!fdadbZ1ed{de!fdcddZ2ed{de!fdedfZ3ed{de!fdgdhZ4ed{de!fdidjZ5ed{de!fdkdlZ6ed{de!fdmdnZ7ed{de!fdodpZ8ed{de!fdqdrZ9ed{de!fdsdtZ:ee;d<dAdDdDdFd5du dS )|a   Hybrid Vision Transformer (ViT) in PyTorch

A PyTorch implement of the Hybrid Vision Transformers as described in:

'An Image Is Worth 16 x 16 Words: Transformers for Image Recognition at Scale'
    - https://arxiv.org/abs/2010.11929

`How to train your ViT? Data, Augmentation, and Regularization in Vision Transformers`
    - https://arxiv.org/abs/2106.10270

NOTE These hybrid model definitions depend on code in vision_transformer.py.
They were moved here to keep file sizes sane.

Hacked together by / Copyright 2020, Ross Wightman
    )partial)DictTupleTypeUnionN)IMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)StdConv2dSame	StdConv2dConvNormAct	to_ntupleHybridEmbed   )build_model_with_cfg)generate_default_cfgsregister_modelregister_model_deprecations)	resnet26d	resnet50d)ResNetV2create_resnetv2_stem)VisionTransformerc                       s   e Zd Zddddddejejfdededeeeedf f d	eeeedf f d
eeeedf f dee	eeedf f de
ej de
ej f fddZ  ZS )ConvStem   @   )   r   r    in_chansdepthchannels.kernel_sizestridepadding
norm_layer	act_layerc	                    s   t    t trt fddt|D d d d  t||}t||}|t|  kr>t|  kr>t ksAJ  J |}	tt D ]+}
|
t d k}| |
 t	|	 |
 ||
 ||
 ||
 || | ||d
  |
 }	qId S )Nc                    s   g | ]} d |  qS )r    ).0ir   r%   Y/home/ubuntu/.local/lib/python3.10/site-packages/timm/models/vision_transformer_hybrid.py
<listcomp>/   s    z%ConvStem.__init__.<locals>.<listcomp>r   )r    r!   r"   bias
apply_norm	apply_actr#   r$   )
super__init__
isinstanceinttupleranger   len
add_moduler   )selfr   r   r   r    r!   r"   r#   r$   in_chsr'   	last_conv	__class__r(   r)   r0   !   s.   

$2

zConvStem.__init__)__name__
__module____qualname__nnBatchNorm2dReLUr2   r   r   strr   Moduler0   __classcell__r%   r%   r:   r)   r       s4    	r   r      	   c              	   K   sz   | dd}|r
dnd}|rttddnttdd}t| r/t| dd| dd	d
||d}|S t| dd	|d
|d}|S )z ResNet-V2 backbone helperpadding_sameTsamer   g:0yE>)epsr   r   r   F)layersnum_classesglobal_poolr   preact	stem_type
conv_layer)rO   rN   rP   )getr   r	   r
   r5   r   r   )rK   kwargsrH   rO   rP   backboner%   r%   r)   	_resnetv2G   s   rT   image_encoder.model.c                 C   s  i }|   D ]\}}||sq||d}|dd}|dd}|dd}|dd	}|d
d}|dd}|dd}|dd}|dd}|dd}|dd}|dd}|dkrid}|d}d|v r|dd}|dd}|j}t|jd ||< |||< q|S ) Nr   z
patch_emb.zpatch_embed.backbone.z
block.convconvz
block.normbnzpost_transformer_norm.znorm.zpre_norm_mha.0norm1zpre_norm_mha.1attnzpre_norm_ffn.0norm2zpre_norm_ffn.1zmlp.fc1zpre_norm_ffn.4zmlp.fc2z	qkv_proj.zqkv.z	out_proj.zproj.ztransformer.zblocks.zpos_embed.pos_embed.pos_embed	pos_embedr   zclassifier.projz	head.biaszhead.weight)items
startswithreplacesqueezeTtorchzerosshape)
state_dictmodelprefixoutkvbias_kr%   r%   r)   _convert_mobileclipV   s6   


rk   bicubicTrd   re   interpolation	antialiasreturnc                 C   s.   ddl m} d| v rt| |} || |||dS )Nr   )checkpoint_filter_fnz1image_encoder.model.patch_emb.0.block.conv.weight)rm   rn   )vision_transformerrp   rk   )rd   re   rm   rn   
_filter_fnr%   r%   r)   rp   t   s   
rp   Fc                 K   sb   | dd}|p	i }ttfd|i|}|d| |dd tt| |ftt|ddd	|S )
Nout_indicesr   rS   embed_layer
patch_sizer   getter)rs   feature_cls)pretrained_filter_fnfeature_cfg)popr   r   
setdefaultr   r   rp   dict)variantrS   
embed_args
pretrainedrR   rs   rt   r%   r%   r)   !_create_vision_transformer_hybrid   s   
r   r   c                 K   s    | ddd dddddddd	|S )
Ni  )r      r   ?rl   T)      ?r   r   zpatch_embed.backbone.stem.convhead)urlrL   
input_size	pool_sizecrop_pctrm   fixed_input_sizemeanstd
first_conv
classifierr%   )r   rR   r%   r%   r)   _cfg   s   r   z*vit_tiny_r_s16_p8_224.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/R_Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_224.npzztimm/zpatch_embed.backbone.conv)r   	hf_hub_idcustom_loadr   z*vit_tiny_r_s16_p8_384.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/R_Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz)r     r         ?)r   r   r   r   r   r   z*vit_small_r26_s32_224.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/R26_S_32-i21k-300ep-lr_0.001-aug_light0-wd_0.03-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.03-res_224.npz)r   r   r   z*vit_small_r26_s32_384.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/R26_S_32-i21k-300ep-lr_0.001-aug_medium2-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz)r   r   r   r   r   zvit_base_r26_s32_224.untrainedz'vit_base_r50_s16_384.orig_in21k_ft_in1kzthttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_resnet50_384-9fd3c705.pth)r   r   r   r   z*vit_large_r50_s32_224.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/R50_L_32-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_224.npzz*vit_large_r50_s32_384.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/R50_L_32-i21k-300ep-lr_0.001-aug_medium2-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_384.npzz"vit_tiny_r_s16_p8_224.augreg_in21kzohttps://storage.googleapis.com/vit_models/augreg/R_Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0.npziSU  r   )r   r   rL   r   r   r   z"vit_small_r26_s32_224.augreg_in21kzshttps://storage.googleapis.com/vit_models/augreg/R26_S_32-i21k-300ep-lr_0.001-aug_medium2-wd_0.03-do_0.0-sd_0.0.npz)r   r   rL   r   r   zvit_base_r50_s16_224.orig_in21k)r   rL   r   z"vit_large_r50_s32_224.augreg_in21kzrhttps://storage.googleapis.com/vit_models/augreg/R50_L_32-i21k-300ep-lr_0.001-aug_medium2-wd_0.1-do_0.0-sd_0.0.npzz!vit_small_resnet26d_224.untrainedzpatch_embed.backbone.conv1.0)r   r   r   z%vit_small_resnet50d_s16_224.untrainedz vit_base_resnet26d_224.untrainedz vit_base_resnet50d_224.untrainedzvit_base_mci_224.apple_mclip_ltzapple/mobileclip_b_lt_timmzYhttps://docs-assets.developer.apple.com/ml-research/datasets/mobileclip/mobileclip_blt.pti   )        r   r   )r   r   r   zpatch_embed.backbone.0.conv)r   r   rL   r   r   r   zvit_base_mci_224.apple_mclipzapple/mobileclip_b_timmzWhttps://docs-assets.developer.apple.com/ml-research/datasets/mobileclip/mobileclip_b.ptc                 K   H   t dddi|}tddddd}t	d|| d	t|fi |}|S )z3 R+ViT-Ti/S16 w/ 8x8 patch hybrid @ 224 x 224.
    rK   r%            r   ru   	embed_dimr   	num_headsvit_tiny_r_s16_p8_224rS   r   N)r   rT   r|   r   r   rR   rS   
model_argsre   r%   r%   r)   r         r   c                 K   r   )z3 R+ViT-Ti/S16 w/ 8x8 patch hybrid @ 384 x 384.
    rK   r%   r   r   r   r   r   vit_tiny_r_s16_p8_384r   N)r   r   r   r%   r%   r)   r      r   r   c                 K   B   t d	i |}tdddd}t	d
|| dt|fi |}|S ) R26+ViT-S/S32 hybrid.
    r   r   r   r   r   r      r   r   r   vit_small_r26_s32_224r   Nr   )r   r   r   r%   r%   r)   r        r   c                 K   r   )r   r   r   r   r   r   vit_small_r26_s32_384r   Nr   )r   r   r   r%   r%   r)   r     r   r   c                 K   B   t di |}tdddd}t	d	|| dt|fi |}|S )
z R26+ViT-B/S32 hybrid.
    r      r   r   vit_base_r26_s32_224r   Nr   )r   r   r   r%   r%   r)   r     r   r   c                 K   r   )
zR R50+ViT-B/S16 hybrid from original paper (https://arxiv.org/abs/2010.11929).
    rE   r   r   r   vit_base_r50_s16_224r   NrE   )r   r   r   r%   r%   r)   r   #  r   r   c                 K   r   )
z R50+ViT-B/16 hybrid from original paper (https://arxiv.org/abs/2010.11929).
    ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer.
    rE   r   r   r   vit_base_r50_s16_384r   Nr   )r   r   r   r%   r%   r)   r   .  s   r   c                 K   r   ) R50+ViT-L/S32 hybrid.
    r   rF   r   r            r   vit_large_r50_s32_224r   Nr   )r   r   r   r%   r%   r)   r   :  r   r   c                 K   r   )r   r   r   r   r   r   vit_large_r50_s32_384r   Nr   )r   r   r   r%   r%   r)   r   E  r   r   c                 K   sP   t | |ddddgd}tddddd}t		d|| d
t|fi |}|S )zL Custom ViT small hybrid w/ ResNet26D stride 32. No pretrained weights.
    r   r   TrF   r   r   features_onlyrs   r   r   r   r   r   	mlp_ratiovit_small_resnet26d_224r   N)r   r   rQ   r|   r   r   r%   r%   r)   r   P     r   c                 K   sP   t | |ddddgd}tddddd}t	d|| d	t|fi |}|S )zV Custom ViT small hybrid w/ ResNet50D 3-stages, stride 16. No pretrained weights.
    r   r   Tr   r   r   r   vit_small_resnet50d_s16_224r   N)r   r   rQ   r|   r   r   r%   r%   r)   r   [  r   r   c                 K   N   t | |ddddgd}tdddd}t		d|| d
t|fi |}|S )zK Custom ViT base hybrid w/ ResNet26D stride 32. No pretrained weights.
    r   r   TrF   r   r   r   r   vit_base_resnet26d_224r   N)r   r   r   r%   r%   r)   r   f     r   c                 K   r   )K Custom ViT base hybrid w/ ResNet50D stride 32. No pretrained weights.
    r   r   TrF   r   r   r   r   vit_base_resnet50d_224r   N)r   r   r   r%   r%   r)   r   q  r   r   c              	   K   s\   t dddd|ddtjd}tdddd	d
}t	d|tdd| dt|fi |}|S )r   )r   r   r   )rF   r   r   r   r   r   )r   r!   r    r"   r   r$   r   r   T)r   r   r   no_embed_classvit_base_mci_224F)proj)rS   r~   r   N)r   )r   rQ   r?   GELUr|   r   r   r%   r%   r)   r   |  s$   

r   )vit_tiny_r_s16_p8_224_in21kvit_small_r26_s32_224_in21kvit_base_r50_s16_224_in21kvit_base_resnet50_224_in21kvit_large_r50_s32_224_in21kvit_base_resnet50_384r   )rU   )rl   T)NF)r   )F)<__doc__	functoolsr   typingr   r   r   r   ra   torch.nnr?   	timm.datar   r   timm.layersr	   r
   r   r   r   _builderr   	_registryr   r   r   resnetr   r   resnetv2r   r   rq   r   
Sequentialr   rT   rk   rB   Tensorboolrp   r   r   default_cfgsr   r   r   r   r   r   r   r   r   r   r   r   r   r   r<   r%   r%   r%   r)   <module>   sJ   
'
!


%)-179;=@FO











