o
    پi                     @   sx  d Z ddlZddlmZ ddlmZ ddlmZmZ ddl	m
Z
mZmZmZ ddlZddlmZ ddlmZmZ dd	lmZmZmZmZmZ d
dlmZ d
dlmZ d
dlmZ d
dlm Z  d
dl!m"Z"m#Z# ddgZ$eG dd dZ%dd Z&G dd dej'Z(e	dedee) de*deej+eej+ f fddZ,e	dfde)deej+ deej+ee) f fdd Z-ed!ej+d"ej+de*d#ee) d$ee) d%ej+d&ej+fd'd(Z.G d)d* d*ej'Z/G d+d, d,ej'Z0G d-d. d.ej'Z1G d/d0 d0ej'Z2G d1d dej'Z3d2d3 Z4e5e%d4d5e%d6d5e%d7d5e%d8d9d:d;d<e%d6dd=e%d7dd=e%d8d9d:ddd>e%d?d@dAddd>dBZ6dgdCdDZ7dhdFdGZ8e#e8dHdIdJe8dKdIdJe8dLdIdJe8dMdIdJe8dEdNe8dOdIdPdQe8dRdIdPdQe8dSdIdPdQdTZ9e"dide3fdUdVZ:e"dide3fdWdXZ;e"dide3fdYdZZ<e"dide3fd[d\Z=e"dide3fd]d^Z>e"dide3fd_d`Z?e"dide3fdadbZ@e"dide3fdcddZAdS )ja   Multi-Scale Vision Transformer v2

@inproceedings{li2021improved,
  title={MViTv2: Improved multiscale vision transformers for classification and detection},
  author={Li, Yanghao and Wu, Chao-Yuan and Fan, Haoqi and Mangalam, Karttikeya and Xiong, Bo and Malik, Jitendra and Feichtenhofer, Christoph},
  booktitle={CVPR},
  year={2022}
}

Code adapted from original Apache 2.0 licensed impl at https://github.com/facebookresearch/mvit
Original copyright below.

Modifications and timm support by / Copyright 2022, Ross Wightman
    N)OrderedDict)	dataclass)partialreduce)UnionListTupleOptional)nnIMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)MlpDropPathtrunc_normal_tf_get_norm_layer	to_2tuple   )build_model_with_cfg)feature_take_indices)register_notrace_function)
checkpoint)register_modelgenerate_default_cfgsMultiScaleVitMultiScaleVitCfgc                   @   s  e Zd ZU dZeedf ed< dZeeeedf f ed< dZ	eeeedf f ed< dZ
eed	< d
Zeed< dZeed< dZeed< d
Zeed< d
Zeed< dZeed< dZeed< dZeeef ed< dZeeeeef   ed< dZeeeeef   ed< dZeeeef  ed< dZeeef ed< dZeeef ed< dZeeef ed< d Zeed!< d"Zeed#< d$Zeeeeef f ed%< d&Z eeeeef f ed'< d(Z!eed)< d*d+ Z"dS ),r            r   .depths`   	embed_dimr   	num_heads      @	mlp_ratioF
pool_firstTexpand_attnqkv_biasuse_cls_tokenuse_abs_posresidual_poolingconvmoder   r   
kernel_qkv)r   r   r   r   r1   r1   stride_qN	stride_kv   r5   stride_kv_adaptive   r8   patch_kernelpatch_stridepatch_paddingmax	pool_typespatialrel_pos_typegelu	act_layer	layernorm
norm_layergư>norm_epsc                    s  t j}tjttfstfddt|D _t j|ks$J tjttfs:tfddt|D _t j|ksCJ jd urj	d u rj g }t|D ]!t
j dkrp fddtt  D  |t  qVt|_	d S d S d S )Nc                 3       | ]
} j d |  V  qdS r   N)r"   .0iself F/home/ubuntu/.local/lib/python3.10/site-packages/timm/models/mvitv2.py	<genexpr>A       z1MultiScaleVitCfg.__post_init__.<locals>.<genexpr>c                 3   rE   rF   )r#   rG   rJ   rL   rM   rN   E   rO   r   c                    s(   g | ]}t  | j |  d qS r   )r<   r2   )rH   d
_stride_kvrI   rK   rL   rM   
<listcomp>M   s    z2MultiScaleVitCfg.__post_init__.<locals>.<listcomp>)lenr    
isinstancer"   tuplelistranger#   r6   r3   minr2   append)rK   
num_stagespool_kv_striderL   rR   rM   __post_init__>   s$   

zMultiScaleVitCfg.__post_init__)#__name__
__module____qualname__r    r   int__annotations__r"   r   r#   r%   floatr&   boolr'   r(   r)   r*   r+   r-   strr/   r2   r	   r3   r6   r9   r:   r;   r=   r?   rA   rC   rD   r^   rL   rL   rL   rM   r   $   s2   
 c                 C   s   t tj| dS )Nr   )r   operatormul)iterablerL   rL   rM   prodU   s   rj   c                       sH   e Zd ZdZ					d fdd	Zd	eejee	 f fd
dZ
  ZS )
PatchEmbedz
    PatchEmbed.
    r      r7   r4   r.   c                    s$   t    tj|||||d| _d S )N)kernel_sizestridepadding)super__init__r
   Conv2dproj)rK   dim_indim_outkernelrn   ro   	__class__rL   rM   rq   ^   s   
zPatchEmbed.__init__returnc                 C   s*   |  |}|ddd|jdd  fS )Nr   r   )rs   flatten	transposeshaperK   xrL   rL   rM   forwardp   s   
 zPatchEmbed.forward)r   rl   r7   r4   r.   )r_   r`   ra   __doc__rq   r   torchTensorr   rb   r   __classcell__rL   rL   rw   rM   rk   Y   s    $rk   T	feat_sizehas_cls_tokenry   c                 C   s   |\}}|r(| d d d d d dd d f | d d d d dd d d f }} nd }|  d||| jd dddd } | |fS )Nr   r   r   r   )reshaper}   permute
contiguous)r   r   r   HWcls_tokrL   rL   rM   reshape_pre_poolv   s   D&r   r#   r   c                 C   sh   | j d | j d g}| j d | j d  }| d|| j d |dd} |d ur0tj|| fdd} | |fS )Nr   r   r   r   dim)r}   r   r|   r   cat)r   r#   r   r   L_pooledrL   rL   rM   reshape_post_pool   s   r   attnqq_sizek_size	rel_pos_h	rel_pos_wc                 C   s  |rdnd}|\}}	|\}
}t |
| d}t ||
 d}tj||jdd| tj|
|jdd|  }||
d | 7 }t ||	 d}t |	| d}tj|	|jdd| tj||jdd|  }||d | 7 }||  }||  }|j\}}}}|dddd|df ||||	|}td||}td||}| dddd|d|df 	|d||	|
||d |d	 	|d||	 |
| | dddd|d|df< | S )
z1
    Spatial Relative Positional Embeddings.
    r   r   g      ?)devicer   Nzbyhwc,hkc->byhwkzbyhwc,wkc->byhwkrz   )
r<   r   aranger   	unsqueezelongr}   r   einsumview)r   r   r   r   r   r   r   sp_idxq_hq_wk_hk_w	q_h_ratio	k_h_ratiodist_h	q_w_ratio	k_w_ratiodist_wrel_hrel_wBn_headq_Nr   r_qrL   rL   rM   cal_rel_pos_type   s>   (.r   c                       H   e Zd Zddddddddddejf fdd	Zdee fd	d
Z  Z	S )MultiScaleAttentionPoolFirst   Tr,   r0   r>   c              	      s  t    || _|| _|| | _| jd | _|| _tdd |D }tdd |D }tj	|||d| _
tj	|||d| _tj	|||d| _t	||| _t|dkr[t|	dkr[d }t|dkrit|
dkrid }|| _|dk| _d\| _| _| _d\| _| _| _|d	v r|d
krtjntj}|r|||	|| _|r|||
|| _|||
|| _nT|dks|dkr|dkr|| n|}|rtj||||	||dd| _||| _|rtj||||
||dd| _||| _tj||||
||dd| _||| _ntd| || _| jdkr`|d |d ksJ |d }t|	dkr"||	d  n|}t|
dkr1||
d  n|}dt|| d }tt || j| _!tt || j| _"t#| j!dd t#| j"dd || _$d S )N      c                 S      g | ]}t |d  qS r   rb   rH   r   rL   rL   rM   rT          z9MultiScaleAttentionPoolFirst.__init__.<locals>.<listcomp>c                 S   r   r   r   rH   kvrL   rL   rM   rT      r   biasr   conv_unsharedNNNavgr<   r<   r,   Frn   ro   groupsr   Unsupported model r>   r   r   {Gz?std)%rp   rq   r#   ru   head_dimscaler   rW   r
   Linearr   kvrs   rj   r-   unsharedpool_qpool_kpool_vnorm_qnorm_knorm_v	MaxPool2d	AvgPool2drr   NotImplementedErrorr?   rU   r<   	Parameterr   zerosr   r   r   r+   rK   r   ru   r   r#   r(   r-   kernel_q	kernel_kvr2   r3   r   r?   r+   rC   	padding_q
padding_kvpool_opdim_convsizer   kv_size
rel_sp_dimrw   rL   rM   rq      s   



	
	
	
z%MultiScaleAttentionPoolFirst.__init__r   c                 C   s  |j \}}}| jrdn| j}||||ddddd}| } }}	| jd ur?t||| j\}}
| |}t|| j|
\}}n|}| j	d urK| 	|}| j
d urht||| j\}}| 
|}t|| j|\}}n|}| jd urt| |}| jd urt|	|| j\}	}| |	}	t|	| j|\}	}n|}| jd ur| |	}	|d |d  t| j }|dd||d}| |||| jddd}|d |d  t| j }|dd||d}| |||| jd}|d |d  t| j }|	dd||d}	| |	||| jddd}	|| j | }| jdkr+t||| j||| j| j}|jdd}||	 }| jr=|| }|dd|d| j}| |}||fS )Nr   r   r   r   r   r>   r   )r}   r   r#   r   r   r   r   r   r   r   r   r   r   r   rb   r|   r   r   r   r   r?   r   r   r   softmaxr+   ru   rs   )rK   r   r   r   N_fold_dimr   r   r   q_tokr   k_tokr   v_tokv_sizer   k_Nv_Nr   rL   rL   rM   r   +  sh   











  	
z$MultiScaleAttentionPoolFirst.forward
r_   r`   ra   r
   	LayerNormrq   r   rb   r   r   rL   rL   rw   rM   r      s    dr   c                       r   )MultiScaleAttentionr   Tr,   r0   r>   c              	      s  t    || _|| _|| | _| jd | _|| _tdd |D }tdd |D }tj	||d |d| _
t	||| _t|dkrKt|	dkrKd }t|dkrYt|
dkrYd }|| _|dk| _d	\| _| _| _d	\| _| _| _|d
v r|dkr|tjntj}|r|||	|| _|r|||
|| _|||
|| _nT|dks|dkr|dkr|| n|}|rtj||||	||dd| _||| _|rtj||||
||dd| _||| _tj||||
||dd| _||| _ntd| || _| jdkrP|d |d ksJ |d }t|	dkr||	d  n|}t|
dkr!||
d  n|}dt|| d }tt|| j| _tt|| j| _ t!| jdd t!| j dd || _"d S )Nr   c                 S   r   r   r   r   rL   rL   rM   rT     r   z0MultiScaleAttention.__init__.<locals>.<listcomp>c                 S   r   r   r   r   rL   rL   rM   rT     r   r   r   r   r   r   r   r<   r,   Fr   r   r>   r   r   r   r   )#rp   rq   r#   ru   r   r   r   rW   r
   r   qkvrs   rj   r-   r   r   r   r   r   r   r   r   r   rr   r   r?   rU   r<   r   r   r   r   r   r   r+   r   rw   rL   rM   rq   q  s   



	
	
	
zMultiScaleAttention.__init__r   c                 C   s  |j \}}}| |||d| jdddddd}|jdd\}}}	| jd ur@t||| j\}}
| |}t	|| j|
\}}n|}| j
d urL| 
|}| jd urit||| j\}}| |}t	|| j|\}}n|}| jd uru| |}| jd urt|	|| j\}	}| |	}	t	|	| j|\}	}| jd ur| |	}	|| j |dd }| jd	krt||| j||| j| j}|jdd}||	 }| jr|| }|dd|d| j}| |}||fS )
Nr   r   r   r   r   r5   r   rz   r>   )r}   r   r   r#   r   unbindr   r   r   r   r   r   r   r   r   r   r|   r?   r   r   r   r   r+   ru   rs   )rK   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rL   rL   rM   r     sR   (












	
zMultiScaleAttention.forwardr   rL   rL   rw   rM   r   p  s    br   c                       s`   e Zd Zdddejddddddddddf fdd		Zd
ee fddZd
ee fddZ	  Z
S )MultiScaleBlockr$   T        r0   r,   Fr>   c                    sL  t    ||k}|| _|| _|| _||| _|r!|r!t||nd | _|rEt	|dkrEdd |D }|}dd |D }t
|||| _nd | _|rL|n|}|rRtnt}|||||||	|
|||||||d| _|dkrot|nt | _||| _|}|r|st||nd | _t|t|| |d| _|dkrt|| _d S t | _d S )Nr   c                 S   s    g | ]}|d kr|d  n|qS rP   rL   )rH   srL   rL   rM   rT   )  s     z,MultiScaleBlock.__init__.<locals>.<listcomp>c                 S   r   r   r   )rH   skiprL   rL   rM   rT   +  r   )r#   r   r(   r   r   r2   r3   rC   r   r-   r?   r+   r   )in_featureshidden_featuresout_features)rp   rq   r   ru   r   norm1r
   r   shortcut_proj_attnrj   r   shortcut_pool_attnr   r   r   r   Identity
drop_path1norm2shortcut_proj_mlpr   rb   mlp
drop_path2)rK   r   ru   r#   r   r%   r(   	drop_pathrC   r   r   r2   r3   r-   r   r'   r&   r?   r+   proj_neededkernel_skipstride_skippadding_skipatt_dim
attn_layermlp_dim_outrw   rL   rM   rq   
  sR   



$zMultiScaleBlock.__init__r   c           	      C   s   | j d u r|S | jr&|d d d dd d f |d d dd d d f }}nd }|j\}}}|\}}|||||dddd }|  |}|||ddd}|d ur_tj||fdd}|S )Nr   r   r   r   r   r   )	r   r   r}   r   r   r   r|   r   r   )	rK   r   r   r   r   LCr   r   rL   rL   rM   _shortcut_poolN  s   
8 
zMultiScaleBlock._shortcut_poolc                 C   s   |  |}| jd u r|n| |}| ||}| ||\}}|| | }| |}| jd u r2|n| |}|| | | }||fS N)	r   r   r  r   r   r   r   r  r  )rK   r   r   x_norm
x_shortcutfeat_size_newrL   rL   rM   r   ^  s   

zMultiScaleBlock.forward)r_   r`   ra   r
   r   rq   r   rb   r  r   r   rL   rL   rw   rM   r   	  s$    Dr   c                       sN   e Zd Zddddddddddddejdf fdd		Zd
ee fddZ  Z	S )MultiScaleVitStager$   Tr,   r0   Fr>   r   c              	      s&  t    d| _t | _|r|f| }n
|f|d  |f }t|D ]j}tdi d|d|| d|d|d|d|d	|	d
|
d|dkrI|ndd|d|d|d|d|d|d|d|dt|t	t
fro|| n|}|| }| j| |dkrt
dd t||D }q#|| _d S )NFr   r   ru   r#   r   r%   r(   r   r   r2   r   r0   r3   r-   r   r&   r?   r+   r'   rC   r  c                 S   s   g | ]\}}|| qS rL   rL   )rH   r   rn   rL   rL   rM   rT     r   z/MultiScaleVitStage.__init__.<locals>.<listcomp>rL   )rp   rq   grad_checkpointingr
   
ModuleListblocksrY   r   rV   rX   rW   r[   zipr   )rK   r   ru   depthr#   r   r%   r(   r-   r   r   r2   r3   r   r'   r&   r?   r+   rC   r  out_dimsrI   attention_blockrw   rL   rM   rq   n  sh   

	

zMultiScaleVitStage.__init__r   c                 C   sD   | j D ]}| jrtj st|||\}}q|||\}}q||fS r  )r  r  r   jitis_scriptingr   )rK   r   r   blkrL   rL   rM   r     s
   
zMultiScaleVitStage.forwardr   rL   rL   rw   rM   r  l  s"    	;r  c                       sp  e Zd ZdZ						d4dedeeef d	ed
ee dede	de	f fddZ
dd Zejjdd Zejjd5ddZejjd6ddZejjdejfddZd7ded
ee fddZ					d8d ejd!eeeee f  d"ed#ed$ed%edeeej eejeej f f fd&d'Z	(		d9d!eeee f d)ed*efd+d,Zd-d. Zd5d/efd0d1Zd2d3 Z  ZS ):r   a  
    Improved Multiscale Vision Transformers for Classification and Detection
    Yanghao Li*, Chao-Yuan Wu*, Haoqi Fan, Karttikeya Mangalam, Bo Xiong, Jitendra Malik,
        Christoph Feichtenhofer*
    https://arxiv.org/abs/2112.01526

    Multiscale Vision Transformers
    Haoqi Fan*, Bo Xiong*, Karttikeya Mangalam*, Yanghao Li*, Zhicheng Yan, Jitendra Malik,
        Christoph Feichtenhofer*
    https://arxiv.org/abs/2104.11227
       r  r   N  r   cfgimg_sizein_chansglobal_poolnum_classesdrop_path_rate	drop_ratec              	      s&  t    t|}tt|j|jd}|| _|| _|d u r$|j	r"dnd}|| _
t|j| _|j| _|jd }	t||	|j|j|jd| _|d |jd  |d |jd  f}
t|
}|j	rottdd|	| _d| _|d }nd| _d | _|}|jrttd||	| _nd | _t|j}|
}t|j}dd td|t |j!|jD }t" | _#g | _$t%|D ]}|jr|j| }n|jt&|d |d  }t'd"i d	|	d
|d|j| d|j(| d|d|j)d|j*d|j+d|j,d|jd|j-d|j-d|j.| d|j/| d|j	d|j0d|j1d|d|| }|t|j.| 9 }|  j$t2d| ||dg7  _$|}	|j3}| j#4| q|	 | _5| _6||	| _7t8t9dt:| jfd|dkrht;| j5|nt< fg| _=| jd ur~t>| jd d! | jd urt>| jd d! | ?| j@ d S )#N)epstokenr   r   )rt   ru   rv   rn   ro   r   c                 S   s   g | ]}|  qS rL   )tolist)rH   r   rL   rL   rM   rT     s    z*MultiScaleVit.__init__.<locals>.<listcomp>r   ru   r  r#   r   r%   r(   r-   r&   r'   r   r   r2   r3   r   r?   r+   rC   r  zblock.)modulenum_chs	reductiondropfcr   r   rL   )Arp   rq   r   r   r   rC   rD   r$  r&  r)   r#  rW   r    r'   r"   rk   r9   r:   r;   patch_embedrj   r
   r   r   r   	cls_tokennum_prefix_tokensr*   	pos_embedrU   r<   linspacesumsplitr  stagesfeature_inforY   rZ   r  r#   r%   r(   r-   r&   r/   r2   r3   r?   r+   dictr   r[   num_featureshead_hidden_sizenorm
Sequentialr   Dropoutr   r   headr   apply_init_weights)rK   r   r!  r"  r#  r$  r%  r&  rC   r"   
patch_dimsnum_patchespos_embed_dimr\   r   curr_stridedprrI   ru   stagerw   rL   rM   rq     s   


$


&


	


 
"
zMultiScaleVit.__init__c                 C   sP   t |tjr"t|jdd t |tjr$|jd ur&tj|jd d S d S d S d S )Nr   r   r   )rV   r
   r   r   weightr   init	constant_)rK   mrL   rL   rM   r@     s   zMultiScaleVit._init_weightsc                 C   s   dd |   D S )Nc                    s*   h | ]\ }t  fd ddD r qS )c                 3   s    | ]}| v V  qd S r  rL   )rH   nr   rL   rM   rN   )  s    z:MultiScaleVit.no_weight_decay.<locals>.<setcomp>.<genexpr>)r2  r   r   r0  )any)rH   r   rL   rL  rM   	<setcomp>(  s    
z0MultiScaleVit.no_weight_decay.<locals>.<setcomp>)named_parametersrJ   rL   rL   rM   no_weight_decay&     zMultiScaleVit.no_weight_decayFc                 C   s   t dddgd}|S )Nz^patch_embed)z^stages\.(\d+)N)z^norm)i )stemr  )r8  )rK   coarsematcherrL   rL   rM   group_matcher+  s
   zMultiScaleVit.group_matcherTc                 C   s   | j D ]}||_qd S r  )r6  r  )rK   enabler   rL   rL   rM   set_grad_checkpointing3  s   
z$MultiScaleVit.set_grad_checkpointingry   c                 C   s   | j jS r  )r>  r.  rJ   rL   rL   rM   get_classifier8  s   zMultiScaleVit.get_classifierc              	   C   sV   || _ |d ur
|| _ttdt| jfd|dkr t| j|nt	 fg| _
d S )Nr-  r.  r   )r$  r#  r
   r<  r   r=  r&  r   r9  r   r>  )rK   r$  r#  rL   rL   rM   reset_classifier<  s    zMultiScaleVit.reset_classifierNCHWr   indicesr;  
stop_early
output_fmtintermediates_onlyc                 C   sN  |dv sJ d|dk}g }t t| j|\}	}
| |\}}|jd }| jdur:| j|dd}tj||fdd}| j	durD|| j	 }t| jd }t
| jD ]E\}}|||\}}||	v r|rk||krk| |}n|}|r| jdur~|ddddf }|||d |d ddd	dd
}|| qP|r|S ||kr| |}||fS )a   Forward features that returns intermediates.

        Args:
            x: Input image tensor
            indices: Take last n blocks if int, all if None, select matching indices if sequence
            norm: Apply norm layer to all intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
        Returns:

        )rZ  NLCz!Output shape must be NCHW or NLC.rZ  r   Nr   r   r   r   r   )r   rU   r6  r/  r}   r0  expandr   r   r2  	enumerater;  r   r   r[   )rK   r   r[  r;  r\  r]  r^  r   intermediatestake_indices	max_indexr   r   
cls_tokenslast_idxfeat_idxrF  x_interrL   rL   rM   forward_intermediatesE  s:   




$

z#MultiScaleVit.forward_intermediatesr   
prune_norm
prune_headc                 C   s6   t t| j|\}}|rt | _|r| dd |S )z@ Prune layers not required for specified intermediates.
        r    )r   rU   r6  r
   r   r;  rY  )rK   r[  rj  rk  rc  rd  rL   rL   rM   prune_intermediate_layers  s   
z'MultiScaleVit.prune_intermediate_layersc                 C   s   |  |\}}|j\}}}| jd ur#| j|dd}tj||fdd}| jd ur-|| j }| jD ]	}|||\}}q0| |}|S )Nr   r   r   )	r/  r}   r0  r`  r   r   r2  r6  r;  )rK   r   r   r   r   r  re  rF  rL   rL   rM   forward_features  s   




zMultiScaleVit.forward_features
pre_logitsc                 C   sP   | j r| j dkr|d d | jd f d}n|d d df }|r#|S | |S )Nr   r   r   )r#  r1  meanr>  )rK   r   ro  rL   rL   rM   forward_head  s
   
zMultiScaleVit.forward_headc                 C   s   |  |}| |}|S r  )rn  rq  r~   rL   rL   rM   r     s   

zMultiScaleVit.forward)r  r   Nr  r   r   FTr  )NFFrZ  F)r   FT) r_   r`   ra   r   r   r   rb   r	   rf   rd   rq   r@  r   r  ignorerP  rU  rW  r
   ModulerX  rY  r   r   r   re   ri  rm  rn  rq  r   r   rL   rL   rw   rM   r     s    
a
 
=
c                    s  d| v rL|   D ]A}d|v rI| | }| | j}|jd |d krItjjj|d|jd dddd|d dd}|d|d dd| |< q| S dd l	}d	| v rX| d	 } t
|d
d }t
|dd}|d uslJ di d t|D ]\}	 fddt  |	 D   |	7  qti }
|  D ].\}}|dfdd|}|r|dd|}n|dd|}d|v r|dd}||
|< q|
S )Nzstages.0.blocks.0.norm1.weightrel_posr   r   r   r   linear)r   r-   model_stater    r'   Tz3model requires depth attribute to remap checkpointsc                    s   i | ]	}||  fqS rL   rL   rG   )	block_idx	stage_idxrL   rM   
<dictcomp>  s    z(checkpoint_filter_fn.<locals>.<dictcomp>zblocks\.(\d+)c                    s4   d t | d d  d t | d d  S )Nzstages.r   r   z.blocks.)rb   group)r   )	depth_maprL   rM   <lambda>  s   4 z&checkpoint_filter_fn.<locals>.<lambda>z stages\.(\d+).blocks\.(\d+).projz&stages.\1.blocks.\2.shortcut_proj_attnz%stages.\1.blocks.\2.shortcut_proj_mlpr>  zhead.projectionhead.fc)keys
state_dictr}   r   r
   
functionalinterpolater   r   regetattrra  updaterY   itemssubreplace)r  modelr   rv  dest_rel_pos_shaperel_pos_resizedr  r    r'   rQ   out_dictr   rL   )ry  r}  rz  rM   checkpoint_filter_fn  sN   $


r  )r   r      r   )r    )r   r      r   r   )r      $   r5      r   F)r    r"   r#   r'   )r    r)   )r    r"   r#   r)   r'   )r5   r   <   r      r   )mvitv2_tinymvitv2_smallmvitv2_basemvitv2_largemvitv2_small_clsmvitv2_base_clsmvitv2_large_clsmvitv2_huge_clsc                 K   s@   | dd}tt| |f|st|  nt| tt|ddd|S )Nout_indicesr5   getter)r  feature_cls)	model_cfgpretrained_filter_fnfeature_cfg)popr   r   
model_cfgsr  r8  )variantcfg_variant
pretrainedkwargsr  rL   rL   rM   _create_mvitv2	  s   
r  rl  c                 K   s    | ddd ddt tdddd|S )	Nr  )r   r  r  g?bicubiczpatch_embed.projr  T)urlr$  
input_size	pool_sizecrop_pctinterpolationrp  r   
first_conv
classifierfixed_input_sizer   )r  r  rL   rL   rM   _cfg  s   r  zDhttps://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_T_in1k.pythztimm/)r  	hf_hub_idzDhttps://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_S_in1k.pythzDhttps://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_B_in1k.pythzDhttps://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_L_in1k.pyth)r  zEhttps://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_B_in21k.pythiJ  )r  r  r$  zEhttps://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_L_in21k.pythzEhttps://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_H_in21k.pyth)zmvitv2_tiny.fb_in1kzmvitv2_small.fb_in1kzmvitv2_base.fb_in1kzmvitv2_large.fb_in1kr  zmvitv2_base_cls.fb_inw21kzmvitv2_large_cls.fb_inw21kzmvitv2_huge_cls.fb_inw21kc                 K      t dd| i|S )Nr  r  )r  r  r  r  rL   rL   rM   r  =  rQ  r  c                 K   r  )Nr  r  )r  r  r  rL   rL   rM   r  B  rQ  r  c                 K   r  )Nr  r  )r  r  r  rL   rL   rM   r  G  rQ  r  c                 K   r  )Nr  r  )r  r  r  rL   rL   rM   r  L  rQ  r  c                 K   r  )Nr  r  )r  r  r  rL   rL   rM   r  Q  rQ  r  c                 K   r  )Nr  r  )r  r  r  rL   rL   rM   r  V  rQ  r  c                 K   r  )Nr  r  )r  r  r  rL   rL   rM   r  [  rQ  r  c                 K   r  )Nr  r  )r  r  r  rL   rL   rM   r  `  rQ  r  rs  r  )NF)rl  rr  )Br   rg   collectionsr   dataclassesr   	functoolsr   r   typingr   r   r   r	   r   r
   	timm.datar   r   timm.layersr   r   r   r   r   _builderr   	_featuresr   _features_fxr   _manipulater   	_registryr   r   __all__r   rj   ru  rk   rb   re   r   r   r   r   r   r   r   r  r   r  r8  r  r  r  default_cfgsr  r  r  r  r  r  r  r  rL   rL   rL   rM   <module>   s.   02 + cF /
*
