o
    پi|                     @   s  d dl mZ d dlmZmZmZmZmZmZ d dl	Z	d dl
mZ d dlm  mZ d dlmZmZ d dlmZmZmZmZmZmZmZmZmZ ddlmZ ddlmZm Z  dd	l!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' dd
l(m)Z) ddl*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0m1Z1 ddgZ2e+G dd dej3Z4G dd dej3Z5G dd dej3Z6d0de7de8de6fddZ9d0de7de8de6fddZ:				d1de7de;de8de8de6f
dd Z<d2d"e7fd#d$Z=e0e=d%d d&e=d%d d&e=d'd(d)Z>e1d0de8de6fd*d+Z?e1d0de8de5fd,d-Z@e1d0de8de5fd.d/ZAdS )3    )partial)CallableListOptionalSequenceTupleUnionNIMAGENET_INCEPTION_MEANIMAGENET_INCEPTION_STD)	SelectAdaptivePool2dLinear	LayerTypePadType	RmsNorm2dConvNormActcreate_conv2dget_norm_act_layer	to_2tuple   )build_model_with_cfg)SqueezeExciteUniversalInvertedResidual)	BlockArgsEfficientNetBuilderdecode_arch_defefficientnet_init_weightsround_channelsresolve_act_layer)feature_take_indices)register_notrace_module)checkpoint_seq
checkpoint)generate_default_cfgsregister_modelMobileNetV5MobileNetV5Encoderc                       s   e Zd ZdZ						ddeeee f deded	ed
ede	e de
de	e de	e f fddZdeej dejfddZ  ZS )"MobileNetV5MultiScaleFusionAdaptera  Multi-layer fusion token adapter.

  Args:
    in_chs: List of input channel counts for each feature scale.
    out_chs: The number of output channels.
    output_resolution: The output resolution.
    expansion_ratio: The FFN expansion ratio.
    interpolation_mode: The upsampling interpolation mode.
    layer_scale_init_value: The initial value of the layer scale, no layer scale if None.
         @nearestNTin_chsout_chsoutput_resolutionexpansion_ratiointerpolation_modelayer_scale_init_valuenoskip	act_layer
norm_layerc
           
   
      s   t    t|trt|n|| _|| _t|| _|| _	|| _
|| _|| _|p)tj}|	p-t}	t| j| jd| j	||	| j| jd| _|	| j| _d S )Nr   )r*   r+   dw_kernel_size_mid	exp_ratior1   r2   r0   r/   )super__init__
isinstancer   sumin_channelsout_channelsr   r,   r-   r.   r/   r0   nnGELUr   r   ffnnorm)
selfr*   r+   r,   r-   r.   r/   r0   r1   r2   	__class__ K/home/ubuntu/.local/lib/python3.10/site-packages/timm/models/mobilenetv5.pyr6   &   s*   


z+MobileNetV5MultiScaleFusionAdapter.__init__inputsreturnc           
      C   s8  |d j dd  }g }t|D ])\}}|j dd  }|d |d k s*|d |d k r3tj||| jd}|| qtj|dd}| |}|d | j	d ksW|d | j	d kr|d | j	d  dksm|d | j	d  dkrwtj|| j	dd}n|d | j	d  }|d | j	d  }	tj
|||	f||	fd}| |}|S )Nr   r   )sizemode)dimbilinear)kernel_sizestride)shape	enumerateFinterpolater.   appendtorchcatr=   r,   
avg_pool2dr>   )
r?   rD   high_resolutionresized_inputs_img	feat_sizechannel_cat_imgs	h_strides	w_stridesrB   rB   rC   forwardJ   s,    
$
z*MobileNetV5MultiScaleFusionAdapter.forward)r(   r)   NTNN)__name__
__module____qualname____doc__r   intr   floatstrr   boolr   r6   rR   Tensorr]   __classcell__rB   rB   r@   rC   r'      s8    	
"$r'   c                )       s  e Zd ZdZdddddddd	dd
d
d
d
deddd
dfdedededededededede	e dede
e de
e de
e de
e dedededede
e d ef( fd!d"Zd#d$ ZejjdFd%efd&d'ZejjdGd(efd)d*Zejjd+ejfd,d-ZdHded efd.d/Z	
			0		dId1ejd2e
eee	e f  d3ed4ed5ed6ed7ed+ee	ej eeje	ej f f fd8d9Z	:			dJd2eee	e f d;ed<ed7efd=d>Zd1ejd+ejfd?d@ZdFd1ejdAed+ejfdBdCZd1ejd+ejfdDdEZ   Z!S )Kr%   z MobiletNet-V5
            F    T)rF   N        avg
block_argsnum_classesin_chans	stem_sizefix_stemnum_featurespad_typeuse_msfamsfa_indicesmsfa_output_resolutionr1   r2   aa_layerse_layerse_from_expround_chs_fn	drop_ratedrop_path_rater/   global_poolc                    s  t    |p	tj}|pt}t||}|pt}| _| _d _	|	 _
|
 _|s,||}t||dd|||d _td|||||||||d
}tj|||  _|j _dd  jD  _|j _|r|  _ _tt j j
d	  _
t fd
d j
D  _t j| j||d _t|d _d _d _ n(|j _| _d _t|d _ j j!  }t"| jd|d _| j _ |rt#dnt$  _%|d	krt& j|nt$  _'t(  dS )a  
        Args:
            block_args: Arguments for blocks of the network.
            num_classes: Number of classes for classification head.
            in_chans: Number of input image channels.
            stem_size: Number of output channels of the initial stem convolution.
            fix_stem: If True, don't scale stem by round_chs_fn.
            num_features: Number of output channels of the conv head layer.
            head_bias: If True, add a learnable bias to the conv head layer.
            pad_type: Type of padding to use for convolution layers.
            act_layer: Type of activation layer.
            norm_layer: Type of normalization layer.
            aa_layer: Type of anti-aliasing layer.
            se_layer: Type of Squeeze-and-Excite layer.
            se_from_exp: If True, calculate SE channel reduction from expanded mid channels.
            round_chs_fn: Callable to round number of filters based on depth multiplier.
            drop_rate: Dropout rate.
            drop_path_rate: Stochastic depth rate.
            layer_scale_init_value: Enable layer scale on compatible blocks if not None.
            global_pool: Type of pooling to use for global pooling features of the FC head.
        Fri      rK   rL   paddingr2   r1       
output_striderw   r~   r}   r1   r2   r{   r|   r   r/   c                 S      g | ]}|d  qS stagerB   .0frB   rB   rC   
<listcomp>       z(MobileNetV5.__init__.<locals>.<listcomp>r   c                       g | ]	} j | d  qS num_chsfeature_infor   mir?   rB   rC   r          r*   r+   r,   r2   r1   	pool_typeNr   )r   ))r5   r6   r;   r<   r   r   r   rr   r   grad_checkpointingry   rz   r   	conv_stemr   
Sequentialblocksfeaturesr   
stage_endsr*   rv   head_hidden_sizer   lenr8   msfa_in_chsr'   msfar   r   	conv_head	norm_head	feat_multr   FlattenIdentityflattenr   
classifierr   )r?   rq   rr   rs   rt   ru   rv   rw   rx   ry   rz   r1   r2   r{   r|   r}   r~   r   r   r/   r   norm_act_layerbuildernum_pooled_chsr@   r   rC   r6   p   sz   
,

zMobileNetV5.__init__c                 C   sx   | j | jg}|| j || j | jd ur|| j | jd ur(|| j |t	 t
| j| jg tj| S N)r   bn1extendr   rQ   r   r   r   r;   r   Dropoutr   r   r   )r?   layersrB   rB   rC   as_sequential   s   


zMobileNetV5.as_sequentialcoarsec                 C   s   t d|rddS ddS )Nz^conv_stem|bn1z^blocks\.(\d+)z^blocks\.(\d+)\.(\d+))stemr   )dict)r?   r   rB   rB   rC   group_matcher   s   zMobileNetV5.group_matcherenablec                 C   s
   || _ d S r   )r   )r?   r   rB   rB   rC   set_grad_checkpointing   s   
z"MobileNetV5.set_grad_checkpointingrE   c                 C   s   | j S r   )r   r   rB   rB   rC   get_classifier   s   zMobileNetV5.get_classifierc                 C   sR   || _ t|d| _|rtdnt | _|dkr"t| j|| _	d S t | _	d S )Nr   r   r   )
rr   r   r   r;   r   r   r   r   r   r   )r?   rr   r   rB   rB   rC   reset_classifier   s   (zMobileNetV5.reset_classifierNCHWxindicesr>   
stop_early
output_fmtintermediates_onlyextra_blocksc                    s   |dv sJ d|r|sJ dg }|r!t t jd |\}	}
nt t j|\}	}
 fdd|	D }	 j|
 }
d} |}||	v rI|| tj sP|sT j}n jd|
 }|D ]}|d7 }||}||	v rp|| q]|ru|S ||fS )	aa   Forward features that returns intermediates.

        Args:
            x: Input image tensor
            indices: Take last n blocks if int, all if None, select matching indices if sequence
            norm: Apply norm layer to compatible intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
            extra_blocks: Include outputs of all blocks and head conv in output, does not align with feature_info
        Returns:

        r   Output shape must be NCHW./Must use intermediates_only for early stopping.r   c                       g | ]} j | qS rB   r   r   ir   rB   rC   r   &      z5MobileNetV5.forward_intermediates.<locals>.<listcomp>r   N)	r   r   r   r   r   rQ   rR   jitis_scripting)r?   r   r   r>   r   r   r   r   intermediatestake_indices	max_indexfeat_idxr   blkrB   r   rC   forward_intermediates  s2   



z!MobileNetV5.forward_intermediatesr   
prune_norm
prune_headc                 C   s   |rt t| jd |\}}nt t| j|\}}| j| }| jd| | _|t| jk r3d| _d| _|rAd| _d| _| dd |S )z@ Prune layers not required for specified intermediates.
        r   Nr   rl   )r   r   r   r   r   r   r   )r?   r   r   r   r   r   r   rB   rB   rC   prune_intermediate_layersE  s   	
z%MobileNetV5.prune_intermediate_layersc                 C   s   | j d ur7d}g }| |}|| jv r|| | jD ]}|d7 }||}|| jv r/|| q|  |}|S | |}| jrNtj sNt	| j|dd}|S | |}|S )Nr   r   T)r   )
r   r   ry   rQ   r   r   rR   r   r   r!   r?   r   r   r   r   rB   rB   rC   forward_features]  s(   









zMobileNetV5.forward_features
pre_logitsc                 C   sl   |  |}| jd ur| |}| jd ur| |}| |}| jdkr-tj|| j| jd}|r1|S | |S )Nro   )ptraining)	r   r   r   r   r   rO   dropoutr   r   )r?   r   r   rB   rB   rC   forward_headt  s   







zMobileNetV5.forward_headc                 C   s   |  |}| |}|S r   )r   r   r?   r   rB   rB   rC   r]     s   

zMobileNetV5.forwardF)T)rp   NFFr   FF)r   FTF)"r^   r_   r`   ra   r   r   rb   re   rd   r   r   r   r   rc   r6   r   rR   r   ignorer   r   r;   Moduler   r   rf   r   r   r   r   r   r   r]   rg   rB   rB   r@   rC   r%   l   s    	
v
 	
@
c                "       sD  e Zd ZdZddddddddddd	ed
d
dfdededededede	e dede
e de
e de
e de
e dedededede
e f  fddZ						d.dejde
eeee f  d ed!ed"ed#ed$ed%eeej eejeej f f fd&d'Zdejd%ejfd(d)Zdejd%ejfd*d+Zdejd%ejfd,d-Z  ZS )/r&   zMobileNetV5 Vision Encoderri   @   Frl   )rF   rn   rj   NTro   rq   rs   rt   ru   rw   ry   rz   r1   r2   r{   r|   r}   r~   r   r   r/   c                    s
  t    |p	tj}|	pt}	|pt}d _| _d _|s!||}t	||dd||	|d _
td|||||	|
|||d
}tj|||  _|j _dd	  jD  _d
  _ _tt j|d  _t fdd	 jD  _| _t j j j|	|d _t  d S )Nr   Fri   r   r   r   r   c                 S   r   r   rB   r   rB   rB   rC   r     r   z/MobileNetV5Encoder.__init__.<locals>.<listcomp>rk   c                    r   r   r   r   r   rB   rC   r     r   r   )r5   r6   r;   r<   r   r   rr   r   r   r   r   r   r   r   r   r   r   rv   r   r   r   ry   r8   r   rz   r'   r   r   )r?   rq   rs   rt   ru   rw   ry   rz   r1   r2   r{   r|   r}   r~   r   r   r/   r   r@   r   rC   r6     sZ   


zMobileNetV5Encoder.__init__r   r   r   r>   r   r   r   r   rE   c                    s&  ~|dv s	J d|r|sJ dg }g }	|r$t t jd |\}
}nt t j|\}
} fdd|
D }
 j| }d} |}||
v rL|| | jv rV|	| tj	 s]|sa j}n jd| }|D ]}|d7 }||}||
v r}|| | jv r|	| qj|r|S  
|	|fS )	al   Forward features that returns intermediates.

        Args:
            x: Input image tensor
            indices: Take last n blocks if int, all if None, select matching indices if sequence
            norm: (Unused) Applies norm layer to compatible intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
            extra_blocks: Include outputs of all blocks and head conv in output, does not align with feature_info
        Returns:

        r   r   r   r   c                    r   rB   r   r   r   rB   rC   r     r   z<MobileNetV5Encoder.forward_intermediates.<locals>.<listcomp>r   N)r   r   r   r   r   rQ   ry   rR   r   r   r   )r?   r   r   r>   r   r   r   r   r   msfa_intermediatesr   r   r   r   r   rB   r   rC   r     s>   







z(MobileNetV5Encoder.forward_intermediatesc                 C   s`   d}g }|  |}|| jv r|| | jD ]}|d7 }||}|| jv r*|| q| |S )Nr   r   )r   ry   rQ   r   r   r   rB   rB   rC   r     s   






z#MobileNetV5Encoder.forward_featuresc                 C   s   t d)Nz=MobileNetV5Encoder does not support classification use cases.)NotImplementedErrorr   rB   rB   rC   r   (  s   zMobileNetV5Encoder.forward_headc                 C   s
   |  |S r   )r   r   rB   rB   rC   r]   +  s   
zMobileNetV5Encoder.forwardr   )r^   r_   r`   ra   r   r   rb   re   rd   r   r   r   r   rc   r6   rR   rf   r   r   r   r   r   r   r]   rg   rB   rB   r@   rC   r&     s    	
K 	
EFvariant
pretrainedrE   c                 K   s<   | dd}t|dd}d}tt| |fd||d|}|S )Nout_indicesr   r   r   ri      getterr   feature_cls)rr   rv   	head_conv	head_bias	head_normr   F)pretrained_strictfeature_cfgkwargs_filter)popr   r   r&   )r   r   kwargsr   r   r   modelrB   rB   rC   _create_mnv5_encoder/  s   	r   c                 K   s6   | dd}t|dd}tt| |fd|d|}|S )Nr   r   r   r   F)r   r   )r   r   r   r%   )r   r   r   r   r   r   rB   rB   rC   _create_mnv5F  s   r         ?channel_multiplierencoderc           	   	   K   s   d| v rg dg dg dg dg}ng dg dg dg dg}t t||dd	|d
k tt|dttjdd}t |fi |}|rLt| |fi |}|S t| |fi |}|S )Nmobilenetv5_base)er_r1_k3_s2_e4_c128er_r1_k3_s1_e4_c128r   )uir_r1_a3_k5_s2_e6_c256uir_r1_a5_k0_s1_e4_c256uir_r1_a3_k0_s1_e4_c256r  r  )uir_r1_a5_k5_s2_e6_c512uir_r1_a5_k0_s1_e4_c512r  uir_r1_a0_k0_s1_e1_c512mqa_r1_k3_h8_s2_d64_c512uir_r1_a0_k0_s1_e2_c512r  r  r  r  r  r  r  r  r  r  )uir_r1_a5_k5_s2_e6_c1024mqa_r1_k3_h16_s1_d64_c1024uir_r1_a0_k0_s1_e2_c1024r	  r
  r	  r
  r	  r
  r	  r
  r	  r
  r	  r
  )%uir_r1_a5_k5_s2_e6_c640uir_r1_a5_k0_s1_e4_c640r  r  r  r  r  r  uir_r1_a0_k0_s1_e1_c640mqa_r1_k3_h12_v2_s1_d64_c640uir_r1_a0_k0_s1_e2_c640r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  )'uir_r1_a5_k5_s2_e6_c1280mqa_r1_k3_h16_s1_d96_c1280uir_r1_a0_k0_s1_e2_c1280r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  )
group_sizer   r   )
multipliergh㈵>)rq   rt   ru   r~   r2   r1   r/   )	r   r   r   r   r   r;   r<   r   r   )	r   r   r  r   r   r   arch_defmodel_kwargsr   rB   rB   rC   _gen_mobilenet_v5T  s4   8(c

	r  rl   urlc                 K   s   | dddddt tddd
|S )	Nrh   )ri      r  )rj   rj   r   bicubiczconv_stem.convr   )
r  rr   
input_size	pool_sizecrop_pctinterpolationmeanstd
first_convr   r	   )r  r   rB   rB   rC   _cfg  s   r"  )ri      r#  )r  rr   rh   )rr   )mobilenetv5_300m_encmobilenetv5_300mzmobilenetv5_base.untrainedc                 K   s(   | dd}t	d| d|d|}|S )zMobileNet V5 Vision Encoderrw   samer$  T)r   r   rw   N)r$  )r   r  )r   r   rw   r   rB   rB   rC   r$  #  s   r$  c                 K      t dd| i|}|S )Nr%  r   )r%  r  r   r   r   rB   rB   rC   r%  1     r%  c                 K   r'  )Nr   r   )r   r(  r)  rB   rB   rC   r   7  r*  r   r   )r   NFF)rl   )B	functoolsr   typingr   r   r   r   r   r   rR   torch.nnr;   torch.nn.functional
functionalrO   	timm.datar
   r   timm.layersr   r   r   r   r   r   r   r   r   _builderr   _efficientnet_blocksr   r   _efficientnet_builderr   r   r   r   r   r   	_featuresr   _features_fxr    _manipulater!   r"   	_registryr#   r$   __all__r   r'   r%   r&   rd   re   r   r   rc   r  r"  default_cfgsr$  r%  r   rB   rB   rB   rC   <module>   sx     , R   )
 4
