o
    ߥi                     @   s   d dl Z d dlmZmZ d dlZd dlZd dlmZ d dlm  m	Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZ dd	lmZmZ ejejejd
G dd deZdS )    N)AnyDict)Models)
TorchModel)MODELS)
OutputKeys)	ModelFileTasks   )CLIPProbingModel)module_namec                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )StructuredProbingModelz
    The implementation of 'Structured Model Probing: Empowering
        Efficient Adaptation by Structured Regularization'.
    c                    sz   t t|   tj|d}t|}|d d | _|d d | _	t
dd|d d| _t| j| j	| _| j|d	  d
S )zc
        Initialize a probing model.
        Args:
            model_dir: model id or path
        zfood101-clip-vitl14-full.pt	meta_infofeature_sizenum_classesCLIP_ViTL14_FP16Tbackbone_model_state_dict)use_pretrain
state_dictprobing_model_state_dictN)superr   __init__ospathjointorchloadr   r   r   backboner   probing_modelload_state_dict)self	model_dirargskwargs
model_file	__class__ b/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/cv/image_probing_model/model.pyr      s   
zStructuredProbingModel.__init__c           	      C   s   g }t ddD ]}|d| |d| |d| q|d | | }g }|D ]}| || d}|| q2tj|dd	}| |	 }|S )
ze
        Forward Function of SMP.
        Args:
            x: the input images (B, 3, H, W)
        r      zlayer_{}_pre_attnzlayer_{}_attnzlayer_{}_mlp
pre_logitsi   r
   dim)
rangeappendformatr   halfaggregate_tokenr   catr   float)	r!   xkeysidxfeaturesfeatures_aggiaggregated_featureoutputsr(   r(   r)   forward.   s   
zStructuredProbingModel.forwardc                 C   s   t |jdkrA|j\}}}||krd}n
|| }t|| }|dkr:t|d}tjj||d|}tj|dd}ntj|dd}tjj	j
|dd}|S )z
        Aggregating features from tokens.
        Args:
            output: the output of intermidiant features
                from a ViT model
            target_size: target aggregated feature size
           r   )r      r
   )kernel_sizestrider
   )	start_dimr,   )lenshapeintr   permutenn	AvgPool1dflattenmean
functional	normalize)r!   outputtarget_size_n_tokenchannels	pool_sizen_groupsr(   r(   r)   r2   D   s"   z&StructuredProbingModel.aggregate_token)__name__
__module____qualname____doc__r   r=   r2   __classcell__r(   r(   r&   r)   r      s
    r   )r   typingr   r   jsonr   torch.nnrG   torch.nn.functionalrK   Fmodelscope.metainfor   'modelscope.models.base.base_torch_modelr   modelscope.models.builderr   modelscope.outputsr   modelscope.utils.constantr   r	   r   r   r   register_moduleimage_classificationimage_probing_modelr   r(   r(   r(   r)   <module>   s    