o
    ߥiC                     @   s   d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlm	Z	m
Z
 ddlmZ ddlmZ dd	lmZ dd
lmZ eje
jejdG dd deZdS )    N)Models)
TorchModel)MODELS)	ModelFileTasks   )SwinTransformer)DeformableTransformer)FPNFusionModule)Detector)module_namec                       s.   e Zd ZdZdef fddZdd Z  ZS )	VidtModelaA  
        The implementation of 'ViDT for joint-learning of object detection and instance segmentation'.
        This model is dynamically initialized with the following parts:
            - 'backbone': pre-trained backbone model with parameters.
            - 'head': detection and segentation head with fine-tuning.
    	model_dirc           	         s   t t|   tj|tj}tj	|dd}t
ddgdg dg dddd	}|jd
dddgd || _| jj|d dd t|jdd}tddddddddddd
}t||dddd|ddddddd}|| _| jj|d dd dS )z Initialize a Vidt Model.
        Args:
          model_dir: model id or path, where model_dir/pytorch_model.pt contains:
                    - 'backbone_weights': parameters of backbone.
                    - 'head_weights': parameters of head.
        cpu)map_location   `   )   r      r   )   r            g?)pretrain_img_size	embed_dimdepths	num_headswindow_sizedrop_path_ratevidti,     r   )methoddet_token_numpos_dimcross_indicesbackbone_weightsT)strict)fuse_dim   r   i   g?relu   F)
d_modelnheadnum_decoder_layersdim_feedforwarddropout
activationreturn_intermediate_decnum_feature_levelsdec_n_pointstoken_labelr   N)num_classesnum_queriesaux_losswith_box_refineepffwith_vectorprocessor_dct	iou_awarer4   vector_hidden_dimdistilhead_weights)superr   __init__ospathjoinr   TORCH_MODEL_FILEtorchloadr   finetune_detbackboneload_state_dictr
   num_channelsr	   r   head)	selfr   kwargs
model_path
model_dictrI   r9   deform_transformersrL   	__class__ S/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/cv/vidt/model.pyrA      s`   
zVidtModel.__init__c              	   C   s:   |  ||\}}}}}}| |||||||\}	}
|	|
fS )z Dynamic forward function of VidtModel.
        Args:
            x: input images (B, 3, H, W)
            mask: input padding masks (B, H, W)
        )rI   rL   )rM   xmask
features_0
features_1
features_2
features_3det_tgtdet_posout_pred_logitsout_pred_boxesrT   rT   rU   forwardW   s   zVidtModel.forward)__name__
__module____qualname____doc__strrA   r`   __classcell__rT   rT   rR   rU   r      s    =r   )rB   rF   modelscope.metainfor   'modelscope.models.base.base_torch_modelr   modelscope.models.builderr   modelscope.utils.constantr   r   rI   r   deformable_transformerr	   
fpn_fusionr
   rL   r   register_moduleimage_object_detectionr   r   rT   rT   rT   rU   <module>   s   