o
    ߥi                     @   s   d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZmZ ddlmZmZmZmZmZ dd	lmZ e	jejejd
G dd deZdS )    N)Models)
TorchModel)MODELS)Config)	ModelFileTasks   )BboxRegressorQ2VRankerStage1Q2VRankerStage2V2QRankerStage1V2QRankerStage2)SwinTransformerV2_1D)module_namec                       sJ   e Zd ZdZdef fddZdd Zdd Z										dd
dZ  Z	S )SOONeta  
        The implementation of 'Scanning Only Once: An End-to-end Framework for Fast Temporal Grounding
        in Long Videos'. The model is dynamically initialized with the following parts:
            - q2v_stage1: calculate qv_ctx_score.
            - v2q_stage1: calculate vq_ctx_score.
            - q2v_stage2: calculate qv_ctn_score.
            - v2q_stage2: calculate vq_ctn_score.
            - regressor: predict the offset of bounding box for each candidate anchor.
    	model_dirc           
         s  t    tj|tj}t|j	| _
| j
j}| j
j}| j
j}| j
j| _| j
j| _|| _t|||dg| dg| dg| dddddtjddd	g| d
| _t||| _t||| _| jrjt|||| _t||| _t|| j| _tj|d}tj|ddd }	| j |	dd dS )zl
            Initialize SOONet Model

            Args:
                model_dir: model id or path
              @   g       @Tg        g?Fr   )
patch_sizein_chans	embed_dimdepths	num_headswindow_size	mlp_ratioqkv_bias	drop_rateattn_drop_ratedrop_path_rate
norm_layer
patch_normuse_checkpointpretrained_window_sizesz"SOONet_MAD_VIT-B-32_4Scale_10C.pthcpu)map_locationmodel)strictN)!super__init__ospathjoinr   CONFIGURATIONr   	from_filehyperparamsconfignscales
hidden_dimsnippet_lengthenable_stage2stage2_topkr   nn	LayerNormvideo_encoderr
   
q2v_stage1r   
v2q_stage1r   
q2v_stage2r   
v2q_stage2r	   	regressortorchloadload_state_dict)
selfr   argskwargsconfig_pathr1   r2   r3   
model_path
state_dict	__class__ ^/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/multi_modal/soonet/model.pyr)      sN   


zSOONet.__init__c                 K   s&   | j r| jdi |S | jdi |S )NrI   )trainingforward_trainforward_testrA   rC   rI   rI   rJ   forwardO   s   zSOONet.forwardc                 K   s   t )N)NotImplementedErrorrN   rI   rI   rJ   rL   U   s   zSOONet.forward_trainNc                 K   s  |}|  |ddd}| ||}	| jrt }
t }t }t }t| jD ]n}tj|	| ddd\}}tt	tt
|ddd| jf     \}}||j}|
| |t|| d| || }||d  }t||| d|}t||| d|}|| || q&tj|dd}tj|dd}| |||
|	\}}}|}nd}|	}|}|}| |||}ttj|dd}||||fS )a  
            Obtain matching scores and bbox bias of the top-k candidate anchors, with
            pre-extracted query features and video features as input.

            Args:
                query_feats: the pre-extracted text features.
                video_feats: the pre-extracted video features.
                start_ts: the start timestamps of pre-defined multi-scale anchors.
                end_ts: the end timestamps of pre-defined multi-scale anchors.
                scale_boundaries: the begin and end anchor index for each scale in start_ts and end_ts.

            Returns:
                [final_scores, bbox_bias, starts, ends]
        r   r   r   T)dim
descendingN)rQ   )r8   permuter9   r4   listranger1   r>   sort
LongTensorsetr5   flattenr$   numpytolisttodeviceappendindex_selectcatr;   r=   sigmoid)rA   query_featsvideo_featsstart_tsend_tsscale_boundariesrC   	sent_feat	ctx_featsqv_ctx_scoreshit_indicesstartsendsfiltered_ctx_featsi_indicesscale_first
scale_lastfiltered_startfiltered_endqv_merge_scoresqv_ctn_scores	ctn_feats	bbox_biasfinal_scoresrI   rI   rJ   rM   X   sb   




zSOONet.forward_test)NNNNN)
__name__
__module____qualname____doc__strr)   rO   rL   rM   __classcell__rI   rI   rG   rJ   r      s    
0r   )r*   r>   torch.nnr6   modelscope.metainfor   'modelscope.models.base.base_torch_modelr   modelscope.models.builderr   modelscope.utils.configr   modelscope.utils.constantr   r   blocksr	   r
   r   r   r   swin_transformerr   register_modulevideo_temporal_groundingsoonetr   rI   rI   rI   rJ   <module>   s   