o
    ߥi                     @   s   d dl Z d dlmZ d dlZd dlmZ d dlm  mZ	 d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZmZ ddlmZ dd	lmZmZ ejejejd
G dd deZG dd deZdS )    N)Models)
TorchModel)MODELS)Config)	ModelFileTasks   )	load_clip)get_state_dictset_seed)module_namec                       sB   e Zd ZdZdef fddZdddZdd	 Zdd
dZ  Z	S )VideoTextRetrievalModelSeriesa  
        The implementation of 'VoP: Text-Video Co-operative Prompt Tuning for Cross-Modal Retrieval'.
        This model is dynamically initialized with the following parts:
            - clip: the upstream pre-trained backbone model (CLIP in this code).
                - The pretrain param (ViT-B/32) downloads from OpenAI:
                - "https://openaipublic.azureedge.net/clip/models/
                - 40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt"
            - pool_frames: the frames pooling method
            - visual_prompt_learner: visual prompt
            - ImageEncoder: get image encoder
            - TextPromptLearner: text prompt
            - TextEncoder: get text encoder
    	model_dirc                    sv   t t|   t|d}t|d}t|tj}t|j	| _
t|d| _t| j
j| _| t| |   dS )zl
            Initialize a VoP Model

            Args:
                model_dir: model id or path,
        zVoPSE_msrvtt9k.pthzViT-B-32.pt)nameN)superr   __init__ospjoinr   CONFIGURATIONr   	from_file
hyperparamconfigr	   clipBaselinePoolingpooling_typepool_framesload_state_dictr
   eval)selfr   argskwargs
model_path	clip_archconfig_path	__class__ _/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/cv/vop_retrieval/model_se.pyr   $   s   z&VideoTextRetrievalModelSeries.__init__Fc                 C   sl   |j d }|dd| jj| jj}| j|}||jddd }||| jjd}| |}|r4||fS |S )z
            Get video Features

            Args:
                videos: the dim is [1, 12, 3, 224, 224]
                return_all_frames: default False
        r      Tdimkeepdim)	shapereshaper   	input_resr   encode_imagenorm
num_framesr   )r   videosreturn_all_frames
batch_size
video_datavideo_featuresvideo_features_pooledr&   r&   r'   get_video_features;   s   

z0VideoTextRetrievalModelSeries.get_video_featuresc                 C   s"   | j |}||jddd }|S )zh
            Get Text Features

            Args:
                text_data: the dim is [1, 69]
        r(   Tr*   )r   encode_textr1   )r   	text_datatext_featuresr&   r&   r'   get_text_featuresU   s
   z/VideoTextRetrievalModelSeries.get_text_featuresc           	      C   s   |d j d }|d }|d }|dd| jj| jj}| j|}| j|}||jddd }||jddd }||| jjd}| 	|}|rN|||fS ||fS )z
            Dynamic Forward Function of VoP

            Args:
                data: the input data
                return_all_frames: default False
        videor   textr(   r)   Tr*   )
r-   r.   r   r/   r   r:   r0   r1   r2   r   )	r   datar4   r5   r;   r6   r<   r7   r8   r&   r&   r'   forwardb   s*   

z%VideoTextRetrievalModelSeries.forward)F)
__name__
__module____qualname____doc__strr   r9   r=   rA   __classcell__r&   r&   r$   r'   r      s    
r   c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )r   z(
        Redefined Pooling Function
    c                    s&   t t|   |dkr| j| _d S t)Navg)r   r   r   _avg_poolingpooling_funcNotImplementedError)r   r   r$   r&   r'   r      s   zBaselinePooling.__init__c                 C   s   |j dd}|S )z
            Pooling mean of frames

            Args:
                video_embeds: the input video embedding with [1, 12, 512].

            Returns:
                video_embeds_pooled: num_vids x embed_dim
        r   )r+   )mean)r   video_embedsvideo_embeds_pooledr&   r&   r'   rI      s   
zBaselinePooling._avg_poolingc                 C   s
   |  |S )N)rJ   )r   rM   r&   r&   r'   rA      s   
zBaselinePooling.forward)rB   rC   rD   rE   r   rI   rA   rG   r&   r&   r$   r'   r      s
    r   )osos.pathpathr   torchtorch.nnnntorch.nn.functional
functionalFmodelscope.metainfor   'modelscope.models.base.base_torch_modelr   modelscope.models.builderr   modelscope.utils.configr   modelscope.utils.constantr   r   backboner	   basic_utilsr
   r   register_modulevop_retrievalvop_retrieval_model_ser   r   r&   r&   r&   r'   <module>   s"   m