o
    ߥi4                     @   s  d dl Z d dlmZ d dlZd dlmZ d dlm  mZ	 d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZmZ ddlmZ dd	lmZmZ ejejejd
G dd deZG dd deZG dd deZG dd deZ G dd deZ!G dd deZ"dS )    N)Models)
TorchModel)MODELS)Config)	ModelFileTasks   )	load_clip)get_state_dictset_seed)module_namec                       sB   e Zd ZdZdef fddZdddZdd	 Zdd
dZ  Z	S )VoPa  
        The implementation of 'VoP: Text-Video Co-operative Prompt Tuning for Cross-Modal Retrieval'.
        This model is dynamically initialized with the following parts:
            - clip: the upstream pre-trained backbone model (CLIP in this code)
            - pool_frames: the frames pooling method
            - visual_prompt_learner: visual prompt
            - ImageEncoder: get image encoder
            - TextPromptLearner: text prompt
            - TextEncoder: get text encoder
    	model_dirc                    s   t t|   t|d}t|d}t|tj}t|j	| _
t|d| _tt| jjjj| j
_tt| jjj| j
_t| j
j| j
| _t| j| j
| _t| j| j
| _t| j| j
| _t| j| j
| _| t | | !  dt"j#d< t$| j
j% dS )zl
            Initialize a VoP Model

            Args:
                model_dir: model id or path,
        zVoP_msrvtt9k.pthzViT-B-32.pt)namefalseTOKENIZERS_PARALLELISMN)&superr   __init__ospjoinr   CONFIGURATIONr   	from_file
hyperparamconfigr	   cliplistrangevisualtransformerlayers
vpt_layers
tpt_layersBaselinePoolingpooling_typepool_framesVisualPromptLearnervisual_prompt_learnerImageEncoderimage_encoderTextPromptLearnertext_prompt_learnerTextEncodertext_encoderload_state_dictr
   evalosenvironr   seed)selfr   argskwargs
model_path	clip_archconfig_path	__class__ \/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/cv/vop_retrieval/model.pyr   !   s.   
zVoP.__init__Fc                 C   sv   |j d }|dd| jj| jj}|  }| ||}||jddd }||| jjd}| d|}|r9||fS |S )z
            Get video Features

            Args:
                videos: the dim is [1, 12, 3, 224, 224]
                return_all_frames: default False
        r      TdimkeepdimN)	shapereshaper   	input_resr&   r(   norm
num_framesr$   )r2   videosreturn_all_frames
batch_size
video_datavisual_promptsvideo_featuresvideo_features_pooledr:   r:   r;   get_video_featuresF   s    
zVoP.get_video_featuresc                 C   s*   |   }| ||}||jddd }|S )zh
            Get Text Features

            Args:
                text_data: the dim is [1, 69]
        r<   Tr>   )r*   r,   rD   )r2   	text_datatext_promptstext_featuresr:   r:   r;   get_text_featuresa   s   zVoP.get_text_featuresc                 C   s   |d j d }|d }|d }|dd| jj| jj}|  }| ||}|  }| ||}	|	|	jddd }	||jddd }||| jj	d}| 
|	|}
|rW|	||
fS |	|
fS )z
            Dynamic Forward Function of VoP

            Args:
                data: the input data
                return_all_frames: default False
        videor   textr<   r=   Tr>   )rA   rB   r   rC   r&   r(   r*   r,   rD   rE   r$   )r2   datarG   rH   rN   rI   rJ   rK   rO   rP   rL   r:   r:   r;   forwardo   s.   
zVoP.forward)F)
__name__
__module____qualname____doc__strr   rM   rQ   rU   __classcell__r:   r:   r8   r;   r      s    
%r   c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )r"   z(
        Redefined Pooling Function
    c                    s&   t t|   |dkr| j| _d S t)Navg)r   r"   r   _avg_poolingpooling_funcNotImplementedError)r2   r#   r   r8   r:   r;   r      s   zBaselinePooling.__init__c                 C   s   |j dd}|S )a$  
            Pooling mean of frames

            Args:
                text_embeds: the input text embedding which is None here.
                video_embeds: the input video embedding with [1, 12, 512].

            Returns:
                video_embeds_pooled: num_vids x embed_dim
        r   r?   )mean)r2   text_embedsvideo_embedsvideo_embeds_pooledr:   r:   r;   r]      s   zBaselinePooling._avg_poolingc                 C   s   |  ||S N)r^   )r2   rb   rc   r:   r:   r;   rU      s   zBaselinePooling.forward)rV   rW   rX   rY   r   r]   rU   r[   r:   r:   r8   r;   r"      s
    r"   c                       (   e Zd ZdZ fddZdd Z  ZS )r%   a  
        The implementation of visual prompt.
        This module is used to define the learnable prompt parameters:
            the number of tokens is 8,
            the prompt dimension is 768,
            and the initialization weight std used is 0.02.
    c                    sd   t t|   |j}|jjjjd }|j}t	j
t|jd|||d}tjj|dd t|| _d S )Nr   r   dtype{Gz?std)r   r%   r   vp_token_numr   ln_postweightrA   rh   torchemptylenr    nninitnormal_	ParameterrJ   )r2   
clip_modelr   rl   vp_dimrh   rJ   r8   r:   r;   r      s   zVisualPromptLearner.__init__c                 C   s
   | j }|S re   )rJ   )r2   vpr:   r:   r;   rU      s   zVisualPromptLearner.forwardrV   rW   rX   rY   r   rU   r[   r:   r:   r8   r;   r%      s    r%   c                       rf   )r)   a  
        The implementation of visual prompt.
        This module is used to define the learnable prompt parameters:
            the number of tokens is 4,
            the prompt dimension is 512,
            and the initialization weight std used is 0.02.
    c                    s   t t|   |j}|j}|dkr|dksJ |jjjd }|j}t	j
t|j|| ||d}tjj|dd t|| _|| _|| _d S )Nr   rg   ri   rj   )r   r)   r   tp_prefix_token_numtp_suffix_token_numln_finalrn   rA   rh   ro   rp   rq   r!   rr   rs   rt   ru   rO   )r2   rv   r   rz   r{   tp_dimrh   rO   r8   r:   r;   r      s    
zTextPromptLearner.__init__c                 C   s<   | j d d d | jd d f | j d d | jd d d f fS re   )rO   rz   )r2   r:   r:   r;   rU      s   zTextPromptLearner.forwardry   r:   r:   r8   r;   r)      s    r)   c                       rf   )r'   z
        The implementation of image encoder.
        This module is used to obtain the features of each frame of the video.
    c                    sv   t t|   || _|j| _|j| _|j| _|jj| _|jj	| _	|jj
| _
|jj| _|jj| _|jj| _|jj| _d S re   )r   r'   r   r   r    rl   rE   r   conv1class_embeddingpositional_embeddingln_prer   rm   projr2   rv   r   r8   r:   r;   r      s   





zImageEncoder.__init__c           	   	   C   s  |j d }| |}|||j d d}|ddd}| j|j}tj|d|j d |j|j	d}|| }tj
||gdd}|| j|j }t| jjD ]}|| jv r| j|}||ddddddf |dd}tj
|ddddddf ||ddddddf gdd}|dkr| |}|ddd}| jj| |}|ddd}|d | jv rtj
|ddddddf |ddd| j dddf gdd}qM| |dddddf }| jdur|| j }|S )a   
            The forward function of image encoder.

            Args:
                visual_prompts: the visual prompt, dim is [12, 1, 8, 768]
                x: the input data, dim is [12, 3, 224, 224]

            Returns:
                x: the output data, dim is [12, 512]
        r   r   r<      )rh   devicer`   N)rA   r~   rB   permuter   torh   ro   zerosr   catr   r   r   r   r    indexrepeatr   	resblocksrl   rm   r   )	r2   rJ   xrH   x_1x_2i_layeri_promptcur_layer_vpr:   r:   r;   rU     s@   


@
<

zImageEncoder.forwardry   r:   r:   r8   r;   r'      s    r'   c                       rf   )r+   z
        The implementation of text encoder.
        This module is used to obtain the features of each word of the sentence.
    c                    sv   t t|   |j| _|j| _|j| _|j| _|j| _|j| _|j	| _	d| j	v s*J |j
| _
|j| _|j
|j | _d S )Nr   )r   r+   r   r   token_embeddingr   r|   text_projectionrh   r!   rz   r{   tp_token_numr   r8   r:   r;   r   9  s   zTextEncoder.__init__c              	   C   s8  |  || j}|jd }|\}}t| jjD ]}|| jv r| j|}| j	dkr^|||d ddddf 
|dd}	tj|ddddddf |	|ddddddf gdd}| jdkr|||d ddddf 
|dd}
tj|ddddddf |
|ddddddf gdd}|dkr|| j| j }|ddd}| jj| |}|ddd}|d | jv r|ddddddf }|ddd| j	 d| j ddf }|ddddddf }tj|||gdd}|}q| || j}|t|jd |jdd| j f | j }|S )a  
            The forward function of text encoder.

            Args:
                text_prompts: the text prompt, dim is 2 x [12, 4, 512]
                text: the input data, dim is [1, 69]

            Returns:
                x: the output data, dim is [1, 512]
        r   r   Nr<   r`   r   )r   typerh   rA   r   r   r   r!   r   rz   expandro   r   r{   r   r   r   r|   arangeargmaxr   r   )r2   rO   rS   r   rH   prompt_prefixprompt_suffixr   r   cur_layer_tp_prefixcur_layer_tp_suffixtemp_1temp_2temp_3tempr:   r:   r;   rU   H  sp   


6
4zTextEncoder.forwardry   r:   r:   r8   r;   r+   3  s    r+   )#r/   os.pathpathr   ro   torch.nnrr   torch.nn.functional
functionalFmodelscope.metainfor   'modelscope.models.base.base_torch_modelr   modelscope.models.builderr   modelscope.utils.configr   modelscope.utils.constantr   r   backboner	   basic_utilsr
   r   register_modulevop_retrievalvop_retrieval_modelr   r"   r%   r)   r'   r+   r:   r:   r:   r;   <module>   s*   }"G