o
    ॵi                     @   s,  d dl Z d dlZd dlZd dlmZ d dlZd dlZd dlm	Z	m
Z
 d dlmZmZ d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZmZmZmZmZ d dlmZ d d	lmZm Z  d d
l!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z(m)Z) d dl*m+Z+ e+ Z,e"j-e)j.ej.dG dd de Z/dS )    N)defaultdictdeque)AnyDict)tqdm)	Pipelines)Model)LengthAdaptiveTokenizerVoPinit_transform_dict	load_dataload_frames_from_video)
OutputKeys)InputPipeline)	PIPELINES)
load_image)Config)	ModelFileTasks)
get_logger)module_namec                       s   e Zd Zdef fddZdedeeef fddZdeeef deeef fdd	Z	d
eeef deeef fddZ
  ZS )VopRetrievalPipelinemodelc                    s   t  jdd|i| td| j| _td || _	t
t|tj| _t| jjjd | _td tt|d dd}t| jj|| _td	 tt|d
| j| _td dS )z
        use `model` to create a vop pipeline for retrieval
        Args:
            model: model id on modelscope hub.
        r   zdamo/cv_vit-b32_retrieval_vopzload model done	clip_testzload transform donezbpe_simple_vocab_16e6.txt.gzzutf-8
zload tokenizer donezVoP_msrvtt9k_features.pklzload database doneN )super__init__r   from_pretrainedtodevicer   loggerinfo	local_pthr   	from_fileospjoinr   CONFIGURATIONcfgr   
hyperparam	input_resimg_transformgzipopenreaddecodesplitr	   	tokenizerr   database)selfr   kwargsbpe_path	__class__r   b/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/pipelines/cv/vop_retrieval_pipeline.pyr   $   s2   



zVopRetrievalPipeline.__init__inputreturnc                    s   t |trfd|v r@g }|fD ]!}t j|}t| jjj jjj	\}} 
|}|| qtj|ddj jdd}d}n/ j|dddd}t |tjrX|j jdd}n fd	d
| D }d}n	tdt| ||d}|S )Nz.mp4r   )dimTnon_blockingv2tpt)return_tensorspadding
truncationc                    s"   i | ]\}}||j  jd dqS )Tr=   )r    r!   ).0keyvalr4   r   r9   
<dictcomp>Y   s    z3VopRetrievalPipeline.preprocess.<locals>.<dictcomp>t2vz input should be a str,  but got )
input_datamode)
isinstancestrr&   r'   r$   r   r)   r*   
num_framesvideo_sample_typer,   appendtorchstackr    r!   r2   Tensoritems	TypeErrortype)r4   r:   query
video_pathimgsidxsrK   resultr   rG   r9   
preprocessD   s@   





zVopRetrievalPipeline.preprocessc                 C   s   | j \}}}}t h |d dkr8| j|d }||j }tj|| jjjddd 	 
 }t|| }	n+|d dkrc| j|d }||j }tj|| jjjddd 	 
 }t|| }	|	|d d}
|
W  d    S 1 svw   Y  d S )	NrK   rI   rJ   )kr<      r?   )output_datarK   )r3   rQ   no_gradr   get_text_featuresTtopkr)   r*   cpunumpynparrayget_video_features)r4   r:   text_embedsvid_embeds_pooledvid_idstextsquery_featsscoreretrieval_idxsresresultsr   r   r9   forwardd   s>   




$zVopRetrievalPipeline.forwardinputsc                 C   s   |S )Nr   )r4   rt   r   r   r9   postprocessy   s   z VopRetrievalPipeline.postprocess)__name__
__module____qualname__rM   r   r   r   r   r\   rs   ru   __classcell__r   r   r7   r9   r       s
     " *r   )0r-   mathosos.pathpathr&   picklerandomcollectionsr   r   typingr   r   rf   rg   rQ   r   modelscope.metainfor   modelscope.modelsr   "modelscope.models.cv.vop_retrievalr	   r
   r   r   r   modelscope.outputsr   modelscope.pipelines.baser   r   modelscope.pipelines.builderr   modelscope.preprocessorsr   modelscope.utils.configr   modelscope.utils.constantr   r   modelscope.utils.loggerr   r"   register_modulevop_retrievalr   r   r   r   r9   <module>   s4   