o
    ߥiD                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlZd dlZ	d dl
Z
d dlZd dlmZ d dlmZ dddZdd	 Zd
d Zdd Zdd ZdddZdddZdS )    N)OrderedDict)Image)
transforms   c              
   C   sd   t t j| tjdt | t ddgt t j| ddt  t j	ddddt ddgd}|S )	z
        The implementation of transforms functions.
        The default image resolution is 224.
        The normalize parameter follows the mainstream setting.
    )interpolation)g3<4'?gwgM?gy{ ?)gB91?gwt.?g	U?)g      ?g      ?)scaler   )
brightness
saturationhue)	clip_test
clip_train)
r   ComposeResizer   BICUBIC
CenterCrop	NormalizeRandomResizedCropRandomHorizontalFlipColorJitter)	input_res	tsfm_dict r   b/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/cv/vop_retrieval/basic_utils.pyinit_transform_dict   s"   r   c                 C   sF   t | }|d j|d}|d j|d}|d }|d }||||gS )a7  
        Loading dataset from 'feature_path' as a retrieval docs.
        The default dataset is MSRVTT-9K.

        Args:
            feature_path: 'VoP_msrvtt9k_features.pkl'
            mydevice: device(type='cuda', index=0)

        Returns:
            [text_embeds, vid_embeds_pooled, vid_ids, texts]
    text_embeds)device
vid_embedsvid_idstexts)torchloadto)feature_pathmydevicefeature_contentr   vid_embeds_pooledr   r   r   r   r   	load_data,   s   
r&   c                 C   s8   t | d}t|W  d   S 1 sw   Y  dS )z
        Load json files.
    rN)openjsonr    )filenamefr   r   r   	load_json@   s   $r,   c                 C   sP   | dkr&t |  tj|  t j|  t|  dt jj_	dt jj_
dS dS )z
        Set random seed.
    r   TFN)r   manual_seednprandomseedcudamanual_seed_allbackendscudnndeterministic	benchmark)r0   r   r   r   set_seedH   s   


r7   c                 C   sB   t | }|d }t }| D ]\}}|||dd< q|}|S )z0
        Load pre-train parameters for VoP.
    
state_dictzmodule. )r   r    r   itemsreplace)checkpoint_path
checkpointr8   new_state_dictkvr   r   r   get_state_dictU   s   
rA   randc                 C   s@  t ||}tjd||d dt}g }t|dd D ]\}}||||d  d f q|dkr:dd |D }	nd	d |D }	g }
|	D ]A}| tj	| | 
 \}}|sid
}t|D ]}| 
 \}}|rh nq\|sn dS t|tj}t|}|ddd}|
| qEt|
|k r|
|
d   t|
|k s|
|	fS )a  
        Get indexes of sampled frames.

        Args:
            cap: cv2.VideoCapture
            num_frames: int - number of frames to sample
            vlen: video length, int(cap.get(cv2.CAP_PROP_FRAME_COUNT)), 325
            sample: 'rand' | 'uniform' how to sample

        Returns:
            frames: torch.tensor of stacked sampled video frames
                    of dim (num_frames, C, H, W)
            frame_idxs: list(int) indices of where the frames where sampled
    r      )startstopnumNrB   c                 S   s$   g | ]}t t|d  |d qS )r   rC   )r/   choicerange.0xr   r   r   
<listcomp>z   s   $ z$get_valid_frames.<locals>.<listcomp>c                 S   s    g | ]}|d  |d  d qS )r   rC      r   rJ   r   r   r   rM   |   s        )NNrN   )minr.   linspaceastypeint	enumerateappendsetcv2CAP_PROP_POS_FRAMESreadrI   cvtColorCOLOR_BGR2RGBr   
from_numpypermutelenclone)cap
num_framesvlensampleacc_samples	intervalsrangesidxinterv
frame_idxsframesindexretframen_tries_r   r   r   get_valid_framesd   sB   


rp   c                 C   s^   t | }| sJ | t|t j}t||||\}}t|	 d }|
  ||fS )a  
        Get indexes of sampled frames.

        Args:
            video_path: the local video path
            num_frames: Frame number, 12 frames for each video
            sample: 'rand' | 'uniform' how to sample

        Returns:
            frames: torch.tensor of stacked sampled video frames
                    of dim (num_frames, C, H, W)
            frame_idxs: list(int) indices of where the frames where sampled
       )rW   VideoCaptureisOpenedrS   getCAP_PROP_FRAME_COUNTrp   r   stackfloatrelease)
video_pathra   rc   r`   rb   rj   ri   r   r   r   load_frames_from_video   s   
rz   )r   )rB   )ospickler/   shutilzipfilecollectionsr   rW   numpyr.   r   ujsonr)   PILr   torchvisionr   r   r&   r,   r7   rA   rp   rz   r   r   r   r   <module>   s&   

2