o
    ॵip3                     @   sN  d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	 d dl
Zd dlZd dlZd dlm  mZ d dlm  mZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lm Z  ddl!m"Z" ddl#m$Z$ 		dddZ%dd Z&dd Z'	dddZ(dddZ)G dd de*Z+e$j,ej-ej.dG dd de"Z/dS )    N)exists)TemporaryDirectory)urlparse)VideoReader)Compose)http_get_file)Preprocessors)FieldsModeKeys)type_assert   )Preprocessor)PREPROCESSORSc                 C   s  t |}|jdv rt|jrt| ||}n-t "}t j}t	|||dd t
j||}t| ||}W d   n1 s=w   Y  |durN|}	t| |}
n| jj}	t| | jj}
g }t|dD ]}t|	D ]}|
jd | ||
||  qhqbtj|ddS )aw   simple interface to load video frames from file

    Args:
        cfg (Config): The global config object.
        video_path (str): video file path
        num_spatial_crops_override (int): the spatial crops per clip
        num_temporal_views_override (int): the temporal clips per video
    Returns:
        data (Tensor): the normalized video clips for model inputs
    )file N)url	local_dir	file_namecookiesr   r   dim)r   schemer   path_decode_videor   uuiduuid4hexr   osjoinkinetics400_tranformTESTNUM_SPATIAL_CROPSrangesize
transformsset_spatial_indexappendtorchstack)cfg
video_pathnum_spatial_crops_overridenum_temporal_views_override
url_parseddatatemporary_cache_dir
random_strtemp_file_pathnum_spatial_crops	transform	data_listij r7   R/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/preprocessors/video.pyReadVideoData   s<   
r9   c                 C   sH   t | jj| jjg| jj|d}t |tj| jj| jjddg}t	|S )aI  
    Configs the transform for the kinetics-400 dataset.
    We apply controlled spatial cropping and normalization.
    Args:
        cfg (Config): The global config object.
        num_spatial_crops (int): the spatial crops per clip
    Returns:
        transform_function (Compose): the transform function for input clips
    )short_side_range	crop_sizer2   T)meanstdinplace)
KineticsResizedCropDATA
TEST_SCALETEST_CROP_SIZEr$   ToTensorVideoNormalizeVideoMEANSTDr   )r)   r2   resize_videostd_transform_listr7   r7   r8   r   E   s   
r   c                 C   s   |dkrt d| d g}|S || | | }	t| |	 d}
|dkr'|
d }n|t|
|d   }|r;||	 | }n||	 d }t|||}t|d| d  }|S )a  
        Generates the frame index list using interval based sampling.

        Args:
            vid_length (int): the length of the whole video (valid selection range).
            vid_fps (int): the original video fps
            target_fps (int): the normalized video fps
            clip_idx (int):
                -1 for random temporal sampling, and positive values for sampling specific
                clip from the video
            num_clips (int):
                the total clips to be sampled from each video. combined with clip_idx,
                the sampled video is the "clip_idx-th" video from "num_clips" videos.
            num_frames (int): number of frames in each sampled clips.
            interval (int): the interval to sample each frame.
            minus_interval (bool): control the end index

        Returns:
            index (tensor): the sampled frame indexes
    r   r      )	randomrandintmaxmathfloorr'   linspaceclamplong)
vid_lengthvid_fps
target_fpsclip_idx	num_clips
num_framesintervalminus_intervalindexclip_lengthmax_idx	start_idxend_idxr7   r7   r8   _interval_based_sampling[   s   
r_   c           
   
      s   t  tsJ |dur|}n| jj}g }t|D ]1}tt || jj||| jj	| jj
| jj}d}ttj fdd| D dd}|| qt|}~	|S )a  
        Decodes the video given the numpy frames.
        Args:
            cfg          (Config): The global config object.
            frames_list  (list):  all frames for a video, the frames should be numpy array.
            vid_fps      (int):  the fps of this video.
            num_temporal_views_override (int): the temporal clips per video
        Returns:
            frames            (Tensor): video tensor data
    Nc                    s   g | ]} | qS r7   r7   ).0rZ   frames_listr7   r8   
<listcomp>   s    z-_decode_video_frames_list.<locals>.<listcomp>r   )axis)
isinstancelistr    NUM_ENSEMBLE_VIEWSr"   r_   lenr@   
TARGET_FPSNUM_INPUT_FRAMESSAMPLING_RATEMINUS_INTERVALr'   
from_numpynpr(   tolistr&   )
r)   rb   rS   r,   num_clips_per_video
frame_listrU   list_framesvrr7   ra   r8   _decode_video_frames_list   s0   

ru   c           
   
   C   s   t |}|dur|}n| jj}g }t|D ]U}tt|| | jj||| jj	| jj
| jj}d}|drYtd|d d}	t|t|	|g  }||	jd d }nt||  }|| qt|}~|S )aK  
        Decodes the video given the numpy frames.
        Args:
            cfg          (Config): The global config object.
            path          (str): video file path.
            num_temporal_views_override (int): the temporal clips per video
        Returns:
            frames            (Tensor): video tensor data
    Nz.avir      )r   r    rg   r"   r_   rh   get_avg_fpsr@   ri   rj   rk   rl   endswithr'   arangedlpackfrom_dlpack	get_batchcat	to_dlpackcloneshaper&   r(   )
r)   r   r,   rt   rp   rq   rU   rr   rs   append_listr7   r7   r8   r      sJ   




r   c                   @   s<   e Zd ZdZ	dddZdd Zdd Zd	d
 Zdd ZdS )r?   aA  Perform resize and crop for kinetics-400 dataset
    Args:
        short_side_range (list): The length of short side range. In inference, this shoudle be [256, 256]
        crop_size         (int): The cropped size for frames.
        num_spatial_crops (int): The number of the cropped spatial regions in each video.
    r   c                 C   s    d| _ || _t|| _|| _d S )N)idxr:   intr;   r2   )selfr:   r;   r2   r7   r7   r8   __init__   s   

zKineticsResizedCrop.__init__c                 C   sv  |j \}}}}| jd }||k r)t|}t|| | }tjjj|||fdd}nt|}t|| | }tjjj|||fdd}t|| j }	t|| j }
| jdkr]|	d }|
d }nH| jdkr| j	dkr}||krr|	d }d}n3||kr|d}|
d }n(| j	dkr|	d }|
d }n| j	dkr||kr|	d }|
}n
||kr|	}|
d }|dddd||| j ||| j f S )zPerform controlled crop for video tensor.
        Args:
            clip (Tensor): the video data, the shape is [T, C, H, W]
        r   bilinearr#   moder   rI      N)
r   r:   r   r'   nn
functionalinterpolater;   r2   r   )r   clip_clip_height
clip_widthlengthnew_clip_heightnew_clip_widthnew_clipx_maxy_maxxyr7   r7   r8   _get_controlled_crop   sL   









,z(KineticsResizedCrop._get_controlled_cropc                 C   s   |j \}}}}t||}t||}ttj| j }t|| | }||k r*|}	|}
n|}	|}
tjj	j
||	|
fdd}t|
| j }t|	| j }ttd|}ttd|}|d d d d ||| j ||| j f S )Nr   r   r   )r   minrL   r   rJ   uniformr:   r'   r   r   r   r;   )r   r   r   r   r   
short_side	long_sidenew_short_sidenew_long_sider   r   r   r   r   r   r   r7   r7   r8   _get_random_crop  s$   


,z$KineticsResizedCrop._get_random_cropc                 C   s
   || _ dS )zSet the spatial cropping index for controlled cropping..
        Args:
            idx (int): the spatial index. The value should be in [0, 1, 2], means [left, center, right], respectively.
        N)r   )r   r   r7   r7   r8   r%   6  s   
z%KineticsResizedCrop.set_spatial_indexc                 C   s
   |  |S N)r   )r   r   r7   r7   r8   __call__=  s   
zKineticsResizedCrop.__call__N)r   )	__name__
__module____qualname____doc__r   r   r   r%   r   r7   r7   r7   r8   r?      s    
,r?   )module_namec                       s>   e Zd Z fddZdd Zdd Zeeedd Z  Z	S )	"MovieSceneSegmentationPreprocessorc                    sv   t  j|i | |dd| _|tjd| _|tjd| _|dd| _	ddl
m} || j| _|| j| _dS )z7
        movie scene segmentation preprocessor
        is_trainTNnum_keyframer   r   )get_transform)superr   popr   r
   TRAINpreprocessor_train_cfgEVALpreprocessor_test_cfgr   movie_scene_segmentationr   train_transformtest_transform)r   argskwargsr   	__class__r7   r8   r   E  s   z+MovieSceneSegmentationPreprocessor.__init__c                 C   
   d| _ d S )NTr   r   r7   r7   r8   trainT     z(MovieSceneSegmentationPreprocessor.trainc                 C   r   )NFr   r   r7   r7   r8   evalX  r   z'MovieSceneSegmentationPreprocessor.evalc                 C   s>   | j r| j}n| j}tj||dd}|d| jddd}|S )Nr   r   r   r      )r   r   r   r'   r(   viewr   )r   resultsr$   r7   r7   r8   r   \  s   z+MovieSceneSegmentationPreprocessor.__call__)
r   r   r   r   r   r   r   objectr   __classcell__r7   r7   r   r8   r   A  s    r   )NNr   )0rM   r   rJ   r   os.pathr   tempfiler   urllib.parser   numpyrn   r'   torch.utils.datatorch.utils.dlpackutilsrz   (torchvision.transforms._transforms_videor$   _transforms_videodecordr   torchvision.transformsr   modelscope.hub.file_downloadr   modelscope.metainfor   modelscope.utils.constantr	   r
   modelscope.utils.type_assertr   baser   builderr   r9   r   r_   ru   r   r   r?   register_modulecv%movie_scene_segmentation_preprocessorr   r7   r7   r7   r8   <module>   sB    
,/

+.a