o
    ϯiw                     @   s   d dl Z d dlmZ d dlmZ d dlmZmZm	Z	m
Z
mZ d dlZd dlmZmZmZmZmZ d dlmZmZ d dlmZ G dd	 d	eZG d
d deZdS )    N)fields)Enum)AnyCallableDictListOptional)EncodedVideoInfoVideoClipInfoVideoDatasetTypeVideoFrameInfo	VideoInfo)
ActionDataEpicKitchenDataset)Videoc                   @   s   e Zd ZdZdS )ClipSampling   N)__name__
__module____qualname__RandomOffsetUniform r   r   ^/home/ubuntu/.local/lib/python3.10/site-packages/pytorchvideo/data/epic_kitchen_recognition.pyr      s    r   c                       s  e Zd ZdZejejddddfdedededed	ed
e	de
e deeeef gef def fddZedeeeef geeef f deeeef geeef f fddZededeee gee f fddZed
e	dedeeeef eeee f gee f fddZ  ZS )EpicKitchenRecognitionz
    Action recognition video data set for EpicKitchen-55 Dataset.
    <https://epic-kitchens.github.io/2019/>

    This dataset handles the loading, decoding, and clip sampling for the videos.
    g       @NTvideo_info_file_pathactions_file_pathvideo_data_manifest_file_pathclip_samplingdataset_typeseconds_per_clipframes_per_clip	transformmultithreaded_ioc
              
      s   ddd t tD  ddd t tD  ddd t tD  dd	d t tD  d
	 t||}
t|}|d ur?t|nd }t	 j
|||||||
|	d d S )Nz
        Args:
            video_info_file_path (str):
                Path or URI to manifest with basic metadata of each video.
                File must be a csv (w/header) with columns:
                c                 S      g | ]}|j qS r   name.0fr   r   r   
<listcomp>1       z3EpicKitchenRecognition.__init__.<locals>.<listcomp>z

            actions_file_path (str):
                Path or URI to manifest with action annotations for each video.
                File must ber a csv (w/header) with columns:
                c                 S   r#   r   r$   r&   r   r   r   r)   6   r*   a  

            video_data_manifest_file_path (str):
                The path to a json file outlining the available video data for the
                associated videos. File must be a csv (w/header) with columns either:

                For Frame Videos:
                c                 S   r#   r   r$   r&   r   r   r   r)   =   r*   z6

                For Encoded Videos:
                c                 S   r#   r   r$   r&   r   r   r   r)   @   r*   a  

                To generate this file from a directory of video frames, see helper
                functions in Module: pytorchvideo.data.epic_kitchen.utils

            clip_sampling (ClipSampling):
                The type of sampling to perform to perform on the videos of the dataset.

            dataset_type (VideoDatasetType): The dataformat in which dataset
                video data is store (e.g. video frames, encoded video etc).

            seconds_per_clip (float): The length of each sampled clip in seconds.

            frames_per_clip (Optional[int]): The number of frames per clip to sample.

            transform (Callable[[Dict[str, Any]], Any]):
                This callable is evaluated on the clip output before the clip is returned.
                It can be used for user-defined preprocessing and augmentations to the clips.
                The clip input is a dictionary with the following format:
                    {
                        'video_id': <str>,
                        'video': <video_tensor>,
                        'audio': <audio_tensor>,
                        'label': <List[ActionData]>,
                        'start_time': <float>,
                        'stop_time': <float>
                    }

                If transform is None, the raw clip output in the above format is
                    returned unmodified.

            multithreaded_io (bool):
                Boolean to control whether parllelizable io operations are performed across
                multiple threads.
        )r   r   r   r   r!   frame_filterclip_samplerr"   )dataclass_fieldsr   r   r   r	   r    _define_clip_structure_generator_transform_generator_frame_filter_generatorsuper__init__)selfr   r   r   r   r   r   r    r!   r"   define_clip_structure_fnr+   	__class__r   r   r2       s8   
8


zEpicKitchenRecognition.__init__returnc                    s*   dt ttf dt ttf f fdd}|S )aB  
        Args:
            transform (Callable[[Dict[str, Any]], Dict[str, Any]]): A function that performs
            any operation on a clip before it is returned in the default transform function.

        Returns:
            A function that performs any operation on a clip and returns the transformed clip.
        clipr7   c                    sR    fdd d D }| d<  D ]} | d u r t g  |< qr'   S )Nc                    s,   g | ]}|j  d  kr|j d kr|qS )	stop_time
start_time)r:   r9   )r'   ar8   r   r   r)      s    zWEpicKitchenRecognition._transform_generator.<locals>.transform_clip.<locals>.<listcomp>actions)torchtensor)r8   actions_in_clipkeyr!   r<   r   transform_clip   s   
zCEpicKitchenRecognition._transform_generator.<locals>.transform_clip)r   strr   )r!   rC   r   rB   r   r/   z   s   &z+EpicKitchenRecognition._transform_generatorc                    s"   dt t dt t f fdd}|S )z
        Args:
            frames_per_clip (int): The number of frames per clip to sample.

        Returns:
            A function that takes in a list of frame indicies and outputs a subsampled list.
        frame_indicesr7   c                    s:   t | }t| }ttd||  fddt| D S )Nr   c                    s   g | ]
\}}| v r|qS r   r   )r'   ixselected_framesr   r   r)      s    zWEpicKitchenRecognition._frame_filter_generator.<locals>.frame_filer.<locals>.<listcomp>)lenintsetrange	enumerate)rE   
num_frames
frame_stepr    rH   r   frame_filer   s   zCEpicKitchenRecognition._frame_filter_generator.<locals>.frame_filer)r   rK   )r    rR   r   rQ   r   r0      s   z.EpicKitchenRecognition._frame_filter_generatorc                    sX   |t jkstdt j d| ddtttf dtttt f dtt f fdd}|S )	a  
        Args:
            seconds_per_clip (float): The length of each sampled clip in seconds.
            clip_sampling (ClipSampling):
                The type of sampling to perform to perform on the videos of the dataset.

        Returns:
            A function that takes a dictionary of videos and a dictionary of the actions
            for each video and outputs a list of sampled clips.
        zOnly z is implemented. z not implemented.videosr=   r7   c                    sp   g }|   D ]/\}}t   }t|j|   }t|D ]}|  | }|  }	t|||	}
||
 qq|S )N)itemsrandomrK   durationrM   r
   append)rS   r=   clipsvideo_idvideooffset	num_clipsrF   r:   r9   r8   r   r   r   define_clip_structure   s   zVEpicKitchenRecognition._define_clip_structure_generator.<locals>.define_clip_structure)	r   r   NotImplementedErrorr   rD   r   r   r   r
   )r   r   r^   r   r]   r   r.      s   


z7EpicKitchenRecognition._define_clip_structure_generator)r   r   r   __doc__r   r   r   FramerD   floatr   rK   r   r   r   boolr2   staticmethodr/   r   r0   r   r   r
   r.   __classcell__r   r   r5   r   r      s^    	
Z"&r   )rU   dataclassesr   r-   enumr   typingr   r   r   r   r   r>   (pytorchvideo.data.dataset_manifest_utilsr	   r
   r   r   r   pytorchvideo.data.epic_kitchenr   r   pytorchvideo.data.videor   r   r   r   r   r   r   <module>   s   