o
    ϯiG                     @   s  d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	m
Z
mZmZmZmZ d dlZd dlZd dlmZ d dlmZ d dlmZmZmZmZmZ d dlmZmZ d d	lm Z  zd dl!Z!W n e"ym   d
Z#Y nw dZ#ddddddZ$ddddddddddddddZ%ddd d!Z&G d"d# d#eZ'e'j(e$e'j)e%e'j*e&iZ+eG d$d% d%eZ,	d;d&e-d'e.d(ee/ d)e.fd*d+Z0d,e-d-e-d.e-d/e-d)eee-e-f  f
d0d1Z1G d2d3 d3ej2j3j4Z5G d4d5 d5ej2j3j4Z6d<d7e7d8e.d)efd9d:Z8dS )=    N)	dataclass)Enum)AnyCallableDictListOptionalTuple)	g_pathmgr)Image)ImageDatasetImageFrameInfoVideoClipInfoVideoDatasetVideoDatasetType)DataclassFieldCasterload_dataclass_dict_from_csv)VideoFTnoneindoornaturecrowded_environmenturban)r               walkingrunningstandingbikingdrivingplayingcookingeating	observingin_conversationbrowsingshopping)r   r   r   r   r               	   
         paying_attentioninteracting)r   r   r   c                   @   s   e Zd ZdZdZdZdS )	LabelTyper   r   r   N)__name__
__module____qualname__EnvironmentActivityUserAttention r:   r:   L/home/ubuntu/.local/lib/python3.10/site-packages/pytorchvideo/data/domsev.pyr3   A   s    r3   c                   @   sJ   e Zd ZU dZeed< eed< eed< eed< eed< eed< eed< d	S )
	LabelDataz\
    Class representing a contiguous label for a video segment from the DoMSEV dataset.
    video_id
start_time	stop_timestart_frame
stop_framelabel_id
label_nameN)r4   r5   r6   __doc__str__annotations__floatintr:   r:   r:   r;   r<   N   s   
 r<   time_in_secondsfpszero_indexedreturnc                 C   s   t | | }|s|d7 }|S )a  
    Converts a point in time (in seconds) within a video clip to its closest
    frame indexed (rounding down), based on a specified frame rate.

    Args:
        time_in_seconds (float): The point in time within the video.
        fps (int): The frame rate (frames per second) of the video.
        zero_indexed (Optional[bool]): Whether the returned frame should be
            zero-indexed (if True) or one-indexed (if False).

    Returns:
        (int) The index of the nearest frame (rounding down to the nearest integer).
    r   )mathfloor)rI   rJ   rK   	frame_idxr:   r:   r;   _seconds_to_frame_index^   s   rP   t1_startt1_stopt2_startt2_stopc                 C   s0   | |kr||krt | |}t||}||fS dS )z
    Calculates the overlap between two time ranges, if one exists.

    Returns:
        (Optional[Tuple]) A tuple of <overlap_start_time, overlap_stop_time> if
        an overlap is found, or None otherwise.
    N)maxmin)rQ   rR   rS   rT   overlap_start_timeoverlap_stop_timer:   r:   r;    _get_overlap_for_time_range_pairt   s
   

rY   c                   @   s   e Zd ZdZ		ddedededeeeeef gef  de	d	dfd
dZ
edeeef deeee f fddZd	eeef fddZd	efddZdeeef d	eeef fddZdS )DomsevFrameDatasetz
    Egocentric video classification frame-based dataset for
    `DoMSEV <https://www.verlab.dcc.ufmg.br/semantic-hyperlapse/cvpr2018-dataset/>`_

    This dataset handles the loading, decoding, and configurable sampling for
    the image frames.
    NFvideo_data_manifest_file_pathvideo_info_file_pathlabels_file_path	transformmultithreaded_iorL   c                 C   sp   |sJ |sJ |sJ t |||}t|tddd}t||| _|| _| j| _	t
| | _t| j dS )a/  
        Args:
            video_data_manifest_file_path (str):
                The path to a json file outlining the available video data for the
                associated videos.  File must be a csv (w/header) with columns:
                ``{[f.name for f in dataclass_fields(EncodedVideoInfo)]}``

                To generate this file from a directory of video frames, see helper
                functions in module: ``pytorchvideo.data.domsev.utils``

            video_info_file_path (str):
                Path or URI to manifest with basic metadata of each video.
                File must be a csv (w/header) with columns:
                ``{[f.name for f in dataclass_fields(VideoInfo)]}``

            labels_file_path (str):
                Path or URI to manifest with temporal annotations for each video.
                File must be a csv (w/header) with columns:
                ``{[f.name for f in dataclass_fields(LabelData)]}``

            dataset_type (VideoDatasetType): The data format in which dataset
                video data is stored (e.g. video frames, encoded video etc).

            transform (Optional[Callable[[Dict[str, Any]], Any]]):
                This callable is evaluated on the clip output before the clip is returned.
                It can be used for user-defined preprocessing and augmentations to the clips.
                The clip output format is described in __next__().

            multithreaded_io (bool):
                Boolean to control whether io operations are performed across multiple
                threads.
        r=   Tlist_per_keyN)r   _load_imagesr   r<   rZ   _assign_labels_to_frames_labels_per_frame_user_transform_transform_frame
_transformlistvalues_framesrandomshuffle)selfr[   r\   r]   r^   r_   frames_dictvideo_labelsr:   r:   r;   __init__   s"   (
zDomsevFrameDataset.__init__rn   ro   c                 C   sP   i }|   D ]\}}||j }|D ]}|j|jkr$|j|jkr$|j||< qq|S )a8  
        Args:
            frames_dict: The mapping of <frame_id, ImageFrameInfo> for all the frames
                in the dataset.
            video_labels: The list of temporal labels for each video

        Also unpacks one label per frame.
        Also converts them to class IDs and then a tensor.
        )itemsr=   frame_numberr@   rA   rB   )rn   ro   labels_per_frameframe_id
image_infolabels_in_videolabelr:   r:   r;   rc      s   

z+DomsevFrameDataset._assign_labels_to_framesc                 C   sB   | j | }| j|j }t|j}|j||d}| jr| |}|S )a  
        Samples an image frame associated to the given index.

        Args:
            index (int): index for the image frame

        Returns:
            An image frame with the following format if transform is None.

            .. code-block:: text

                {{
                    'frame_id': <str>,
                    'image': <image_tensor>,
                    'label': <label_tensor>,
                }}
        )rt   imagerw   )rj   rd   rt   _load_image_from_pathframe_file_pathrg   )rm   indexframelabel_in_frame
image_data
frame_datar:   r:   r;   __getitem__   s   


zDomsevFrameDataset.__getitem__c                 C   
   t | jS )zK
        Returns:
            The number of frames in the dataset.
        )lenrj   rm   r:   r:   r;   __len__     
zDomsevFrameDataset.__len__r|   c                 C   8   |D ]}|| du rt g ||< q| jr| |}|S )a<  
        Transforms a given image frame, according to some pre-defined transforms
        and an optional user transform function (self._user_transform).

        Args:
            clip (Dict[str, Any]): The clip that will be transformed.

        Returns:
            (Dict[str, Any]) The transformed clip.
        Ntorchtensorre   )rm   r|   keyr:   r:   r;   rf        
z#DomsevFrameDataset._transform_frame)NF)r4   r5   r6   rD   rE   r   r   r   r   boolrp   staticmethodr   r   r<   rc   r   rH   r   rf   r:   r:   r:   r;   rZ      s4    
B
"&rZ   c                   @   s   e Zd ZdZejddddfdedededeeee	f eee
e f ge
e f d	ed
edeeeeef gef  deee
e ge
e f  deddfddZdeeef fddZdefddZdeeef deeef fddZdS )DomsevVideoDataseta3  
    Egocentric classification video clip-based dataset for
    `DoMSEV <https://www.verlab.dcc.ufmg.br/semantic-hyperlapse/cvpr2018-dataset/>`_
    stored as an encoded video (with frame-level labels).

    This dataset handles the loading, decoding, and configurable clip
    sampling for the videos.
    r   NFr[   r\   r]   clip_samplerdataset_typeframes_per_secondr^   frame_filterr_   rL   c
           
      C   sj   |sJ |sJ |sJ t |||	|| _t|tddd| _|| j| j| _|| _|| _| j	| _
|| _dS )ao	  
        Args:
            video_data_manifest_file_path (str):
                The path to a json file outlining the available video data for the
                associated videos.  File must be a csv (w/header) with columns:
                ``{[f.name for f in dataclass_fields(EncodedVideoInfo)]}``

                To generate this file from a directory of video frames, see helper
                functions in module: ``pytorchvideo.data.domsev.utils``

            video_info_file_path (str):
                Path or URI to manifest with basic metadata of each video.
                File must be a csv (w/header) with columns:
                ``{[f.name for f in dataclass_fields(VideoInfo)]}``

            labels_file_path (str):
                Path or URI to manifest with annotations for each video.
                File must be a csv (w/header) with columns:
                ``{[f.name for f in dataclass_fields(LabelData)]}``

            clip_sampler (Callable[[Dict[str, Video], Dict[str, List[LabelData]]],
                List[VideoClipInfo]]):
                Defines how clips should be sampled from each video. See the clip
                sampling documentation for more information.

            dataset_type (VideoDatasetType): The data format in which dataset
                video data is stored (e.g. video frames, encoded video etc).

            frames_per_second (int): The FPS of the stored videos. (NOTE:
                this is variable and may be different than the original FPS
                reported on the DoMSEV dataset website -- it depends on the
                preprocessed subsampling and frame extraction).

            transform (Optional[Callable[[Dict[str, Any]], Any]]):
                This callable is evaluated on the clip output before the clip is returned.
                It can be used for user-defined preprocessing and augmentations to the clips.
                The clip output format is described in __next__().

            frame_filter (Optional[Callable[[List[int]], List[int]]]):
                This callable is evaluated on the set of available frame indices to be
                included in a sampled clip. This can be used to subselect frames within
                a clip to be loaded.

            multithreaded_io (bool):
                Boolean to control whether io operations are performed across multiple
                threads.
        r=   Tr`   N)r   _load_videos_videosr   r<   _labels_per_video_clips_frames_per_secondre   _transform_cliprg   _frame_filter)
rm   r[   r\   r]   r   r   r   r^   r   r_   r:   r:   r;   rp   6  s(   =	
zDomsevVideoDataset.__init__c                    s   | j | }| j|j }g  |D ].}t|j|j|j|j}|dur=|\}}t|| j}t|| j}	t||	D ]}
 	| q5q fddtt
 D }t|}d|ji| j|j |j|j||j|jd}| jrq| |}|S )a/  
        Samples a video clip associated to the given index.

        Args:
            index (int): index for the video clip.

        Returns:
            A video clip with the following format if transform is None.

            .. code-block:: text

                {{
                    'video_id': <str>,
                    'video': <video_tensor>,
                    'audio': <audio_tensor>,
                    'labels': <labels_tensor>,
                    'start_time': <float>,
                    'stop_time': <float>
                }}
        Nc                    s   g | ]} | j qS r:   )rB   ).0ilabels_in_clipr:   r;   
<listcomp>  s    z2DomsevVideoDataset.__getitem__.<locals>.<listcomp>r=   )labelsr>   r?   )r   r   r=   rY   r>   r?   rP   r   rangeappendr   r   r   r   get_cliprg   )rm   r{   cliprv   
label_dataoverlap_periodrW   rX   overlap_start_frameoverlap_stop_frame_	label_idslabel_ids_tensor	clip_datar:   r   r;   r     sD   


zDomsevVideoDataset.__getitem__c                 C   r   )zP
        Returns:
            The number of video clips in the dataset.
        )r   r   r   r:   r:   r;   r     r   zDomsevVideoDataset.__len__r   c                 C   r   )a;  
        Transforms a given video clip, according to some pre-defined transforms
        and an optional user transform function (self._user_transform).

        Args:
            clip (Dict[str, Any]): The clip that will be transformed.

        Returns:
            (Dict[str, Any]) The transformed clip.
        Nr   )rm   r   r   r:   r:   r;   r     r   z"DomsevVideoDataset._transform_clip)r4   r5   r6   rD   r   FramerE   r   r   r   r   r<   r   rH   r   r   r   rp   r   r   r   r:   r:   r:   r;   r   ,  sB    "	

YD&r   r.   
image_pathnum_retriesc           	   	   C   s   t stdd}t|D ]I}t| d }t| tj}t	j
|t	jd}t	|t	j}W d   n1 s7w   Y  |durD|} ntd| d| d td q|du ratd	| t|}|S )
a  
    Loads the given image path using PathManager and decodes it as an RGB image.

    Args:
        image_path (str): the path to the image.
        num_retries (int): number of times to retry image reading to handle transient error.

    Returns:
        A PIL Image of the image RGB data with shape:
        (channel, height, width). The frames are of type np.uint8 and
        in the range [0 - 255]. Raises an exception if unable to load images.
    zVopencv2 is required to use FrameVideo. Please install with 'pip install opencv-python'Nrb)flagszReading attempt /z failed.gư>zFailed to load image from {})_HAS_CV2ImportErrorr   r
   opennp
frombufferreaduint8cv2imdecodeIMREAD_COLORcvtColorCOLOR_BGR2RGBloggingwarningtimesleep	Exceptionformatr   	fromarray)	r   r   img_arrr   fimg_strimg_bgrimg_rgb	pil_imager:   r:   r;   ry     s(   
ry   )T)r.   )9r   rM   rk   r   dataclassesr   enumr   typingr   r   r   r   r   r	   numpyr   r   iopath.common.file_ior
   PILr   (pytorchvideo.data.dataset_manifest_utilsr   r   r   r   r   pytorchvideo.data.utilsr   r   pytorchvideo.data.videor   r   r   r   USER_ENVIRONMENT_MAPUSER_ACTIVITY_MAPUSER_ATTENTION_MAPr3   r7   r8   r9   LABEL_TYPE_2_MAPr<   rG   rH   r   rP   rY   utilsdataDatasetrZ   r   rE   ry   r:   r:   r:   r;   <module>   s    	

 % D