o
    ϯi~%                     @   s@  d dl Z d dlZd dlZd dlmZmZmZmZmZ d dl	Z	d dl
mZ d dlmZmZ d dlmZ eeZe	jjjddddfd	ed
edee	jjj deeeeef geeef f  dededefddZe	jjjddddfd	ed
edee	jjj deeeeef geeef f  dededefddZG dd dZdS )    N)AnyCallableDictOptionalType)	g_pathmgr)ClipInfoClipSampler)LabeledVideoDataset Tpyav	data_pathclip_samplervideo_sampler	transformvideo_path_prefixdecode_audiodecoderc                    s   t jd t| rCzt| d}t|}W d   n1 s"w   Y  W n ty6   t	|  dw  fdd|
 D }	nt	|  dt|	|||||d}
|
S )	ae  
    Builds a LabeledVideoDataset with no annotations from a json file with the following
    format:

        .. code-block:: text

            {
              "video_name1": {...}
              "video_name2": {...}
              ....
              "video_nameN": {...}
            }

    Args:
        labeled_video_paths (List[Tuple[str, Optional[dict]]]): List containing
                video file paths and associated labels. If video paths are a folder
                it's interpreted as a frame video, otherwise it must be an encoded
                video.

        clip_sampler (ClipSampler): Defines how clips should be sampled from each
            video. See the clip sampling documentation for more information.

        video_sampler (Type[torch.utils.data.Sampler]): Sampler for the internal
            video container. This defines the order videos are decoded and,
            if necessary, the distributed split.

        transform (Callable): This callable is evaluated on the clip output before
            the clip is returned. It can be used for user defined preprocessing and
            augmentations on the clips. The clip output format is described in __next__().

        decode_audio (bool): If True, also decode audio from video.

        decoder (str): Defines what type of decoder used to decode a video. Not used for
            frame videos.
    z4PYTORCHVIDEO.dataset.json_dataset.video_only_datasetrN must be json for Ego4D datasetc                    s   g | ]}t j |i fqS  )ospathjoin).0xr   r   R/home/ubuntu/.local/lib/python3.10/site-packages/pytorchvideo/data/json_dataset.py
<listcomp>I   s    z&video_only_dataset.<locals>.<listcomp> not found.r   r   )torch_C_log_api_usage_oncer   isfileopenjsonload	ExceptionFileNotFoundErrorkeysr
   )r   r   r   r   r   r   r   fannotationsvideo_pathsdatasetr   r   r   video_only_dataset   s.   -

r/   c              
      sJ  t | rzt | d}t|}W d   n1 sw   Y  W n ty0   t|  dw g  | D ]I\}	}
tj	
||	}|
d d D ]7}|d }|d }|d }|d	 }|d
 }|d }|d }|du sr|du sr|sr|rsqH |||||df qHq7nt|  d fdd}|d |d t t|||||d}|S )a  
    Builds a LabeledVideoDataset with noun, verb annotations from a json file with the following
    format:

        .. code-block:: text

            {
              "video_name1": {
                  {
                    "benchmarks": {
                        "forecasting_hands_objects": [
                            {
                                "critical_frame_selection_parent_start_sec": <start_sec>
                                "critical_frame_selection_parent_end_sec": <end_sec>
                                {
                                    "taxonomy: {
                                        "noun": <label>,
                                        "verb": <label>,
                                    }
                                }
                            },
                            {
                                ...
                            }
                        ]
                    }
                  }
              }
              "video_name2": {...}
              ....
              "video_nameN": {...}
            }

    Args:
        labeled_video_paths (List[Tuple[str, Optional[dict]]]): List containing
                video file paths and associated labels. If video paths are a folder
                it's interpreted as a frame video, otherwise it must be an encoded
                video.

        clip_sampler (ClipSampler): Defines how clips should be sampled from each
            video. See the clip sampling documentation for more information.

        video_sampler (Type[torch.utils.data.Sampler]): Sampler for the internal
            video container. This defines the order videos are decoded and,
            if necessary, the distributed split.

        transform (Callable): This callable is evaluated on the clip output before
            the clip is returned. It can be used for user defined preprocessing and
            augmentations on the clips. The clip output format is described in __next__().

        decode_audio (bool): If True, also decode audio from video.

        decoder (str): Defines what type of decoder used to decode a video. Not used for
            frame videos.
    r   Nr   
benchmarksforecasting_hands_objects)critical_frame_selection_parent_start_sec'critical_frame_selection_parent_end_sectaxonomynounverbverb_unsurenoun_unsure)clip_start_secclip_end_sec
noun_label
verb_labelr   c                    sb   t  fddD }dd t|D }ttD ]}| d   }|| | d  < qd S )Nc                    s   h | ]\}}|  qS r   r   )r   _info
label_namer   r   	<setcomp>   s    zHclip_recognition_dataset.<locals>.map_labels_to_index.<locals>.<setcomp>c                 S   s   i | ]\}}||qS r   r   )r   ilabelr   r   r   
<dictcomp>   s    zIclip_recognition_dataset.<locals>.map_labels_to_index.<locals>.<dictcomp>   )list	enumeraterangelen)r@   labelslabel_to_idxrB   rC   untrimmed_clip_annotationsr?   r   map_labels_to_index   s   z5clip_recognition_dataset.<locals>.map_labels_to_indexr;   r<   r    )r   r$   r%   r&   r'   r(   r)   itemsr   r   r   appendr
   UntrimmedClipSampler)r   r   r   r   r   r   r   r+   r,   
video_namechild
video_pathclip_annotation
clip_startclip_endr4   r;   r<   r7   r8   rN   r.   r   rL   r   clip_recognition_datasetZ   sj   
@ rX   c                   @   sN   e Zd ZdZdeddfddZdeded	eee	f de
fd
dZdddZdS )rQ   a:  
    A wrapper for adapting untrimmed annotated clips from the json_dataset to the
    standard `pytorchvideo.data.ClipSampler` expected format. Specifically, for each
    clip it uses the provided `clip_sampler` to sample between "clip_start_sec" and
    "clip_end_sec" from the json_dataset clip annotation.
    r   returnNc                 C   s
   || _ dS )z
        Args:
            clip_sampler (`pytorchvideo.data.ClipSampler`): Strategy used for sampling
                between the untrimmed clip boundary.
        N)_trimmed_clip_sampler)selfr   r   r   r   __init__   s   
zUntrimmedClipSampler.__init__last_clip_timevideo_duration	clip_infoc                 C   sH   |d }|d }|| }|  |||}t|j| |j| |j|j|jS )Nr9   r:   )rZ   r   r9   r:   
clip_index	aug_indexis_last_clip)r[   r]   r^   r_   clip_start_boundaryclip_end_boundarydurationr   r   r   __call__   s   zUntrimmedClipSampler.__call__c                 C   s   d S )Nr   )r[   r   r   r   reset   s   zUntrimmedClipSampler.reset)rY   N)__name__
__module____qualname____doc__r	   r\   floatr   strr   r   rf   rg   r   r   r   r   rQ      s    

rQ   )r&   loggingr   typingr   r   r   r   r   r!   iopath.common.file_ior   pytorchvideo.data.clip_samplingr   r	   'pytorchvideo.data.labeled_video_datasetr
   	getLoggerrh   loggerutilsdataRandomSamplerrm   Samplerboolr/   rX   rQ   r   r   r   r   <module>   sf   
 
L 
 