o
    ϯiY                     @   s2  d dl Z d dlZd dlmZ d dlZd dlZzd dlZW n ey'   dZ	Y nw dZ		dJdej
deded	ej
fd
dZejjdej
deeef ded	ej
fddZ		dKdej
dededed	ej
f
ddZ	dJdej
dee ded	eej
 fddZ	dLdej
deded	ej
fddZ		dKdej
d ej
dededed	eej
ejf fd!d"Z		dKdej
d ej
d#ed$ededed	eej
ej
f fd%d&Zdej
ded ej
d	eej
ej
f fd'd(Zdej
ded)efd*d+Zdej
ded)ed	ej
fd,d-Zdej
ded)ed ej
d	eej
ejf f
d.d/Zd0edej
d ej
d	eej
ej
f fd1d2Zd ej
d3ed4ed	ej
fd5d6Zd ej
d7ed8ed	ej
fd9d:Z		;dMd<eeef d=eeef d3ed4ed>ed?ed	eeeeef fd@dAZ 				;dNdej
dBedCed<eeef dDeeef dEed>eded?ed	ej
fdFdGZ!dej
d	ej
fdHdIZ"dS )O    N)TupleFTxnum_samplestemporal_dimreturnc                 C   sT   | j | }|dkr|dksJ td|d |}t|d|d  }t| ||S )av  
    Uniformly subsamples num_samples indices from the temporal dimension of the video.
    When num_samples is larger than the size of temporal dimension of the video, it
    will sample frames based on nearest neighbor interpolation.

    Args:
        x (torch.Tensor): A video tensor with dimension larger than one with torch
            tensor type includes int, long, float, complex, etc.
        num_samples (int): The number of equispaced samples to be selected
        temporal_dim (int): dimension of temporal to perform temporal subsample.

    Returns:
        An x-like Tensor with subsampled temporal dimension.
    r      )shapetorchlinspaceclamplongindex_select)r   r   r   tindices r   V/home/ubuntu/.local/lib/python3.10/site-packages/pytorchvideo/transforms/functional.pyuniform_temporal_subsample   s
   
r   sizeinterpolationc                    s   t stdtjtjtjtjd  v sJ |\dd | ddddjddd	D } fd
d|D }t	j
dd |D dd}tt	|}|dddd}|S )an  
    Down/up samples the input torch tensor x to the given size with given interpolation
    mode.
    Args:
        input (Tensor): the input tensor to be down/up sampled.
        size (Tuple[int, int]): expected output spatial size.
        interpolation: model to perform interpolation, options include `nearest`,
            `linear`, `bilinear`, `bicubic`.
    z]opencv is required to use opencv transforms. Please install with 'pip install opencv-python'.)nearestlinearbilinearbicubicc                 S   s   g | ]	}| d  qS )r   )squeezenumpy).0
img_tensorr   r   r   
<listcomp>G   s    z'_interpolate_opencv.<locals>.<listcomp>r         r   )dimc                    s$   g | ]}t j|f  d qS ))r   )cv2resizer   	img_array!_opencv_pytorch_interpolation_mapr   new_hnew_wr   r   r   K   s    c                 S   s   g | ]	}t j|d dqS )r   axis)npexpand_dimsr$   r   r   r   r   T   s    r*   )_HAS_CV2ImportErrorr"   INTER_NEARESTINTER_LINEAR
INTER_AREAINTER_CUBICpermutesplitr,   concatenater
   
from_numpyascontiguousarray)r   r   r   img_array_listresized_img_array_listr%   r   r   r&   r   _interpolate_opencv,   s0   r;   r   pytorchbackendc           
      C   s   t | jdks	J | jtjksJ |dv sJ | j\}}}}||k r2ttt|| | }|}	n|}ttt|| | }	|dkrRtj	j
j| ||	f|ddS |dkr_t| ||	f|dS t| d)	a  
    Determines the shorter spatial dim of the video (i.e. width or height) and scales
    it to the given size. To maintain aspect ratio, the longer side is then scaled
    accordingly.
    Args:
        x (torch.Tensor): A video tensor of shape (C, T, H, W) and type torch.float32.
        size (int): The size the shorter side is scaled to.
        interpolation (str): Algorithm used for upsampling,
            options: nearest' | 'linear' | 'bilinear' | 'bicubic' | 'trilinear' | 'area'
        backend (str): backend used to perform interpolation. Options includes
            `pytorch` as default, and `opencv`. Note that opencv and pytorch behave
            differently on linear interpolation on some versions.
            https://discuss.pytorch.org/t/pytorch-linear-interpolation-is-different-from-pil-opencv/71181
    Returns:
        An x-like Tensor with scaled spatial dims.
       )r<   opencvr<   F)r   modealign_cornersr?   )r   r   z backend not supported.)lenr	   dtyper
   float32intmathfloorfloatnn
functionalinterpolater;   NotImplementedError)
r   r   r   r=   cr   hwr(   r)   r   r   r   short_side_scale\   s    rP   framesframe_ratiosc                 C   s6   | j | }g }|D ]}t| || |}|| q	|S )a^  
    Prepare output as a list of tensors subsampled from the input frames. Each tensor
        maintain a unique copy of subsampled frames, which corresponds to a unique
        pathway.

    Args:
        frames (tensor): frames of images sampled from the video. Expected to have
            torch tensor (including int, long, float, complex, etc) with dimension
            larger than one.
        frame_ratios (tuple): ratio to perform temporal down-sampling for each pathways.
        temporal_dim (int): dimension of temporal.

    Returns:
        frame_list (tuple): list of tensors as output.
    )r	   r   append)rQ   rR   r   temporal_length
frame_listratiopathwayr   r   r   #uniform_temporal_subsample_repeated   s   

rX           targets	num_classlabel_smoothc                 C   s   t |  |k sJ dd|  krdk sJ d J d|| }d| | }t j| jd |f||dkr9t jnd| jd}|d|  d	d| |S )
ac  
    This function converts target class indices to one-hot vectors,
    given the number of classes.

    Args:
        targets (torch.Tensor): Index labels to be converted.
        num_class (int): Total number of classes.
        label_smooth (float): Label smooth value for non-target classes. Label smooth
            is disabled by default (0).
    z/Class Index must be less than number of classesr         ?z/Label smooth value needs to be between 0 and 1.rY   N)rC   devicer   )	r
   maxitemfullr	   r   r^   scatter_view)rZ   r[   r\   non_target_valuetarget_valueone_hot_targetsr   r   r   convert_to_one_hot   s   $rh   imagesboxesc                 C   sb   | j \}}}}t| |||} | j \}	}	}
}||k r%|t|
| 9 }| |fS |t|| 9 }| |fS )a>  
    Perform a spatial short scale jittering on the given images and
    corresponding boxes.
    Args:
        images (tensor): images to perform scale jitter. Dimension is
            `channel` x `num frames` x `height` x `width`.
        boxes (tensor): Corresponding boxes to images.
            Dimension is `num boxes` x 4.
        size (int): The size the shorter side is scaled to.
        interpolation (str): Algorithm used for upsampling,
            options: nearest' | 'linear' | 'bilinear' | 'bicubic' | 'trilinear' | 'area'
        backend (str): backend used to perform interpolation. Options includes
            `pytorch` as default, and `opencv`. Note that opencv and pytorch behave
            differently on linear interpolation on some versions.
            https://discuss.pytorch.org/t/pytorch-linear-interpolation-is-different-from-pil-opencv/71181
    Returns:
        (tensor): the scaled images with dimension of
            `channel` x `num frames` x `height` x `width`.
        (tensor): the scaled boxes with dimension of
            `num boxes` x 4.
    )r	   rP   rH   )ri   rj   r   r   r=   rM   r   rN   rO   _r(   r)   r   r   r   short_side_scale_with_boxes   s   rl   min_sizemax_sizec                 C   s&   t ||d d }t| ||||S )a~  
    Perform a spatial short scale jittering on the given images and
    corresponding boxes.
    Args:
        images (tensor): images to perform scale jitter. Dimension is
            `channel` x `num frames` x `height` x `width`.
        boxes (tensor): Corresponding boxes to images.
            Dimension is `num boxes` x 4.
        min_size (int): the minimal size to scale the frames.
        max_size (int): the maximal size to scale the frames.
        interpolation (str): Algorithm used for upsampling,
            options: nearest' | 'linear' | 'bilinear' | 'bicubic' | 'trilinear' | 'area'
        backend (str): backend used to perform interpolation. Options includes
            `pytorch` as default, and `opencv`. Note that opencv and pytorch behave
            differently on linear interpolation on some versions.
            https://discuss.pytorch.org/t/pytorch-linear-interpolation-is-different-from-pil-opencv/71181
    Returns:
        (tensor): the scaled images with dimension of
            `channel` x `num frames` x `height` x `width`.
        (tensor): the scaled boxes with dimension of
            `num boxes` x 4.
    r   r   )r
   randintra   rl   )ri   rj   rm   rn   r   r=   r   r   r   r   "random_short_side_scale_with_boxes   s   rq   c           	      C   s   | j d |kr| j d |kr| S | j d }| j d }d}||kr+ttjd|| }d}||kr<ttjd|| }| dddd||| ||| f }t|||}|t||j d |j d fS )al  
    Perform random spatial crop on the given images and corresponding boxes.
    Args:
        images (tensor): images to perform random crop. The dimension is
            `channel` x `num frames` x `height` x `width`.
        size (int): the size of height and width to crop on the image.
        boxes (tensor): Corresponding boxes to images.
            Dimension is `num boxes` x 4.
    Returns:
        cropped (tensor): cropped images with dimension of
            `channel` x `num frames` x `height` x `width`.
        cropped_boxes (tensor): the cropped boxes with dimension of
            `num boxes` x 4.
    r   r    r   Nr_   )r	   rE   r,   randomrp   
crop_boxesclip_boxes_to_image)	ri   r   rj   heightwidthy_offsetx_offsetcroppedcropped_boxesr   r   r   random_crop_with_boxes  s   

(r|   spatial_idxc                 C   s   |dv sJ | j d }| j d }tt|| d }tt|| d }||kr:|dkr1d}n|dkr9|| }n|dkrAd}n|dkrI|| }| dddd||| ||| f }|||fS )zJ
    A helper function grouping the common components in uniform crop
    )r   r   r   r   r    r   N)r	   rE   rF   ceil)ri   r   r}   rv   rw   rx   ry   rz   r   r   r   _uniform_crop_helper.  s"   

(
r   c                 C   s   t | ||\}}}|S )ak  
    Perform uniform spatial sampling on the images and corresponding boxes.
    Args:
        images (tensor): images to perform uniform crop. The dimension is
            `channel` x `num frames` x `height` x `width`.
        size (int): size of height and weight to crop the images.
        spatial_idx (int): 0, 1, or 2 for left, center, and right crop if width
            is larger than height. Or 0, 1, or 2 for top, center, and bottom
            crop if height is larger than width.
    Returns:
        cropped (tensor): images with dimension of
            `channel` x `num frames` x `height` x `width`.
    )r   )ri   r   r}   rz   rk   r   r   r   uniform_cropH  s   r   c                 C   s:   t | ||\}}}t|||}|t||jd |jd fS )a-  
    Perform uniform spatial sampling on the images and corresponding boxes.
    Args:
        images (tensor): images to perform uniform crop. The dimension is
            `channel` x `num frames` x `height` x `width`.
        size (int): size of height and weight to crop the images.
        spatial_idx (int): 0, 1, or 2 for left, center, and right crop if width
            is larger than height. Or 0, 1, or 2 for top, center, and bottom
            crop if height is larger than width.
        boxes (tensor): Corresponding boxes to images.
            Dimension is `num boxes` x 4.
    Returns:
        cropped (tensor): images with dimension of
            `channel` x `num frames` x `height` x `width`.
        cropped_boxes (tensor): the cropped boxes with dimension of
            `num boxes` x 4.
    rr   r_   )r   rt   ru   r	   )ri   r   r}   rj   rz   ry   rx   r{   r   r   r   uniform_crop_with_boxes^  s
   r   probc                 C   s`   t |}tj | k r,|d}|jd }||ddddgf  d |ddddgf< ||fS )aP  
    Perform horizontal flip on the given images and corresponding boxes.
    Args:
        prob (float): probility to flip the images.
        images (tensor): images to perform horizontal flip, the dimension is
            `channel` x `num frames` x `height` x `width`.
        boxes (tensor): Corresponding boxes to images.
            Dimension is `num boxes` x 4.
    Returns:
        images (tensor): images with dimension of
            `channel` x `num frames` x `height` x `width`.
        flipped_boxes (tensor): the flipped boxes with dimension of
            `num boxes` x 4.
    r_   r    Nr   r   r   )copydeepcopyr,   rs   uniformflipr	   )r   ri   rj   flipped_boxesrw   r   r   r   horizontal_flip_with_boxes|  s   


,r   rv   rw   c              
   C   s~   t | }t|d td| ddddgf |ddddgf< t|d td| ddddgf |ddddgf< |S )a}  
    Clip an array of boxes to an image with the given height and width.
    Args:
        boxes (tensor): bounding boxes to perform clipping.
            Dimension is `num boxes` x 4.
        height (int): given image height.
        width (int): given image width.
    Returns:
        clipped_boxes (tensor): the clipped boxes with dimension of
            `num boxes` x 4.
    r]   rY   Nr   r   r   r    )r   r   r,   minimummaximum)rj   rv   rw   clipped_boxesr   r   r   ru     s   
  ru   ry   rx   c                 C   s^   t | }| ddddgf | |ddddgf< | ddddgf | |ddddgf< |S )a  
    Peform crop on the bounding boxes given the offsets.
    Args:
        boxes (torch.Tensor): bounding boxes to peform crop. The dimension
            is `num boxes` x 4.
        x_offset (int): cropping offset in the x axis.
        y_offset (int): cropping offset in the y axis.
    Returns:
        cropped_boxes (torch.Tensor): the cropped boxes with dimension of
            `num boxes` x 4.
    Nr   r   r   r    )r   r   )rj   ry   rx   r{   r   r   r   rt     s   
((rt   
   scalerV   log_uniform_ratio	num_triesc                 C   s.  |dksJ d| d | d kr| d | d f} |d |d kr(|d |d f}t |D ]}|| }|| d td | d | d     }|rnt|d t|d f}	t|	d td |	d |	d    }
n|d td |d |d    }
ttt	||
 }ttt	||
 }d|  k r|krn q,d|  k r|krn q,t
d|| d d }t
d|| d d }||||f  S q,t|t| }|t|k r|}tt|t| }n|t|kr|}tt|t| }n|}|}|| d }|| d }||||fS )a  
    Given scale, ratio, height and width, return sampled coordinates of the videos.

    Args:
        scale (Tuple[float, float]): Scale range of Inception-style area based
            random resizing.
        ratio (Tuple[float, float]): Aspect ratio range of Inception-style
            area based random resizing.
        height (int): Height of the original image.
        width (int): Width of the original image.
        log_uniform_ratio (bool): Whether to use a log-uniform distribution to
            sample the aspect ratio. Default is True.
        num_tries (int): The number of times to attempt a randomly resized crop.
            Falls back to a central crop after all attempts are exhausted.
            Default is 10.

    Returns:
        Tuple containing i, j, h, w. (i, j) are the coordinates of the top left
        corner of the crop. (h, w) are the height and width of the crop.
    r   znum_tries must be at least 1r   ro   r   )ranger
   randra   rF   logexprE   roundsqrtrp   rH   minr`   )r   rV   rv   rw   r   r   rk   areatarget_area	log_ratioaspect_ratiorO   rN   ijin_ratior   r   r   _get_param_spatial_crop  sB   *$&0r   target_heighttarget_widthr   shiftc	              
   C   s  |d dkr|d dksJ d|d dkr|d dks J d| j d }	| j d }
| j d }| j d }t||||||\}}}}|sc| dddd||| ||| f }tjjj|||f|dS t||||||\}}}}d	d
 tj|||
d D }dd
 tj|||
d D }dd
 tj|||
d D }dd
 tj|||
d D }t|	|
||f}t	|
D ]>}tjjj| dd||d || || ||  || || ||  f ||f|d|dd||d ddddf< q|S )a  
    Crop the given images to random size and aspect ratio. A crop of random
    size relative to the original size and a random aspect ratio is made. This
    crop is finally resized to given size. This is popularly used to train the
    Inception networks.

    Args:
        frames (torch.Tensor): Video tensor to be resized with shape (C, T, H, W).
        target_height (int): Desired height after cropping.
        target_width (int): Desired width after cropping.
        scale (Tuple[float, float]): Scale range of Inception-style area based
            random resizing. Should be between 0.0 and 1.0.
        aspect_ratio (Tuple[float, float]): Aspect ratio range of Inception-style
            area based random resizing. Should be between 0.0 and +infinity.
        shift (bool): Bool that determines whether or not to sample two different
            boxes (for cropping) for the first and last frame. If True, it then
            linearly interpolates the two boxes for other frames. If False, the
            same box is cropped for every frame. Default is False.
        log_uniform_ratio (bool): Whether to use a log-uniform distribution to
            sample the aspect ratio. Default is True.
        interpolation (str): Algorithm used for upsampling. Currently supports
            'nearest', 'bilinear', 'bicubic', 'area'. Default is 'bilinear'.
        num_tries (int): The number of times to attempt a randomly resized crop.
            Falls back to a central crop after all attempts are exhausted.
            Default is 10.

    Returns:
        cropped (tensor): A cropped video tensor of shape (C, T, target_height, target_width).
    r   r   z1min and max of scale range must be greater than 0z8min and max of aspect_ratio range must be greater than 0r   r    N)r   r@   c                 S      g | ]}t |qS r   rE   r   r   r   r   r   r   I      z'random_resized_crop.<locals>.<listcomp>)stepsc                 S   r   r   r   r   r   r   r   r   J  r   c                 S   r   r   r   r   r   r   r   r   K  r   c                 S   r   r   r   r   r   r   r   r   L  r   )
r	   r   r
   rI   rJ   rK   r   tolistzerosr   )rQ   r   r   r   r   r   r   r   r   channelsr   rv   rw   r   r   rN   rO   rz   i_j_h_w_i_sj_sh_sw_sindr   r   r   random_resized_crop  sR   )



(
(
r   c                 C   s   | d }|S )z
    Divide the given tensor x by 255.

    Args:
        x (torch.Tensor): The input tensor.

    Returns:
        y (torch.Tensor): Scaled tensor by dividing 255.
    g     o@r   )r   yr   r   r   div_255\  s   
r   )r   )r   r<   )rY   )Tr   )FTr   r   )#r   rF   typingr   r   r,   r
   r"   r/   r.   TensorrE   r   jitignorestrr;   rP   rX   rH   rh   ndarrayrl   rq   r|   r   r   r   r   ru   rt   boolr   r   r   r   r   r   r   <module>   sh  

2
+
 
$
+
"
#





L

	

V