o
    0iO                  %   @   s  d dl Z d dlZd dlmZ d dlmZmZmZmZm	Z	 d dl
Z
ddlmZ zed dZW n eefy;   dZY nw ed d	ZG d
d dZG dd dZdeeef ddfddZde
jde
jde
jde
jde
jde
jdefddZde
jde
jdeeef de
jfddZddd d d d d edd d d efd!ed"ed#ed$ed%ed&ed'ed(eeef d)ed*ed+ed,edeeef d-edee
je
jef fd.d/Zd!edeee ee ef fd0d1Zd!edefd2d3Z				 	 	 	 	 	 					 	 	 	 		dGd4e
jd"ed#ed$ed%ed&ed'ed(eeef d5ed6ed*ed+ed,edeeef d7ed8edee
je
jf f"d9d:Zd4e
jdeee ee ef fd;d<Zd4e
jdefd=d>Z 	 		?dHd!ed@e	eef dAee	eef  dBedee
je
jeeef f f
dCdDZ!	?dId!edBedee	ee ee f ee f fdEdFZ"dS )J    N)Fraction)DictListOptionalTupleUnion   )_load_libraryvideo_readerTF   c                   @   s4   e Zd ZeedZddgZdededdfddZdS )Timebase	numeratordenominatorr   r   returnNc                 C   s   || _ || _d S )Nr   )selfr   r    r   W/home/ubuntu/SoloSpeech/.venv/lib/python3.10/site-packages/torchvision/io/_video_opt.py__init__   s   
zTimebase.__init__)__name__
__module____qualname__int__annotations__	__slots__r   r   r   r   r   r      s    
r   c                	   @   s4   e Zd ZeeeeeeeedZg dZdddZdS )VideoMetaData)	has_videovideo_timebasevideo_duration	video_fps	has_audioaudio_timebaseaudio_durationaudio_sample_rater   Nc                 C   s@   d| _ tdd| _d| _d| _d| _tdd| _d| _d| _d S )NFr   r   g        )	r   r   r   r   r   r    r!   r"   r#   )r   r   r   r   r   9   s   
zVideoMetaData.__init__)r   N)	r   r   r   boolr   floatr   r   r   r   r   r   r   r   #   s    
r   	pts_ranger   c                 C   sB   | d | d   krdkrn d S t d| d  d| d  d S )Nr   r   z=Start pts should not be smaller than end pts, got start pts: z and end pts: )
ValueError)r&   r   r   r   _validate_ptsD   s
   "r(   	vtimebasevfps	vduration	atimebaseasample_rate	adurationc                 C   s  t  }|  dkr;tt| d  t| d  |_| d  t| d   }| dkr;d|_t| | |_| dkrHt| |_	| dkrtt|d  t|d  |_
|d  t|d   }| dkrd|_t| | |_| dkrt| |_|S )zE
    Build update VideoMetaData struct with info about the video
    r   r   T)r   numelr   r   itemr   r%   r   r   r   r!   r    r"   r#   )r)   r*   r+   r,   r-   r.   metatimebaser   r   r   
_fill_infoL   s$   $$r3   aframes
aframe_ptsaudio_pts_rangec           	      C   s   |d |d }}|  d}t|| d t| }d}|}||d k r.t|d | | }|d dkrD||d krDt|d | | }| ||d d f S )Nr   r   )sizer%   r   )	r4   r5   r6   startendnum_samplesstep_per_aframes_idxe_idxr   r   r   _align_audio_framesl   s   
r?         ?r   r7   filenameseek_frame_marginread_video_streamvideo_widthvideo_heightvideo_min_dimensionvideo_max_dimensionvideo_pts_ranger   read_audio_streamaudio_samplesaudio_channelsr!   c                 C   s   t | t | tjj| |d||||||d |d |j|j|	|
||d |d |j|j}|\
}}}}}}}}}}t||||||}| dkrNt	|||}|||fS )ab  
    Reads a video from a file, returning both the video frames and the audio frames

    Args:
    filename (str): path to the video file
    seek_frame_margin (double, optional): seeking frame in the stream is imprecise. Thus,
        when video_start_pts is specified, we seek the pts earlier by seek_frame_margin seconds
    read_video_stream (int, optional): whether read video stream. If yes, set to 1. Otherwise, 0
    video_width/video_height/video_min_dimension/video_max_dimension (int): together decide
        the size of decoded frames:

            - When video_width = 0, video_height = 0, video_min_dimension = 0,
                and video_max_dimension = 0, keep the original frame resolution
            - When video_width = 0, video_height = 0, video_min_dimension != 0,
                and video_max_dimension = 0, keep the aspect ratio and resize the
                frame so that shorter edge size is video_min_dimension
            - When video_width = 0, video_height = 0, video_min_dimension = 0,
                and video_max_dimension != 0, keep the aspect ratio and resize
                the frame so that longer edge size is video_max_dimension
            - When video_width = 0, video_height = 0, video_min_dimension != 0,
                and video_max_dimension != 0, resize the frame so that shorter
                edge size is video_min_dimension, and longer edge size is
                video_max_dimension. The aspect ratio may not be preserved
            - When video_width = 0, video_height != 0, video_min_dimension = 0,
                and video_max_dimension = 0, keep the aspect ratio and resize
                the frame so that frame video_height is $video_height
            - When video_width != 0, video_height == 0, video_min_dimension = 0,
                and video_max_dimension = 0, keep the aspect ratio and resize
                the frame so that frame video_width is $video_width
            - When video_width != 0, video_height != 0, video_min_dimension = 0,
                and video_max_dimension = 0, resize the frame so that frame
                video_width and  video_height are set to $video_width and
                $video_height, respectively
    video_pts_range (list(int), optional): the start and end presentation timestamp of video stream
    video_timebase (Fraction, optional): a Fraction rational number which denotes timebase in video stream
    read_audio_stream (int, optional): whether read audio stream. If yes, set to 1. Otherwise, 0
    audio_samples (int, optional): audio sampling rate
    audio_channels (int optional): audio channels
    audio_pts_range (list(int), optional): the start and end presentation timestamp of audio stream
    audio_timebase (Fraction, optional): a Fraction rational number which denotes time base in audio stream

    Returns
        vframes (Tensor[T, H, W, C]): the `T` video frames
        aframes (Tensor[L, K]): the audio frames, where `L` is the number of points and
            `K` is the number of audio_channels
        info (Dict): metadata for the video and audio. Can contain the fields video_fps (float)
            and audio_fps (int)
    r   r   )
r(   torchopsr
   read_video_from_filer   r   r3   r/   r?   )rB   rC   rD   rE   rF   rG   rH   rI   r   rJ   rK   rL   r6   r!   resultvframes_vframe_ptsr)   r*   r+   r4   r5   r,   r-   r.   infor   r   r   _read_video_from_file{   s8   @
rT   c                 C   s~   t jj| dddddddddddddddddd}|\
}}}}}}}}	}
}t||||	|
|}|  }|  }|||fS )z
    Decode all video- and audio frames in the video. Only pts
    (presentation timestamp) is returned. The actual frame pixel data is not
    copied. Thus, it is much faster than read_video(...)
    r   r   r7   )rM   rN   r
   rO   r3   numpytolist)rB   rP   _vframes
vframe_ptsr)   r*   r+   _aframesr5   r,   r-   r.   rS   r   r   r    _read_video_timestamps_from_file   s4   
rZ   c           	      C   s4   t jj| }|\}}}}}}t||||||}|S )zO
    Probe a video file and return VideoMetaData with info about the video
    )rM   rN   r
   probe_video_from_filer3   )	rB   rP   r)   r*   r+   r,   r-   r.   rS   r   r   r   _probe_video_from_file   s   r\   
video_datavideo_timebase_numeratorvideo_timebase_denominatoraudio_timebase_numeratoraudio_timebase_denominatorc                 C   s   t | t | t| tjs1t  tjddd tj| tjd} W d   n1 s,w   Y  tj	j
| |d||||||d |d ||	|
|||d |d ||}|\
}}}}}}}}}}| dkrjt|||}||fS )a  
    Reads a video from memory, returning both the video frames as the audio frames
    This function is torchscriptable.

    Args:
    video_data (data type could be 1) torch.Tensor, dtype=torch.int8 or 2) python bytes):
        compressed video content stored in either 1) torch.Tensor 2) python bytes
    seek_frame_margin (double, optional): seeking frame in the stream is imprecise.
        Thus, when video_start_pts is specified, we seek the pts earlier by seek_frame_margin seconds
    read_video_stream (int, optional): whether read video stream. If yes, set to 1. Otherwise, 0
    video_width/video_height/video_min_dimension/video_max_dimension (int): together decide
        the size of decoded frames:

            - When video_width = 0, video_height = 0, video_min_dimension = 0,
                and video_max_dimension = 0, keep the original frame resolution
            - When video_width = 0, video_height = 0, video_min_dimension != 0,
                and video_max_dimension = 0, keep the aspect ratio and resize the
                frame so that shorter edge size is video_min_dimension
            - When video_width = 0, video_height = 0, video_min_dimension = 0,
                and video_max_dimension != 0, keep the aspect ratio and resize
                the frame so that longer edge size is video_max_dimension
            - When video_width = 0, video_height = 0, video_min_dimension != 0,
                and video_max_dimension != 0, resize the frame so that shorter
                edge size is video_min_dimension, and longer edge size is
                video_max_dimension. The aspect ratio may not be preserved
            - When video_width = 0, video_height != 0, video_min_dimension = 0,
                and video_max_dimension = 0, keep the aspect ratio and resize
                the frame so that frame video_height is $video_height
            - When video_width != 0, video_height == 0, video_min_dimension = 0,
                and video_max_dimension = 0, keep the aspect ratio and resize
                the frame so that frame video_width is $video_width
            - When video_width != 0, video_height != 0, video_min_dimension = 0,
                and video_max_dimension = 0, resize the frame so that frame
                video_width and  video_height are set to $video_width and
                $video_height, respectively
    video_pts_range (list(int), optional): the start and end presentation timestamp of video stream
    video_timebase_numerator / video_timebase_denominator (float, optional): a rational
        number which denotes timebase in video stream
    read_audio_stream (int, optional): whether read audio stream. If yes, set to 1. Otherwise, 0
    audio_samples (int, optional): audio sampling rate
    audio_channels (int optional): audio audio_channels
    audio_pts_range (list(int), optional): the start and end presentation timestamp of audio stream
    audio_timebase_numerator / audio_timebase_denominator (float, optional):
        a rational number which denotes time base in audio stream

    Returns:
        vframes (Tensor[T, H, W, C]): the `T` video frames
        aframes (Tensor[L, K]): the audio frames, where `L` is the number of points and
            `K` is the number of channels
    ignore The given buffer is not writablemessagedtypeNr   r   )r(   
isinstancerM   Tensorwarningscatch_warningsfilterwarnings
frombufferuint8rN   r
   read_video_from_memoryr/   r?   )r]   rC   rD   rE   rF   rG   rH   rI   r^   r_   rJ   rK   rL   r6   r`   ra   rP   rQ   rR   r)   r*   r+   r4   r5   r,   r-   r.   r   r   r   _read_video_from_memory  s@   E
rp   c                 C   s   t | tjs)t  tjddd tj| tjd} W d   n1 s$w   Y  tjj	
| dddddddddddddddddd}|\
}}}}}}}}	}
}t||||	|
|}|  }|  }|||fS )	z
    Decode all frames in the video. Only pts (presentation timestamp) is returned.
    The actual frame pixel data is not copied. Thus, read_video_timestamps(...)
    is much faster than read_video(...)
    rb   rc   rd   rf   Nr   r   r7   )rh   rM   ri   rj   rk   rl   rm   rn   rN   r
   ro   r3   rU   rV   )r]   rP   rW   rX   r)   r*   r+   rY   r5   r,   r-   r.   rS   r   r   r   "_read_video_timestamps_from_memoryu  s>   

rq   c           	      C   s   t | tjs)t  tjddd tj| tjd} W d   n1 s$w   Y  tjj	
| }|\}}}}}}t||||||}|S )zy
    Probe a video in memory and return VideoMetaData with info about the video
    This function is torchscriptable
    rb   rc   rd   rf   N)rh   rM   ri   rj   rk   rl   rm   rn   rN   r
   probe_video_from_memoryr3   )	r]   rP   r)   r*   r+   r,   r-   r.   rS   r   r   r   _probe_video_from_memory  s   
rs   pts	start_ptsend_ptspts_unitc              	      s    d u rt d dkrtd t| }|j}|j} fdd}d}t}	|r6t|jj	|jj
}	||	}d}
t}|rIt|jj	|jj
}||}
t| d||	d|
|d\}}}i }|r`|j|d	< |rg|j|d
< |||fS )Ninfrt   mThe pts_unit 'pts' gives wrong results and will be removed in a follow-up version. Please use pts_unit 'sec'.c                    s`   } }dkr$t td|   }|tdkr$t t d|   }|tdkr,d}||fS )Nsecr   rx   r7   )r   mathfloorr%   ceil)	time_basestart_offset
end_offsetrv   rw   ru   r   r   get_pts  s   z_read_video.<locals>.get_ptsrA   T)rD   rI   r   rJ   r6   r!   r   	audio_fps)r%   rj   warnr\   r   r    default_timebaser   r   r   r   r!   rT   r   r#   )rB   ru   rv   rw   rS   r   r    r   rI   r   r6   r!   rQ   r4   _infor   r   r   _read_video  sF   	


r   c                    sd   |dkr	t d t| \}}}|dkr&t|jj|jj  fdd|D }|jr,|jnd }||fS )Nrt   ry   rz   c                    s   g | ]}|  qS r   r   ).0xvideo_time_baser   r   
<listcomp>  s    z*_read_video_timestamps.<locals>.<listcomp>)	rj   r   rZ   r   r   r   r   r   r   )rB   rw   rt   _rS   r   r   r   r   _read_video_timestamps  s   r   )r@   r   r   r   r   r   rA   r   r   r   r   r   rA   r   r   )r   Nrt   )rt   )#r{   rj   	fractionsr   typingr   r   r   r   r   rM   	extensionr	   _HAS_VIDEO_OPTImportErrorOSErrorr   r   r   r   r(   ri   r3   r?   strr%   r$   rT   rZ   r\   rp   rq   rs   r   r   r   r   r   r   <module>   sP   
!
 


	


$`#
	


m
*


>