o
    i                     @   s   d Z ddlmZmZ ddlZddlmZ ddlm	Z	 ddl
mZmZmZmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZmZ G dd deZG dd deZdgZdS )z(Fast Video processor class for InternVL.    )OptionalUnionN)
functional   )BatchFeature)OPENAI_CLIP_MEANOPENAI_CLIP_STDPILImageResamplingSizeDict)UnpackVideosKwargs)
TensorType)BaseVideoProcessor)VideoMetadatagroup_videos_by_shapereorder_videosc                   @   s    e Zd ZU eeeef ed< dS ) InternVLVideoProcessorInitKwargsinitial_shiftN)__name__
__module____qualname__r   boolfloatint__annotations__ r   r   j/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/internvl/video_processing_internvl.pyr      s   
 r   c                       s  e Zd ZejZeZeZ	dddZ
dZdZdZdZdZdZeZdgZdee f fddZ						d"d
edee deeeef  deeeeef  fddZ		d#ded dedededed dedededededeeeee f  deeeee f  deeee f  de!fd d!Z"  Z#S )$InternVLVideoProcessori  )heightwidthTFpixel_values_videoskwargsc                    s   t  jdi | d S )Nr   )super__init__)selfr!   	__class__r   r   r#   0   s   zInternVLVideoProcessor.__init__Nmetadata
num_framesfpsr   c                 K   s   |dur|n| j }|dur|n| j}|j}|du r3|dur3|du s&|jdu r*tdt||j | }|du r=|| d }||krLtd| d| dt||||  }|S )a  
        Default sampling function which uniformly samples the desired number of frames between 0 and total number of frames.
        If `fps` is passed along with metadata, `fps` frames per second are sampled uniformty. Arguments `num_frames`
        and `fps` are mutually exclusive.

        Args:
            metadata (`VideoMetadata`):
                Metadata of the video containing information about total duration, fps and total number of frames.
            num_frames (`int`, *optional*):
                Maximum number of frames to sample. Defaults to `self.num_frames`.
            fps (`int` or `float`, *optional*):
                Target frames to sample per second. Defaults to `self.fps`.
            initial_shift (`bool`, `float` or `int`, defaults to `self.initial_shift`):
                The initial shift to apply when sampling frames. If `True`, the shift is set so that frames are sampled from the middle of the video.

        Returns:
            np.ndarray:
                Indices to sample video frames.
        NzAsked to sample `fps` frames per second but no video metadata was provided which is required when sampling with `fps`. Please pass in `VideoMetadata` object or use a fixed `num_frames` per input videoT   z(Video can't be sampled. The `num_frames=z` exceeds `total_num_frames=z`. )r(   r   total_num_framesr)   
ValueErrorr   torcharange)r$   r'   r(   r)   r   r!   r+   indicesr   r   r   sample_frames3   s"   z$InternVLVideoProcessor.sample_framesvideosztorch.Tensordo_convert_rgb	do_resizesizeinterpolationzF.InterpolationModedo_center_crop	crop_size
do_rescalerescale_factordo_normalize
image_mean	image_stdreturn_tensorsreturnc              	   K   s   t |\}}i }| D ]\}}|r| |}|r!| j|||d}|||< qt||}t |\}}i }| D ]\}}|rC| ||}| |||	|
||}|||< q7t||}|r`tj|ddn|}t	d|i|dS )N)r4   r5   r   )dimr    )datatensor_type)
r   itemsconvert_to_rgbresizer   center_croprescale_and_normalizer-   stackr   )r$   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r!   grouped_videosgrouped_videos_indexresized_videos_groupedshapestacked_videosresized_videosprocessed_videos_groupedprocessed_videosr   r   r   _preprocessf   s*   




z"InternVLVideoProcessor._preprocess)NNN)N)$r   r   r   r	   BICUBICresampler   r;   r   r<   r4   r3   r8   r:   r2   r   do_sample_framesr   valid_kwargsmodel_input_namesr   r#   r   r   r   r   r   r   r0   listr
   strr   r   rP   __classcell__r   r   r%   r   r   "   sp    

A	
r   )__doc__typingr   r   r-   torchvision.transforms.v2r   Fimage_processing_utilsr   image_utilsr   r   r	   r
   processing_utilsr   r   utilsr   video_processing_utilsr   video_utilsr   r   r   r   r   __all__r   r   r   r   <module>   s   
s