import logging
import math

import pkg_resources
import torch
import torch.nn as nn
import torchaudio
from PIL import Image
from pytorchvideo import transforms as pv_transforms
from pytorchvideo.data.clip_sampling import ConstantClipsPerVideoSampler
from pytorchvideo.data.encoded_video import EncodedVideo
from torchvision import transforms

from imagebind.models.multimodal_preprocessors import SimpleTokenizer

DEFAULT_AUDIO_FRAME_SHIFT_MS = 10  # in milliseconds


def return_bpe_path():
    return pkg_resources.resource_filename(
        "imagebind", "bpe/bpe_simple_vocab_16e6.txt.gz"
    )


def waveform2melspec(waveform, sample_rate, num_mel_bins, target_length):
    # Mean-center, then compute a Kaldi-compatible log-mel filterbank.
    waveform -= waveform.mean()
    fbank = torchaudio.compliance.kaldi.fbank(
        waveform,
        htk_compat=True,
        sample_frequency=sample_rate,
        use_energy=False,
        window_type="hanning",
        num_mel_bins=num_mel_bins,
        dither=0.0,
        frame_length=25,
        frame_shift=DEFAULT_AUDIO_FRAME_SHIFT_MS,
    )
    # Convert to [mel_bins, num_frames] shape.
    fbank = fbank.transpose(0, 1)
    # Pad or trim to target_length frames.
    n_frames = fbank.size(1)
    p = target_length - n_frames
    # If the gap is too large (say >20%), flag a warning.
    if abs(p) / n_frames > 0.2:
        logging.warning(
            "Large gap between audio n_frames(%d) and "
            "target_length (%d). Is the audio_target_length "
            "setting correct?",
            n_frames,
            target_length,
        )
    # Cut or pad.
    if p > 0:
        fbank = torch.nn.functional.pad(fbank, (0, p), mode="constant", value=0)
    elif p < 0:
        fbank = fbank[:, 0:target_length]
    # Convert to [1, mel_bins, num_frames] shape, essentially a 1-channel image.
    fbank = fbank.unsqueeze(0)
    return fbank


def get_clip_timepoints(clip_sampler, duration):
    # Read out all clip timepoints for this media file.
    all_clips_timepoints = []
    is_last_clip = False
    end = 0.0
    while not is_last_clip:
        start, end, _, _, is_last_clip = clip_sampler(end, duration, annotation=None)
        all_clips_timepoints.append((start, end))
    return all_clips_timepoints
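
# Worked example: assuming pytorchvideo's ConstantClipsPerVideoSampler spaces
# clip starts by (duration - clip_duration) / clips_per_video, an 8-second
# file with clip_duration=2 and clips_per_video=3 yields:
#
#   sampler = ConstantClipsPerVideoSampler(clip_duration=2, clips_per_video=3)
#   get_clip_timepoints(sampler, 8.0)
#   # -> [(0.0, 2.0), (2.0, 4.0), (4.0, 6.0)]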


def load_and_transform_vision_data(image_paths, device):
    if image_paths is None:
        return None

    image_outputs = []

    data_transform = transforms.Compose(
        [
            transforms.Resize(
                224, interpolation=transforms.InterpolationMode.BICUBIC
            ),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(
                mean=(0.48145466, 0.4578275, 0.40821073),
                std=(0.26862954, 0.26130258, 0.27577711),
            ),
        ]
    )

    for image_path in image_paths:
        with open(image_path, "rb") as fopen:
            image = Image.open(fopen).convert("RGB")

        image = data_transform(image).to(device)
        image_outputs.append(image)
    return torch.stack(image_outputs, dim=0)


def load_and_transform_text(text, device):
    if text is None:
        return None
    tokenizer = SimpleTokenizer(bpe_path=return_bpe_path())
    tokens = [tokenizer(t).unsqueeze(0).to(device) for t in text]
    tokens = torch.cat(tokens, dim=0)
    return tokens


def load_and_transform_audio_data(
    audio_paths,
    device,
    num_mel_bins=128,
    target_length=204,
    sample_rate=16000,
    clip_duration=2,
    clips_per_video=3,
    mean=-4.268,
    std=9.138,
):
    if audio_paths is None:
        return None

    audio_outputs = []
    clip_sampler = ConstantClipsPerVideoSampler(
        clip_duration=clip_duration, clips_per_video=clips_per_video
    )

    for audio_path in audio_paths:
        waveform, sr = torchaudio.load(audio_path)
        if sample_rate != sr:
            waveform = torchaudio.functional.resample(
                waveform, orig_freq=sr, new_freq=sample_rate
            )
        all_clips_timepoints = get_clip_timepoints(
            clip_sampler, waveform.size(1) / sample_rate
        )
        all_clips = []
        for clip_timepoints in all_clips_timepoints:
            waveform_clip = waveform[
                :,
                int(clip_timepoints[0] * sample_rate) : int(
                    clip_timepoints[1] * sample_rate
                ),
            ]
            waveform_melspec = waveform2melspec(
                waveform_clip, sample_rate, num_mel_bins, target_length
            )
            all_clips.append(waveform_melspec)

        normalize = transforms.Normalize(mean=mean, std=std)
        all_clips = [normalize(ac).to(device) for ac in all_clips]

        all_clips = torch.stack(all_clips, dim=0)
        audio_outputs.append(all_clips)

    return torch.stack(audio_outputs, dim=0)
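
# Output-shape sketch (the .wav path is a hypothetical placeholder): with the
# defaults above, each file contributes clips_per_video=3 mel-spectrogram
# clips of shape [1, num_mel_bins, target_length]:
#
#   load_and_transform_audio_data(["dog_bark.wav"], "cpu").shape
#   # -> torch.Size([1, 3, 1, 128, 204])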


def crop_boxes(boxes, x_offset, y_offset):
    """
    Perform crop on the bounding boxes given the offsets.
    Args:
        boxes (ndarray or None): bounding boxes to perform crop. The dimension
            is `num boxes` x 4.
        x_offset (int): cropping offset in the x axis.
        y_offset (int): cropping offset in the y axis.
    Returns:
        cropped_boxes (ndarray or None): the cropped boxes with dimension of
            `num boxes` x 4.
    """
    cropped_boxes = boxes.copy()
    cropped_boxes[:, [0, 2]] = boxes[:, [0, 2]] - x_offset
    cropped_boxes[:, [1, 3]] = boxes[:, [1, 3]] - y_offset

    return cropped_boxes
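
# Worked example with hypothetical values: x_offset=10, y_offset=20 shifts the
# box [30, 40, 130, 140] to [20, 20, 120, 120]:
#
#   import numpy as np
#   crop_boxes(np.array([[30.0, 40.0, 130.0, 140.0]]), 10, 20)
#   # -> array([[ 20.,  20., 120., 120.]])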


def uniform_crop(images, size, spatial_idx, boxes=None, scale_size=None):
    """
    Perform uniform spatial sampling on the images and corresponding boxes.
    Args:
        images (tensor): images to perform uniform crop. The dimension is
            `num frames` x `channel` x `height` x `width`.
        size (int): size of height and width to crop the images.
        spatial_idx (int): 0, 1, or 2 for left, center, and right crop if width
            is larger than height. Or 0, 1, or 2 for top, center, and bottom
            crop if height is larger than width.
        boxes (ndarray or None): optional. Corresponding boxes to images.
            Dimension is `num boxes` x 4.
            scale_size (int): optional. If not None, resize the images to scale_size before
            performing any crop.
    Returns:
        cropped (tensor): images with dimension of
            `num frames` x `channel` x `size` x `size`.
        cropped_boxes (ndarray or None): the cropped boxes with dimension of
            `num boxes` x 4.
    """
    assert spatial_idx in [0, 1, 2]
    ndim = len(images.shape)
    if ndim == 3:
        images = images.unsqueeze(0)
    height = images.shape[2]
    width = images.shape[3]

    if scale_size is not None:
        if width <= height:
            width, height = scale_size, int(height / width * scale_size)
        else:
            width, height = int(width / height * scale_size), scale_size
        images = torch.nn.functional.interpolate(
            images,
            size=(height, width),
            mode="bilinear",
            align_corners=False,
        )

    y_offset = int(math.ceil((height - size) / 2))
    x_offset = int(math.ceil((width - size) / 2))

    if height > width:
        if spatial_idx == 0:
            y_offset = 0
        elif spatial_idx == 2:
            y_offset = height - size
    else:
        if spatial_idx == 0:
            x_offset = 0
        elif spatial_idx == 2:
            x_offset = width - size

    cropped = images[:, :, y_offset : y_offset + size, x_offset : x_offset + size]
    cropped_boxes = crop_boxes(boxes, x_offset, y_offset) if boxes is not None else None
    if ndim == 3:
        cropped = cropped.squeeze(0)
    return cropped, cropped_boxes


class SpatialCrop(nn.Module):
    """
    Convert the video into 3 smaller clips spatially. Must be used after the
        temporal crops to get spatial crops, and should be used with
        -2 in the spatial crop at the slowfast augmentation stage (so full
        frames are passed in here). Will return a larger list with the
        3x spatial crops as well.
    r=   rh   	crop_size	num_cropsc                    sN   t    || _|dkrg d| _g | _d S |dkr#dg| _g | _d S td)Nrh   r   r   zNothing else supported yet)super__init__r   crops_to_extflipped_crops_to_extNotImplementedError)selfr   r   	__class__r   r   r      s   



zSpatialCrop.__init__c                 C   s   t |ts	J dtdd |D sJ dg }|D ]0}| jD ]}|t|| j|d  q| js2qtj	
|}| jD ]}|t|| j|d  q;q|S )z
        Args:
            videos: A list of C, T, H, W videos.
        Returns:
            videos: A list with 3x the number of elements. Each video converted
                to C, T, H', W' by spatial cropping.
        z-Must be a list of videos after temporal cropsc                 S   s   g | ]}|j d kqS )   )r   )r[   videor   r   r   r_     s    z'SpatialCrop.forward.<locals>.<listcomp>zMust be (C,T,H,W)r   )
isinstancelistallr   r4   r   r   r   r   r*   hflip)r   videosresr   r   flipped_videor   r   r   forward  s   

zSpatialCrop.forward)r=   rh   )__name__


class NormalizeVideo:
    def __init__(self, mean, std, inplace=False):
        self.mean = mean
        self.std = std
        self.inplace = inplace

    def __call__(self, clip):
        # Normalize a (C, T, H, W) clip in place or on a clone, broadcasting
        # per-channel statistics over the remaining dimensions.
        if not self.inplace:
            clip = clip.clone()
        mean = torch.as_tensor(self.mean, dtype=clip.dtype, device=clip.device)
        std = torch.as_tensor(self.std, dtype=clip.dtype, device=clip.device)
        clip.sub_(mean[:, None, None, None]).div_(std[:, None, None, None])
        return clip
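
# Broadcast sketch (values are illustrative): the per-channel statistics are
# expanded over (C, T, H, W), so clips of any temporal length normalize
# without reshaping:
#
#   nv = NormalizeVideo(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
#   nv(torch.rand(3, 8, 224, 224)).shape
#   # -> torch.Size([3, 8, 224, 224])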


def load_and_transform_video_data(
    video_paths,
    device,
    clip_duration=2,
    clips_per_video=5,
    sample_rate=16000,
):
    if video_paths is None:
        return None

    video_outputs = []
    video_transform = transforms.Compose(
        [
            pv_transforms.ShortSideScale(224),
            NormalizeVideo(
                mean=(0.48145466, 0.4578275, 0.40821073),
                std=(0.26862954, 0.26130258, 0.27577711),
            ),
        ]
    )

    clip_sampler = ConstantClipsPerVideoSampler(
        clip_duration=clip_duration, clips_per_video=clips_per_video
    )
    frame_sampler = pv_transforms.UniformTemporalSubsample(num_samples=clip_duration)

    for video_path in video_paths:
        video = EncodedVideo.from_path(
            video_path,
            decoder="decord",
            decode_audio=False,
            **{"sample_rate": sample_rate},
        )

        all_clips_timepoints = get_clip_timepoints(clip_sampler, video.duration)

        all_video = []
        for clip_timepoints in all_clips_timepoints:
            # Read the clip and pull out its frames.
            clip = video.get_clip(clip_timepoints[0], clip_timepoints[1])
            if clip is None:
                raise ValueError("No clip found")
            video_clip = frame_sampler(clip["video"])
            video_clip = video_clip / 255.0  # since this is float, need 0-1

            all_video.append(video_clip)

        all_video = [video_transform(clip) for clip in all_video]
        all_video = SpatialCrop(224, num_crops=3)(all_video)

        all_video = torch.stack(all_video, dim=0)
        video_outputs.append(all_video)

    return torch.stack(video_outputs, dim=0).to(device)