o
    ۷i`                     @   s  d dl mZ d dlmZ d dlmZmZ d dlmZm	Z	m
Z
mZ d dlZddlmZmZmZ ddlmZmZmZmZmZ dd	lmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z# dd
l$m%Z% ddl&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z, ddl-m.Z. e+ r{ddlm/Z/ e) rd dl0Z0e* rd dl1m2Z3 ddlm4Z4 ndZ4e,5e6Z7edddddddddddddej8fde	e9 de	e: de	e9 de	ee:e;e: f  de	ee:e;e: f  de	e9 de	e de	e9 de	e de	d de	ee<e'f  defdd Z=d8d!d"d#e	e> d$d"fd%d&Z?d'ee d$e;e fd(d)Z@d*e;d" d$eAe>d+f fd,d-ZBd.eejCd"f d/e>d$e;eejCd"f  fd0d1ZDG d2d3 d3e
d4d5ZEe(G d6d7 d7eZFdS )9    )Iterable)deepcopy)	lru_cachepartial)AnyOptional	TypedDictUnionN   )BaseImageProcessorBatchFeatureget_size_dict)convert_to_rgbget_resize_output_image_sizeget_size_with_aspect_ratiogroup_images_by_shapereorder_images)ChannelDimension
ImageInput	ImageTypeSizeDictget_image_size#get_image_size_for_max_height_widthget_image_typeinfer_channel_dimension_formatmake_flat_list_of_imagesvalidate_kwargsvalidate_preprocess_arguments)Unpack)
TensorTypeauto_docstringis_torch_availableis_torchvision_availableis_vision_availablelogging)is_rocm_platform)PILImageResampling)
functional)pil_torch_interpolation_mapping
   maxsize
do_rescalerescale_factordo_normalize
image_mean	image_stddo_center_crop	crop_size	do_resizesizeinterpolationF.InterpolationModereturn_tensorsdata_formatc                 C   sJ   t | |||||||||	d
 |
dur|
dkrtd|tjkr#tddS )z
    Checks validity of typically used arguments in an `ImageProcessorFast` `preprocess` method.
    Raises `ValueError` if arguments incompatibility is caught.
    )
r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   Nptz6Only returning PyTorch tensors is currently supported.z6Only channel first data format is currently supported.)r   
ValueErrorr   FIRST)r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r7   r8    r<   ^/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/image_processing_utils_fast.py"validate_fast_preprocess_argumentsI   s"   
r>   tensortorch.Tensoraxisreturnc                 C   s6   |du r|   S z| j |dW S  ty   |  Y S w )zF
    Squeezes a tensor, but only if the axis specified has dim 1.
    N)rA   )squeezer:   )r?   rA   r<   r<   r=   safe_squeezep   s   rD   valuesc                 C   s   dd t |  D S )zO
    Return the maximum value across all indices of an iterable of values.
    c                 S   s   g | ]}t |qS r<   )max).0values_ir<   r<   r=   
<listcomp>       z&max_across_indices.<locals>.<listcomp>)zip)rE   r<   r<   r=   max_across_indices}   s   rL   images.c                 C   s    t dd | D \}}}||fS )zH
    Get the maximum height and width across all images in a batch.
    c                 S   s   g | ]}|j qS r<   )shaperG   imgr<   r<   r=   rI      s    z(get_max_height_width.<locals>.<listcomp>)rL   )rM   _
max_height	max_widthr<   r<   r=   get_max_height_width   s   rT   image
patch_sizec                 C   sj   g }t | tjd\}}td||D ]!}td||D ]}| dd||| ||| f }|| qq|S )a6  
    Divides an image into patches of a specified size.

    Args:
        image (`Union[np.array, "torch.Tensor"]`):
            The input image.
        patch_size (`int`):
            The size of each patch.
    Returns:
        list: A list of Union[np.array, "torch.Tensor"] representing the patches.
    )channel_dimr   N)r   r   r;   rangeappend)rU   rV   patchesheightwidthijpatchr<   r<   r=   divide_to_patches   s   "r`   c                   @   s>  e Zd ZU ee ed< eeeef  ed< ee ed< ee	d  ed< ee ed< eeeef  ed< ee ed< ee	ee
f  ed	< ee ed
< ee	e
ee
 f  ed< ee	e
ee
 f  ed< ee ed< eeeef  ed< ee ed< ee	eef  ed< ee ed< ee	eef  ed< ed ed< ee ed< dS )DefaultFastImageProcessorKwargsr3   r4   default_to_square)r&   r6   resampler1   r2   r,   r-   r.   r/   r0   do_padpad_sizedo_convert_rgbr7   r8   input_data_formattorch.devicedevicedisable_groupingN)__name__
__module____qualname__r   bool__annotations__dictstrintr	   floatlistr   r   r<   r<   r<   r=   ra      s(   
 ra   F)totalc                !       s  e Zd ZdZdZdZdZdZdZdZ	dZ
dZdZdZdZdZdZdZejZdZdZdgZeZdZdee f fddZedefd	d
Z					d[ddde de!e" de!e# dede!e ddfddZ$		d\ddde de!d deddf
ddZ%e&		d\ddde'e"e"f de!d deddf
dd Z(ddd!e)ddfd"d#Z*ddd$e+e)e,e) f d%e+e)e,e) f ddfd&d'Z-e.d(d)						d]d*e!e d+e!e+e)e/e) f  d,e!e+e)e/e) f  d-e!e d.e!e) d/e!d0 de'fd1d2Z0ddd-ed.e)d*ed+e+e)e/e) f d,e+e)e/e) f ddfd3d4Z1ddde ddfd5d6Z2de3de3fd7d8Z4de5fd9d:Z6	;d^de3d<e"de3fd=d>Z7			d_de3d?e!e d@e!e+e#ef  d/e!d0 ddf
dAdBZ8				;d`de3d?e!e d@e!e+e#ef  d/e!d0 d<e"de/d fdCdDZ9							dade!e  dEe!e  de!e  dFe!e d+e!e+e)e/e) f  d,e!e+e)e/e) f  dGe!e de5fdHdIZ:												dbd-e!e d.e!e) d*e!e d+e!e+e)e'e) f  d,e!e+e)e'e) f  dJe!e de!e  dKe!e dEe!e  de!d dLe!e+e#e;f  dGe!e fdMdNZ<de3dee de=fdOdPZ>e?de3dee de=fdQdRZ@ddSde3d?ed@ed/e!e+e#d0f  dee de=fdTdUZAde/d dJede de!d dKedEe d-ed.e)d*ed+e!e+e)e/e) f  d,e!e+e)e/e) f  dVe!e de!e  de!e dLe!e+e#e;f  de=f dWdXZB fdYdZZC  ZDS )cBaseImageProcessorFastNTgp?pixel_valueskwargsc              	      s   t  jdi | | |}|d| j}|d ur$t||d| jdnd | _|d| j}|d ur8t|ddnd | _|d| j}|d urLt|ddnd | _| j	j
D ]}||d }|d urft| || qSt| |tt| |d  qSt| j	j
 | _d S )	Nr4   rb   r4   rb   r2   
param_namere   r4   r{   r<   )super__init__filter_out_unused_kwargspopr4   r   rb   r2   re   valid_kwargsro   setattrr   getattrrt   keys_valid_kwargs_names)selfrx   r4   r2   re   keykwarg	__class__r<   r=   r~      s"   
zBaseImageProcessorFast.__init__rB   c                 C   s   dS )zv
        `bool`: Whether or not this image processor is a fast processor (backed by PyTorch and TorchVision).
        Tr<   )r   r<   r<   r=   is_fast   s   zBaseImageProcessorFast.is_fastr   constantFrM   r@   re   
fill_valuepadding_modereturn_maskrj   c                 K   sX  |dur|j r
|jstd| d|j |jf}nt|}t||d\}}	i }
i }| D ]l\}}|jdd }|d |d  }|d |d  }|dk sP|dk r[td| d	| d||krndd||f}tj||||d
}||
|< |rt	j
|t	jdddddddf }d|dd|d d|d f< |||< q-t|
|	}|rt||	}||fS |S )a  
        Pads images to `(pad_size["height"], pad_size["width"])` or to the largest size in the batch.

        Args:
            images (`torch.Tensor`):
                Images to pad.
            pad_size (`SizeDict`, *optional*):
                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
            fill_value (`int`, *optional*, defaults to `0`):
                The constant value used to fill the padded area.
            padding_mode (`str`, *optional*, defaults to "constant"):
                The padding mode to use. Can be any of the modes supported by
                `torch.nn.functional.pad` (e.g. constant, reflection, replication).
            return_mask (`bool`, *optional*, defaults to `False`):
                Whether to return a pixel mask to denote padded regions.
            disable_grouping (`bool`, *optional*, defaults to `False`):
                Whether to disable grouping of images by size.

        Returns:
            `torch.Tensor`: The resized image.
        NzCPad size must contain 'height' and 'width' keys only. Got pad_size=.rj   r   r
   zrPadding dimensions are negative. Please make sure that the `pad_size` is larger than the image size. Got pad_size=z, image_size=)fillr   dtype.)r[   r\   r:   rT   r   itemsrN   Fpadtorch
zeros_likeint64r   )r   rM   re   r   r   r   rj   rx   grouped_imagesgrouped_images_indexprocessed_images_groupedprocessed_masks_groupedrN   stacked_images
image_sizepadding_heightpadding_widthpaddingstacked_masksprocessed_imagesprocessed_masksr<   r<   r=   r      sD   $

zBaseImageProcessorFast.padrU   r4   r5   r6   	antialiasc                 K   s   |dur|nt jj}|jr|jrt| dd |j|j}n8|jr-t||jdtj	d}n*|j
rB|jrBt| dd |j
|j}n|jrO|jrO|j|jf}ntd| dtj rgt rg| ||||S t j||||dS )a@  
        Resize an image to `(size["height"], size["width"])`.

        Args:
            image (`torch.Tensor`):
                Image to resize.
            size (`SizeDict`):
                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
            interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
                `InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BICUBIC`.

        Returns:
            `torch.Tensor`: The resized image.
        Nr   F)r4   rb   rg   zjSize must contain 'height' and 'width' keys, or 'max_height' and 'max_width', or 'shortest_edge' key. Got r   r5   r   )r   InterpolationModeBILINEARshortest_edgelongest_edger   r4   r   r   r;   rR   rS   r   r[   r\   r:   r   compileris_compilingr%   compile_friendly_resizeresize)r   rU   r4   r5   r   rx   new_sizer<   r<   r=   r   7  s4   zBaseImageProcessorFast.resizer   c                 C   s   | j tjkr5|  d } tj| |||d} | d } t| dkd| } t| dk d| } |  tj} | S tj| |||d} | S )z{
        A wrapper around `F.resize` so that it is compatible with torch.compile when the image is a uint8 tensor.
           r      r   )	r   r   uint8rs   r   r   whereroundto)rU   r   r5   r   r<   r<   r=   r   m  s   
z.BaseImageProcessorFast.compile_friendly_resizescalec                 K   s   || S )a?  
        Rescale an image by a scale factor. image = image * scale.

        Args:
            image (`torch.Tensor`):
                Image to rescale.
            scale (`float`):
                The scaling factor to rescale pixel values by.

        Returns:
            `torch.Tensor`: The rescaled image.
        r<   )r   rU   r   rx   r<   r<   r=   rescale  s   zBaseImageProcessorFast.rescalemeanstdc                 K   s   t |||S )a  
        Normalize an image. image = (image - image_mean) / image_std.

        Args:
            image (`torch.Tensor`):
                Image to normalize.
            mean (`torch.Tensor`, `float` or `Iterable[float]`):
                Image mean to use for normalization.
            std (`torch.Tensor`, `float` or `Iterable[float]`):
                Image standard deviation to use for normalization.

        Returns:
            `torch.Tensor`: The normalized image.
        )r   	normalize)r   rU   r   r   rx   r<   r<   r=   r     s   z BaseImageProcessorFast.normalizer)   r*   r.   r/   r0   r,   r-   ri   rh   c                 C   sB   |r|rt j||dd|  }t j||dd|  }d}|||fS )Nri   g      ?F)r   r?   )r   r.   r/   r0   r,   r-   ri   r<   r<   r=   !_fuse_mean_std_and_rescale_factor  s
   

z8BaseImageProcessorFast._fuse_mean_std_and_rescale_factorc                 C   sR   | j ||||||jd\}}}|r| |jtjd||}|S |r'| ||}|S )z/
        Rescale and normalize images.
        )r.   r/   r0   r,   r-   ri   r   )r   ri   r   r   r   float32r   )r   rM   r,   r-   r.   r/   r0   r<   r<   r=   rescale_and_normalize  s   	z,BaseImageProcessorFast.rescale_and_normalizec                 K   s"  |j du s
|jdu rtd|  |jdd \}}|j |j}}||ks+||krx||kr5|| d nd||kr@|| d nd||krM|| d d nd||krZ|| d d ndg}tj||dd}|jdd \}}||krx||krx|S t|| d }	t|| d }
t||	|
||S )	a  
        Note: override torchvision's center_crop to have the same behavior as the slow processor.
        Center crop an image to `(size["height"], size["width"])`. If the input size is smaller than `crop_size` along
        any edge, the image is padded with 0's and then center cropped.

        Args:
            image (`"torch.Tensor"`):
                Image to center crop.
            size (`dict[str, int]`):
                Size of the output image.

        Returns:
            `torch.Tensor`: The center cropped image.
        Nz=The size dictionary must have keys 'height' and 'width'. Got r      r   r
   )r   g       @)	r[   r\   r:   r   rN   r   r   rr   crop)r   rU   r4   rx   image_heightimage_widthcrop_height
crop_widthpadding_ltrbcrop_top	crop_leftr<   r<   r=   center_crop  s"   z"BaseImageProcessorFast.center_cropc                 C   s   t |S )a'  
        Converts an image to RGB format. Only converts if the image is of type PIL.Image.Image, otherwise returns the image
        as is.
        Args:
            image (ImageInput):
                The image to convert.

        Returns:
            ImageInput: The converted image.
        )r   )r   rU   r<   r<   r=   r     s   z%BaseImageProcessorFast.convert_to_rgbc                 C   sB   | j du r|S | j D ]}||v rtd| d || q
|S )zJ
        Filter out the unused kwargs from the kwargs dictionary.
        Nz!This processor does not use the `z ` parameter. It will be ignored.)unused_kwargsloggerwarning_oncer   )r   rx   
kwarg_namer<   r<   r=   r     s   


z/BaseImageProcessorFast.filter_out_unused_kwargs   expected_ndimsc                 C   s   |  |}t||dS )z
        Prepare the images structure for processing.

        Args:
            images (`ImageInput`):
                The input images to process.

        Returns:
            `ImageInput`: The images with a valid nesting.
        r   )fetch_imagesr   )r   rM   r   r<   r<   r=   _prepare_images_structure$  s   
z0BaseImageProcessorFast._prepare_images_structurerf   rg   c                 C   s   t |}|tjtjtjfvrtd| |r| |}|tjkr't|}n|tjkr3t	
| }|jdkr=|d}|d u rEt|}|tjkrS|ddd }|d ur\||}|S )NzUnsupported input image type r   r   r
   )r   r   PILTORCHNUMPYr:   r   r   pil_to_tensorr   
from_numpy
contiguousndim	unsqueezer   r   LASTpermuter   )r   rU   rf   rg   ri   
image_typer<   r<   r=   _process_image7  s$   






z%BaseImageProcessorFast._process_imagec                    sn   | j ||d}t| j|||d t|dkot|d ttf}|r, fdd|D }|S  fdd|D }|S )a  
        Prepare image-like inputs for processing.

        Args:
            images (`ImageInput`):
                The image-like inputs to process.
            do_convert_rgb (`bool`, *optional*):
                Whether to convert the images to RGB.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The input data format of the images.
            device (`torch.device`, *optional*):
                The device to put the processed images on.
            expected_ndims (`int`, *optional*):
                The expected number of dimensions for the images. (can be 2 for segmentation maps etc.)

        Returns:
            List[`torch.Tensor`]: The processed images.
        r   rf   rg   ri   r   c                    s   g | ]} fd d|D qS )c                       g | ]} |qS r<   r<   rO   process_image_partialr<   r=   rI     rJ   zPBaseImageProcessorFast._prepare_image_like_inputs.<locals>.<listcomp>.<listcomp>r<   )rG   nested_listr   r<   r=   rI     s    zEBaseImageProcessorFast._prepare_image_like_inputs.<locals>.<listcomp>c                    r   r<   r<   rO   r   r<   r=   rI     rJ   )r   r   r   len
isinstancert   tuple)r   rM   rf   rg   ri   r   has_nested_structurer   r<   r   r=   _prepare_image_like_inputs]  s   
z1BaseImageProcessorFast._prepare_image_like_inputsr2   rb   r8   c           
      K   s   |du ri }|durt di t||d}|dur$t di t|dd}|dur3t di t|dd}t|tr<t|}t|trEt|}|du rLtj}||d< ||d< ||d< ||d< ||d	< ||d
< |d}	t|	tt	frtt
|	 n|	|d< |S )z
        Update kwargs that need further processing before being validated
        Can be overridden by subclasses to customize the processing of kwargs.
        Nry   r2   rz   re   r|   r4   r/   r0   r8   rc   r5   r<   )r   r   r   rt   r   r   r;   r   r&   rr   r(   )
r   r4   r2   re   rb   r/   r0   r8   rx   rc   r<   r<   r=   _further_process_kwargs  s0   


z.BaseImageProcessorFast._further_process_kwargsr3   r1   r7   c                 K   s$   t |||||||||	|
||d dS )z@
        validate the kwargs for the preprocess method.
        )r,   r-   r.   r/   r0   r3   r4   r1   r2   r5   r7   r8   N)r>   )r   r,   r-   r.   r/   r0   r3   r4   r1   r2   r5   r7   r8   rx   r<   r<   r=   _validate_preprocess_kwargs  s   
z2BaseImageProcessorFast._validate_preprocess_kwargsc                 O   s   | j |g|R i |S N)
preprocess)r   rM   argsrx   r<   r<   r=   __call__  s   zBaseImageProcessorFast.__call__c                 O   s   t | | jd | jD ]}||t| |d  q|d}|d}|d}| jdi |}| jdi | |d | j|g|R |||d|S )N)captured_kwargsvalid_processor_keysrf   rg   ri   r8   r   r<   )	r   r   r   
setdefaultr   r   r   r   _preprocess_image_like_inputs)r   rM   r   rx   r   rf   rg   ri   r<   r<   r=   r     s$   




z!BaseImageProcessorFast.preprocessr   c                O   s*   | j ||||d}| j|g|R i |S )z
        Preprocess image-like inputs.
        To be overridden by subclasses when image-like inputs other than images should be processed.
        It can be used for segmentation maps, depth maps, etc.
        )rM   rf   rg   ri   )r   _preprocess)r   rM   rf   rg   ri   r   rx   r<   r<   r=   r     s   z4BaseImageProcessorFast._preprocess_image_like_inputsrd   c              	   K   s   t ||d\}}i }| D ]\}}|r| j|||d}|||< qt||}t ||d\}}i }| D ]\}}|r@| ||}| ||||	|
|}|||< q4t||}|r^| j|||d}|rgtj|ddn|}t	d|i|dS )Nr   )rU   r4   r5   )re   rj   r   )dimrw   )datatensor_type)
r   r   r   r   r   r   r   r   stackr   )r   rM   r3   r4   r5   r1   r2   r,   r-   r.   r/   r0   rd   re   rj   r7   rx   r   r   resized_images_groupedrN   r   resized_imagesr   r   r<   r<   r=   r     s*   



z"BaseImageProcessorFast._preprocessc                    s&   t   }|dd  |dd  |S )N_valid_processor_keysr   )r}   to_dictr   )r   encoder_dictr   r<   r=   r   =  s   
zBaseImageProcessorFast.to_dict)Nr   r   FF)NT)NNNNNN)r   )NNN)NNNr   )NNNNNNN)NNNNNNNNNNNN)Erk   rl   rm   rc   r/   r0   r4   rb   r2   r3   r1   rd   re   r,   r-   r.   rf   r7   r   r;   r8   rg   ri   model_input_namesra   r   r   r   r~   propertyrn   r   r   r   rr   rq   r   r   staticmethodr   r   rs   r   r	   r   r   r   rt   r   r   r   r   r   rp   r   r   r   r   r   r   r   r   r   r    r   r   r   r   __classcell__r<   r<   r   r=   rv      sB   		
H
6




)


)
.

2	

" 
	

0rv   r   )Gcollections.abcr   copyr   	functoolsr   r   typingr   r   r   r	   numpynpimage_processing_utilsr   r   r   image_transformsr   r   r   r   r   image_utilsr   r   r   r   r   r   r   r   r   r   r   processing_utilsr   utilsr   r    r!   r"   r#   r$   utils.import_utilsr%   r&   r   torchvision.transforms.v2r'   r   r(   
get_loggerrk   r   r;   rn   rs   rt   rq   r>   rr   rD   rL   r   rT   ndarrayr`   ra   rv   r<   r<   r<   r=   <module>   s   4 
	
&

