o
    iWM                     @   s  d Z ddlZddlZddlmZmZ ddlZddlm	Z	 ddl
mZmZ ddlmZmZmZmZ ddlmZmZmZmZmZmZmZ dd	lmZmZmZmZ dd
lm Z  e reddl!Z!ddl"m#Z#m$Z$m%Z% e rlddl&Z&e'e(Z)dZ*dd Z+									d'de,de-de,de,de-de-de-de-dee. dee, de#j#fddZ/	d(d ej0d!e,d"eee,e1f  fd#d$Z2G d%d& d&eZ3d&gZ4dS ))z%Image processor class for Pix2Struct.    N)OptionalUnion)hf_hub_download   )BaseImageProcessorBatchFeature)convert_to_rgb	normalizeto_channel_dimension_formatto_pil_image)ChannelDimension
ImageInputget_image_sizeinfer_channel_dimension_formatmake_flat_list_of_imagesto_numpy_arrayvalid_images)
TensorTypeis_torch_availableis_vision_availablelogging)requires_backends)Image	ImageDraw	ImageFontzybelkada/fontsc                 C   s   t tdg | d} tjjj| ||f||fd}|| d| d||d}|	ddddd| d| | d| | d| | }|dS )	a  
    Utility function to extract patches from a given image tensor. Returns a tensor of shape
    (1, `rows`, `columns`, `num_channels`x `patch_height` x `patch_width`).

    Args:
        image_tensor (torch.Tensor):
            The image tensor to extract patches from.
        patch_height (int):
            The height of the patches to extract.
        patch_width (int):
            The width of the patches to extract.
    torchr   )stride         r   )
r   torch_extract_patches	unsqueezer   nn
functionalunfoldreshapesizepermute)image_tensorpatch_heightpatch_widthpatches r-   n/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/pix2struct/image_processing_pix2struct.pyr!   4   s   

r!   $   blackwhite   text	text_size
text_colorbackground_colorleft_paddingright_paddingtop_paddingbottom_padding
font_bytes	font_pathreturnc
                 C   s   t td tjdd}
|
j| d}d|}|dur$|	du r$t|}n|	dur+|	}ntt	d}t
j|d|d	}ttd
d|}|d||\}}}}|| | }|| | }td
||f|}t|}|j||f|||d |S )a  
    Render text. This script is entirely adapted from the original script that can be found here:
    https://github.com/google-research/pix2struct/blob/main/pix2struct/preprocessing/preprocessing_utils.py

    Args:
        text (`str`, *optional*, defaults to ):
            Text to render.
        text_size (`int`, *optional*, defaults to 36):
            Size of the text.
        text_color (`str`, *optional*, defaults to `"black"`):
            Color of the text.
        background_color (`str`, *optional*, defaults to `"white"`):
            Color of the background.
        left_padding (`int`, *optional*, defaults to 5):
            Padding on the left.
        right_padding (`int`, *optional*, defaults to 5):
            Padding on the right.
        top_padding (`int`, *optional*, defaults to 5):
            Padding on the top.
        bottom_padding (`int`, *optional*, defaults to 5):
            Padding on the bottom.
        font_bytes (`bytes`, *optional*):
            Bytes of the font to use. If `None`, the default font will be used.
        font_path (`str`, *optional*):
            Path to the font to use. If `None`, the default font will be used.
    visionP   )width)r3   
Nz	Arial.TTFzUTF-8)encodingr'   RGB)r   r   r   r   )xyr3   fillfont)r   render_texttextwrapTextWrapperwrapjoinioBytesIOr   DEFAULT_FONT_PATHr   truetyper   Drawr   newtextbboxr3   )r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   wrapperlineswrapped_textrG   	temp_draw_
text_widthtext_heightimage_widthimage_heightimagedrawr-   r-   r.   rH   O   s$   
&


rH   r]   headerinput_data_formatc           	      K   s   t td t| |d} t|fi |}t|j| j}t| j|| j  }t|j||j  }t	d||| fd}|
|||fd |
| ||fd|f t|}t|tjkrbt|tj}|S )a  
    Renders the input text as a header on the input image.

    Args:
        image (`np.ndarray`):
            The image to render the header on.
        header (`str`):
            The header text.
        data_format (`Union[ChannelDimension, str]`, *optional*):
            The data format of the image. Can be either "ChannelDimension.channels_first" or
            "ChannelDimension.channels_last".

    Returns:
        `np.ndarray`: The image with the header rendered.
    r>   )r`   rC   r1   rD   r   )r   render_headerr   rH   maxr@   intheightr   rR   pasteresizer   r   r   LASTr
   )	r]   r_   r`   kwargsheader_image	new_width
new_heightnew_header_height	new_imager-   r-   r.   ra      s   
ra   c                       sB  e Zd ZdZddgZ					dded	ed
eeee	f  de	deddf fddZ
	ddejde	d
edeeeef  dejf
ddZ		ddejdeeeef  deeeef  dejfddZddddddejdfdedee dee d	ee dee	 d
eeee	f  deeeef  dedeeeef  defddZ  ZS )Pix2StructImageProcessoraj  
    Constructs a Pix2Struct image processor.

    Args:
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method. According to Pix2Struct paper and code, the image is normalized with its own mean and standard
            deviation.
        patch_size (`dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
            The patch size to use for the image. According to Pix2Struct paper and code, the patch size is 16x16.
        max_patches (`int`, *optional*, defaults to 2048):
            The maximum number of patches to extract from the image as per the [Pix2Struct
            paper](https://huggingface.co/papers/2210.03347).
        is_vqa (`bool`, *optional*, defaults to `False`):
            Whether or not the image processor is for the VQA task. If `True` and `header_text` is passed in, text is
            rendered onto the input images.
    flattened_patchesattention_maskTN   Fdo_convert_rgbdo_normalize
patch_sizemax_patchesis_vqar=   c                    sF   t  jdi | |d ur|nddd| _|| _|| _|| _|| _d S )N   )rd   r@   r-   )super__init__rt   rs   rr   ru   rv   )selfrr   rs   rt   ru   rv   rh   	__class__r-   r.   ry      s   	
z!Pix2StructImageProcessor.__init__r]   r`   c              	   K   s  t | jd t|tj|}t|}|d |d }}t|tj\}}	t	|||  ||	  }
t
tt|
| | |d}t
tt|
|	 | |d}t
|| d}t
|| d}tjjj|d||fdddd	d}t|||}|j}|d }|d
 }|d }||| |g}t||dgd||| dg}t|d|g|d|| dg}|d7 }|d7 }|tj}|tj}t|||gd}tjj|ddd|||  g }t|}|S )a  
        Extract flattened patches from an image.

        Args:
            image (`np.ndarray`):
                Image to extract flattened patches from.
            max_patches (`int`):
                Maximum number of patches to extract.
            patch_size (`dict`):
                Dictionary containing the patch height and width.

        Returns:
            result (`np.ndarray`):
                A sequence of `max_patches` flattened patches.
        r   rd   r@   r   r   bilinearFT)r'   modealign_corners	antialiasr    r   r   )r   extract_flattened_patchesr
   r   FIRSTr   
from_numpyr   mathsqrtrb   minfloorr#   r$   interpolater"   squeezer!   shaper&   arangerepeattofloat32catpadfloatr   )rz   r]   ru   rt   r`   rh   r*   r+   r\   r[   scalenum_feasible_rowsnum_feasible_colsresized_heightresized_widthr,   patches_shaperowscolumnsdepthrow_idscol_idsresultr-   r-   r.   r      sF   
	**$z2Pix2StructImageProcessor.extract_flattened_patchesdata_formatc                 K   sb   |j tjkr|tj}t|}t|}t|dt	t
|j }t|f||||d|S )a  
        Normalize an image. image = (image - image_mean) / image_std.

        The image std is to mimic the tensorflow implementation of the `per_image_standardization`:
        https://www.tensorflow.org/api_docs/python/tf/image/per_image_standardization

        Args:
            image (`np.ndarray`):
                Image to normalize.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
        g      ?)meanstdr   r`   )dtypenpuint8astyper   r   r   rb   r   r   prodr   r	   )rz   r]   r   r`   rh   r   r   adjusted_stddevr-   r-   r.   r	   5  s   

z"Pix2StructImageProcessor.normalizeimagesheader_textreturn_tensorsc
                    sb  |dur|nj }|dur|nj}durnjdur!njj}|
ddur2tdt|}t|s>td|rGdd |D }dd |D }du rXt	|d |rdu rbtd	|

d
d |

ddttrzgt|  fddt|D }|rfdd|D }fdd|D }dd |D }t||d|d}|S )a  
        Preprocess an image or batch of images. The processor first computes the maximum possible number of
        aspect-ratio preserving patches of size `patch_size` that can be extracted from the image. It then pads the
        image with zeros to make the image respect the constraint of `max_patches`. Before extracting the patches the
        images are standardized following the tensorflow implementation of `per_image_standardization`
        (https://www.tensorflow.org/api_docs/python/tf/image/per_image_standardization).


        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images.
            header_text (`Union[list[str], str]`, *optional*):
                Text to render as a header. Only has an effect if `image_processor.is_vqa` is `True`.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            max_patches (`int`, *optional*, defaults to `self.max_patches`):
                Maximum number of patches to extract.
            patch_size (`dict`, *optional*, defaults to `self.patch_size`):
                Dictionary containing the patch height and width.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                    - Unset: Return a list of `np.ndarray`.
                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        Nr   z8data_format is not an accepted input as the outputs are zkInvalid image type. Must be of type PIL.Image.Image, numpy.ndarray, torch.Tensor, tf.Tensor or jax.ndarray.c                 S      g | ]}t |qS r-   )r   .0r]   r-   r-   r.   
<listcomp>      z7Pix2StructImageProcessor.preprocess.<locals>.<listcomp>c                 S   r   r-   )r   r   r-   r-   r.   r     r   r   z.A header text must be provided for VQA models.r;   r<   c                    s$   g | ]\}}t ||  d qS ))r;   r<   )ra   )r   ir]   )r;   r<   r   r-   r.   r     s    c                    s   g | ]	}j | d qS ))r]   r`   )r	   r   )r`   rz   r-   r.   r     s    c                    s   g | ]}j | d qS ))r]   ru   rt   r`   )r   r   )r`   ru   rt   rz   r-   r.   r     s    c                 S   s$   g | ]}|j d ddktjqS )r   )axisr   )sumr   r   r   r   r-   r-   r.   r     s   $ )ro   rp   )datatensor_type)rs   rr   rt   ru   rv   get
ValueErrorr   r   r   pop
isinstancestrlen	enumerater   )rz   r   r   rr   rs   ru   rt   r   r   r`   rh   rv   attention_masksencoded_outputsr-   )r;   r<   r   r`   ru   rt   rz   r.   
preprocess\  sJ   5

z#Pix2StructImageProcessor.preprocess)TTNrq   FN)NN)__name__
__module____qualname____doc__model_input_namesboolr   dictr   rc   ry   r   ndarrayr   r   r   r	   r   r   r   r   __classcell__r-   r-   r{   r.   rn      s    
T
*	
rn   )	r/   r0   r1   r2   r2   r2   r2   NNr   )5r   rM   r   typingr   r   numpyr   huggingface_hubr   image_processing_utilsr   r   image_transformsr   r	   r
   r   image_utilsr   r   r   r   r   r   r   utilsr   r   r   r   utils.import_utilsr   rI   PILr   r   r   r   
get_loggerr   loggerrO   r!   r   rc   bytesrH   r   ChildProcessErrorra   rn   __all__r-   r-   r-   r.   <module>   s~   $	
	

E
*  
