o
    i95                     @   s$  d Z ddlmZmZ ddlZddlmZmZm	Z	 ddl
mZmZmZmZ ddlmZmZmZmZmZmZmZmZ ddlmZmZmZmZmZmZ dd	lm Z  e rUddl!Z!e r\ddl"Z"e#e$Z%d
d Z&		ddej'dee( dee( deee(ef  fddZ)e ddG dd deZ*dgZ+dS )z%Image processor class for LayoutLMv2.    )OptionalUnionN   )BaseImageProcessorBatchFeatureget_size_dict)flip_channel_orderresizeto_channel_dimension_formatto_pil_image)ChannelDimension
ImageInputPILImageResamplinginfer_channel_dimension_formatmake_flat_list_of_imagesto_numpy_arrayvalid_imagesvalidate_preprocess_arguments)
TensorTypefilter_out_non_signature_kwargsis_pytesseract_availableis_vision_availableloggingrequires_backends)requiresc                 C   sL   t d| d |  t d| d |  t d| d |  t d| d |  gS )Ni  r         r   )int)boxwidthheight r!   n/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/layoutlmv2/image_processing_layoutlmv2.pynormalize_box6   s
   r#   imagelangtesseract_configinput_data_formatc                    sb  |dur|nd}t | |d}|j\}}tj||d|d}|d |d |d |d	 |d
 f\}}	}
}}dd t|D   fddt|D } fddt|	D }	 fddt|
D }
 fddt|D } fddt|D }g }t|	|
||D ]\}}}}|||| || g}|| q{g }|D ]}|t||| qt|t|ksJ d||fS )zdApplies Tesseract OCR on a document image, and returns recognized words + normalized bounding boxes.N r'   dict)r%   output_typeconfigtextlefttopr   r    c                 S   s   g | ]
\}}|  s|qS r!   )strip.0idxwordr!   r!   r"   
<listcomp>O       z#apply_tesseract.<locals>.<listcomp>c                       g | ]
\}}| vr|qS r!   r!   r1   irrelevant_indicesr!   r"   r5   P   r6   c                    r7   r!   r!   r2   r3   coordr8   r!   r"   r5   Q   r6   c                    r7   r!   r!   r:   r8   r!   r"   r5   R   r6   c                    r7   r!   r!   r:   r8   r!   r"   r5   S   r6   c                    r7   r!   r!   r:   r8   r!   r"   r5   T   r6   z-Not as many words as there are bounding boxes)	r   sizepytesseractimage_to_data	enumeratezipappendr#   len)r$   r%   r&   r'   	pil_imageimage_widthimage_heightdatawordsr.   r/   r   r    actual_boxesxywh
actual_boxnormalized_boxesr   r!   r8   r"   apply_tesseract?   s(   
,rO   )vision)backendsc                       sF  e Zd ZdZdgZddejdddfdedee	e
ef  ded	ed
ee
 dee
 ddf fddZejddfdejde	e
ef dedeee
ef  deee
ef  dejfddZe dddddddejdf	dedee dee	e
ef  dee d	ee d
ee
 dee
 deee
ef  dedeee
ef  dejjfddZ  ZS )LayoutLMv2ImageProcessora  
    Constructs a LayoutLMv2 image processor.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions to `(size["height"], size["width"])`. Can be
            overridden by `do_resize` in `preprocess`.
        size (`dict[str, int]` *optional*, defaults to `{"height": 224, "width": 224}`):
            Size of the image after resizing. Can be overridden by `size` in `preprocess`.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
            Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
            `preprocess` method.
        apply_ocr (`bool`, *optional*, defaults to `True`):
            Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by
            `apply_ocr` in `preprocess`.
        ocr_lang (`str`, *optional*):
            The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
            used. Can be overridden by `ocr_lang` in `preprocess`.
        tesseract_config (`str`, *optional*, defaults to `""`):
            Any additional custom configuration flags that are forwarded to the `config` parameter when calling
            Tesseract. For example: '--psm 6'. Can be overridden by `tesseract_config` in `preprocess`.
    pixel_valuesTNr(   	do_resizer<   resample	apply_ocrocr_langr&   returnc                    sX   t  jdi | |d ur|nddd}t|}|| _|| _|| _|| _|| _|| _d S )N   )r    r   r!   )	super__init__r   rT   r<   rU   rV   rW   r&   )selfrT   r<   rU   rV   rW   r&   kwargs	__class__r!   r"   r[      s   

z!LayoutLMv2ImageProcessor.__init__r$   data_formatr'   c                 K   sT   t |}d|vsd|vrtd|  |d |d f}t|f||||d|S )a  
        Resize an image to `(size["height"], size["width"])`.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`dict[str, int]`):
                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
            data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

        Returns:
            `np.ndarray`: The resized image.
        r    r   zFThe `size` dictionary must contain the keys `height` and `width`. Got )r<   rU   r`   r'   )r   
ValueErrorkeysr	   )r\   r$   r<   rU   r`   r'   r]   output_sizer!   r!   r"   r	      s   #zLayoutLMv2ImageProcessor.resizeimagesreturn_tensorsc                    sn  |dur|nj }durnjtdurnj|dur%|nj}|dur.|nj}|dur7|nj}t|}t|sFt	dt
|d dd |D }du r^t|d |rtd g }g }|D ]}t|||d\}}|| || qk|rfd	d|D }fd
d|D } fdd|D }td|i|d}|r||d< ||d< |S )a  
        Preprocess an image or batch of images.

        Args:
            images (`ImageInput`):
                Image to preprocess.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`dict[str, int]`, *optional*, defaults to `self.size`):
                Desired size of the output image after resizing.
            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the enum `PIL.Image` resampling
                filter. Only has an effect if `do_resize` is set to `True`.
            apply_ocr (`bool`, *optional*, defaults to `self.apply_ocr`):
                Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes.
            ocr_lang (`str`, *optional*, defaults to `self.ocr_lang`):
                The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
                used.
            tesseract_config (`str`, *optional*, defaults to `self.tesseract_config`):
                Any additional custom configuration flags that are forwarded to the `config` parameter when calling
                Tesseract.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                    - Unset: Return a list of `np.ndarray`.
                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                    - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
        NzkInvalid image type. Must be of type PIL.Image.Image, numpy.ndarray, torch.Tensor, tf.Tensor or jax.ndarray.)rT   r<   rU   c                 S   s   g | ]}t |qS r!   )r   r2   r$   r!   r!   r"   r5     s    z7LayoutLMv2ImageProcessor.preprocess.<locals>.<listcomp>r   r=   r)   c                    s   g | ]}j | d qS ))r$   r<   rU   r'   )r	   rf   )r'   rU   r\   r<   r!   r"   r5     s    c                    s   g | ]}t | d qS )r)   )r   rf   r)   r!   r"   r5   "  s    c                    s   g | ]	}t | d qS ))input_channel_dim)r
   rf   )r`   r'   r!   r"   r5   #  s    rS   )rF   tensor_typerG   boxes)rT   r<   r   rU   rV   rW   r&   r   r   ra   r   r   r   rO   rA   r   )r\   rd   rT   r<   rU   rV   rW   r&   re   r`   r'   words_batchboxes_batchr$   rG   ri   rF   r!   )r`   r'   rU   r\   r<   r"   
preprocess   sR   /

z#LayoutLMv2ImageProcessor.preprocess)__name__
__module____qualname____doc__model_input_namesr   BILINEARboolr   r*   strr   r[   npndarrayr   r   r	   r   FIRSTr   r   PILImagerl   __classcell__r!   r!   r^   r"   rR   f   s    	

0	
rR   )NN),rp   typingr   r   numpyru   image_processing_utilsr   r   r   image_transformsr   r	   r
   r   image_utilsr   r   r   r   r   r   r   r   utilsr   r   r   r   r   r   utils.import_utilsr   rx   r=   
get_loggerrm   loggerr#   rv   rt   rO   rR   __all__r!   r!   r!   r"   <module>   s:   ( 


' 
I