o
    i{                     @   s  d Z ddlZddlmZmZ ddlZddlmZm	Z	m
Z
 ddlmZmZmZ ddlmZmZmZmZmZmZmZmZmZmZ ddlmZmZmZmZmZm Z m!Z! e rWddl"Z"e #e$Z%d	ee&e&e  e&e ef d
e&e&e  fddZ'G dd de	Z(G dd deZ)dgZ*dS )zImage processor class for Fuyu.    N)OptionalUnion   )BaseImageProcessorBatchFeatureget_size_dict)padresizeto_channel_dimension_format)
ChannelDimension
ImageInputPILImageResamplingget_image_sizeinfer_channel_dimension_formatis_scaled_imageis_valid_imagemake_list_of_imagesto_numpy_arrayvalidate_preprocess_arguments)
TensorTypefilter_out_non_signature_kwargsis_torch_availableis_torch_deviceis_torch_dtypeloggingrequires_backendsimagesreturnc                 C   sP   t | r| ggS t| trtdd | D r| S t| tr$dd | D S td)Nc                 s   s    | ]}t |tV  qd S N)
isinstancelist.0image r$   b/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/fuyu/image_processing_fuyu.py	<genexpr>@   s    z.make_list_of_list_of_images.<locals>.<genexpr>c                 S      g | ]}t |qS r$   )r   r!   r$   r$   r%   
<listcomp>D       z/make_list_of_list_of_images.<locals>.<listcomp>zHimages must be a list of list of images or a list of images or an image.)r   r   r    all
ValueError)r   r$   r$   r%   make_list_of_list_of_images:   s   
r,   c                   @   s6   e Zd ZdZd
deeeef  fddZddd	Z	dS )FuyuBatchFeaturez
    BatchFeature class for Fuyu image processor and processor.

    The outputs dictionary from the processors contains a mix of tensors and lists of tensors.
    Ntensor_typec                    s   |du r| S | j |d\fdd  fdd|  D ]3\}t|tr<t|d tr<fdd	|D | < q t|trMfd
d	|D | < q || < q | S )a5  
        Convert the inner content to tensors.

        Args:
            tensor_type (`str` or [`~utils.TensorType`], *optional*):
                The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If
                `None`, no modification is done.
        N)r.   c                    s   | r| S  | S r   r$   elem)	as_tensor	is_tensorr$   r%   _convert_tensor^   s   z<FuyuBatchFeature.convert_to_tensors.<locals>._convert_tensorc                    s*   z | W S    dkrt dt d)Noverflowing_valueszKUnable to create tensor returning overflowing values of different lengths. zUnable to create tensor, you should probably activate padding with 'padding=True' to have batched tensors with the same length.)r+   r/   )r3   keyr$   r%   _safe_convert_tensorc   s   
zAFuyuBatchFeature.convert_to_tensors.<locals>._safe_convert_tensorr   c                    s   g | ]} fd d|D qS )c                       g | ]} |qS r$   r$   r"   r0   r6   r$   r%   r(   r   r)   zBFuyuBatchFeature.convert_to_tensors.<locals>.<listcomp>.<listcomp>r$   )r"   elemsr9   r$   r%   r(   r   s    z7FuyuBatchFeature.convert_to_tensors.<locals>.<listcomp>c                    r7   r$   r$   r8   r9   r$   r%   r(   u   r)   )_get_is_as_tensor_fnsitemsr   r    )selfr.   valuer$   )r3   r6   r1   r2   r5   r%   convert_to_tensorsP   s   	
z#FuyuBatchFeature.convert_to_tensorsr   r   c           	         s"  t | dg ddli }ddu r?tdkr?d }t|r$nt|ts2t|s2t|tr5|n
t	dt| dfdd | 
 D ]?\}}t|trtt|d trtg }|D ]}| fd	d
|D  q`|||< qLt|tr fdd
|D ||< qL |||< qL|| _| S )a  
        Send all values to device by calling `v.to(*args, **kwargs)` (PyTorch only). This should support casting in
        different `dtypes` and sending the `BatchFeature` to a different `device`.

        Args:
            args (`Tuple`):
                Will be passed to the `to(...)` function of the tensors.
            kwargs (`Dict`, *optional*):
                Will be passed to the `to(...)` function of the tensors.

        Returns:
            [`BatchFeature`]: The same instance after modification.
        torchr   Ndevicez*Attempting to cast a BatchFeature to type z. This is not supported.c                    s2    | r| j i S d ur| jdS | S )N)rA   )is_floating_pointtor/   )argsrA   kwargsr@   r$   r%   _to   s
   
z FuyuBatchFeature.to.<locals>._toc                    r7   r$   r$   r8   rF   r$   r%   r(      r)   z'FuyuBatchFeature.to.<locals>.<listcomp>c                    r7   r$   r$   r8   rG   r$   r%   r(      r)   )r   r@   getlenr   r   strr   intr+   r<   r    appenddata)	r=   rD   rE   new_dataargkvnew_vr:   r$   )rF   rD   rA   rE   r@   r%   rC   {   s.   


zFuyuBatchFeature.tor   )r   r   )
__name__
__module____qualname____doc__r   r   rJ   r   r?   rC   r$   r$   r$   r%   r-   I   s    +r-   c                !       s  e Zd ZdZg dZddejdddddddddfd	ed
ee	e
ef  dededede
dedeeee f deeee f dededee	e
ef  f fddZejddfdejd
e	e
ef dedeee
ef  deee
ef  dejfddZ				d4dejd
e	e
ef de
dedeee
ef  deee
ef  dejfdd Ze ddddddddddddejddfd	ee d
ee	e
ef  dee dee dee dee
 dee dee dee dee dee dee	e
ef  deee
ef  deee
ef  d!ee fd"d#Zd5d$ed%edee	e
ef  defd&d'Zd5dd(dee	e
ef  dd(fd)d*Z	d5d+d(d,d(d-d(d.d(d/ed0ed1edee	e
ef  defd2d3Z  ZS )6FuyuImageProcessora	  
    This class should handle the image processing part before the main FuyuForCausalLM. In particular, it should
    handle:

    - Processing Images:
        Taking a batch of images as input. If the images are variable-sized, it resizes them based on the desired patch
        dimensions. The image output is always img_h, img_w of (1080, 1920)

        Then, it patches up these images using the patchify_image function.

    - Creating Image Input IDs:
        For each patch, a placeholder ID is given to identify where these patches belong in a token sequence. For
        variable-sized images, each line of patches is terminated with a newline ID.

    - Image Patch Indices:
        For each image patch, the code maintains an index where these patches should be inserted in a token stream.


    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image to `size`.
        size (`dict[str, int]`, *optional*, defaults to `{"height": 1080, "width": 1920}`):
            Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
            `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
        do_pad (`bool`, *optional*, defaults to `True`):
            Whether to pad the image to `size`.
        padding_value (`float`, *optional*, defaults to 1.0):
            The value to pad the image with.
        padding_mode (`str`, *optional*, defaults to `"constant"`):
            The padding mode to use when padding the image.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image.
        image_mean (`float`, *optional*, defaults to 0.5):
            The mean to use when normalizing the image.
        image_std (`float`, *optional*, defaults to 0.5):
            The standard deviation to use when normalizing the image.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image.
        rescale_factor (`float`, *optional*, defaults to `1 / 255`):
            The factor to use when rescaling the image.
        patch_size (`dict[str, int]`, *optional*, defaults to `{"height": 30, "width": 30}`):
            Dictionary in the format `{"height": int, "width": int}` specifying the size of the patches.
    r   image_input_idsimage_patchesimage_patch_indices_per_batch#image_patch_indices_per_subsequenceTN      ?constantg      ?gp?	do_resizesizeresampledo_padpadding_valuepadding_modedo_normalize
image_mean	image_std
do_rescalerescale_factor
patch_sizec                    s   t  jdi | || _|d ur|nddd| _|| _|| _|| _|| _|| _|| _	|	| _
|
| _|| _|d ur<|| _d S ddd| _d S )Ni8  i  )heightwidth   r$   )super__init__r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   )r=   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rE   	__class__r$   r%   ro      s   "zFuyuImageProcessor.__init__r#   data_formatinput_data_formatr   c                 K   s   t ||\}}|d |d }	}
||
kr||	kr|S |	| }|
| }t||}t|| }t|| }td|||f|||d|}|S )a  
        Resize an image to `(size["height"], size["width"])`.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`dict[str, int]`):
                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
            data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

        Returns:
            `np.ndarray`: The resized image.
        rk   rl   )r#   r`   ra   rr   rs   Nr$   )r   minrK   r	   )r=   r#   r`   ra   rr   rs   rE   image_heightimage_widthtarget_heighttarget_widthheight_scale_factorwidth_scale_factoroptimal_scale_factor
new_height	new_widthscaled_imager$   r$   r%   r	   
  s&   #
zFuyuImageProcessor.resizemodeconstant_valuesc                 C   s\   t ||\}}|d |d }	}
d}d}|	| }|
| }t|||f||ff||||d}|S )a  
        Pad an image to `(size["height"], size["width"])`.

        Args:
            image (`np.ndarray`):
                Image to pad.
            size (`dict[str, int]`):
                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
            data_format (`ChannelDimension` or `str`, *optional*):
                The data format of the output image. If unset, the same format as the input image is used.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
        rk   rl   r   )paddingr   r   rr   rs   )r   r   )r=   r#   r`   r   r   rr   rs   ru   rv   rw   rx   padding_toppadding_leftpadding_bottompadding_rightpadded_imager$   r$   r%   	pad_imageD  s   zFuyuImageProcessor.pad_imagereturn_tensorsc              
      s  |dur|nj }durnj|dur|nj}|dur!|nj}|dur*|nj}dur3nj|dur<|nj}durEnjdurNnjdurWnj	dur`nj
|duri|nj}durrnj|dur{|nj}t|trtdd |D rtdt|}t||||d dd |D }dd |D d	 d	 }|rt|rtd
 du rt|fdd|D }t|r߇fdd|D }fdd|D }dd |D }dd |D }dd t||D }|rfdd|D }|rfdd|D }|r-fdd|D } dur< fdd|D }||||d}t||dS )a  

        Utility function to preprocess the images and extract necessary information about original formats.

        Args:
            images (`ImageInput`):
                Images to preprocess. Expects a single image, a list or images or a list of lists of images. Pixel
                values range from 0 to 255, or between 0 and 1 if `do_rescale` is `False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image to `size`.
            size (`dict[str, int]`, *optional*, defaults to `self.size`):
                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
            do_pad (`bool`, *optional*, defaults to `self.do_pad`):
                Whether to pad the image to `size`.
            padding_value (`float`, *optional*, defaults to `self.padding_value`):
                The value to pad the image with.
            padding_mode (`str`, *optional*, defaults to `self.padding_mode`):
                The padding mode to use when padding the image.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float`, *optional*, defaults to `self.image_mean`):
                The mean to use when normalizing the image.
            image_std (`float`, *optional*, defaults to `self.image_std`):
                The standard deviation to use when normalizing the image.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                The factor to use when rescaling the image.
            patch_size (`dict[str, int]`, *optional*, defaults to `self.patch_size`):
                Dictionary in the format `{"height": int, "width": int}` specifying the size of the patches.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format of the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
        Nc                 s   s&    | ]}t |tot|d kV  qdS )   N)r   r    rI   r8   r$   r$   r%   r&     s   $ z0FuyuImageProcessor.preprocess.<locals>.<genexpr>z:Multiple images for a single sample are not yet supported.)rh   ri   re   rf   rg   r_   r`   ra   c                 S   s   g | ]	}d d |D qS )c                 S   r'   r$   )r   r!   r$   r$   r%   r(     r)   <FuyuImageProcessor.preprocess.<locals>.<listcomp>.<listcomp>r$   r"   r   r$   r$   r%   r(     s    z1FuyuImageProcessor.preprocess.<locals>.<listcomp>c                 S   s   g | ]}|r|qS r$   r$   r   r$   r$   r%   r(     r)   r   zIt looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.c                        g | ]}|rt |d   dqS r   )channel_dimr   r   rs   r$   r%   r(     s
    c                    "   g | ]} fd d|D qS )c                    s   g | ]
}j | d qS ))r`   rs   )r	   r!   rs   r=   r`   r$   r%   r(         r   r$   r   r   r$   r%   r(         c                    r   r   r   r   r   r$   r%   r(     s     c                 S      g | ]}|d  gqS r   r$   r"   
image_sizer$   r$   r%   r(         c                 S   r   )   r$   r   r$   r$   r%   r(     r   c                 S   s"   g | ]\}}|d  |d   gqS r   r$   )r"   original_sizeresized_sizer$   r$   r%   r(     s    c                    s&   g | ]} fd d|D qS )c              	      s    g | ]}j | d qS ))r`   r   r   rs   )r   r!   rs   rd   rc   r=   r`   r$   r%   r(     s    r   r$   r   r   r$   r%   r(     s    c                    r   )c                    s   g | ]
}j | d qS ))scalers   )rescaler!   rs   ri   r=   r$   r%   r(     r   r   r$   r   r   r$   r%   r(     r   c                    s$   g | ]} fd d|D qS )c                    s   g | ]}j | d qS ))meanstdrs   )	normalizer!   rf   rg   rs   r=   r$   r%   r(     s    r   r$   r   r   r$   r%   r(   
  s    c                    s    g | ]} fd d|D qS )c                    s   g | ]}t | qS r$   )r
   r!   rr   rs   r$   r%   r(     s    r   r$   r   r   r$   r%   r(     s    )r   image_unpadded_heightsimage_unpadded_widthsimage_scale_factors)rM   r.   )r_   r`   ra   rb   rh   ri   re   rf   rg   rc   rd   rj   r   r    anyr+   r,   r   r   loggerwarning_oncer   r   zipr-   )r=   r   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rr   rs   r   batch_imagesfirst_image_in_listoriginal_image_sizesimage_sizesr   r   r   rM   r$   )	rr   rf   rg   rs   rd   rc   ri   r=   r`   r%   
preprocessj  s   F

zFuyuImageProcessor.preprocessru   rv   c           	      C   s   |dur|n| j }| j d | j d }}|| dkr$td|d| || dkr4td|d| || }|| }|| }|S )a  
        Calculate number of patches required to encode an image.

        Args:
            image_height (`int`):
                Height of the image.
            image_width (`int`):
                Width of the image.
            patch_size (`dict[str, int]`, *optional*, defaults to `self.patch_size`):
                Dictionary in the format `{"height": int, "width": int}` specifying the size of the patches.
        Nrk   rl   r   zimage_height=z must be divisible by zimage_width=)rj   r+   )	r=   ru   rv   rj   patch_heightpatch_widthnum_patches_per_dim_hnum_patches_per_dim_wnum_patchesr$   r$   r%   get_num_patches   s   z"FuyuImageProcessor.get_num_patchesztorch.Tensorc           
      C   s   t | dg |dur|n| j}|d |d }}|j\}}}}|d||}|d||}	|	 }	|	||d||}	|	dddd	d
}	|	|d|| | }	|	S )a|  
        Convert an image into a tensor of patches.

        Args:
            image (`torch.Tensor`):
                Image to convert. Shape: [batch, channels, height, width]
            patch_size (`dict[str, int]`, *optional*, defaults to `self.patch_size`):
                Dictionary in the format `{"height": int, "width": int}` specifying the size of the patches.
        r@   Nrk   rl   r   r   r      r   )r   rj   shapeunfold
contiguousviewpermutereshape)
r=   r#   rj   r   r   
batch_sizechannels_unfolded_along_heightpatchesr$   r$   r%   patchify_image9  s   
z!FuyuImageProcessor.patchify_imageimage_inputimage_presentimage_unpadded_himage_unpadded_wimage_placeholder_idimage_newline_idvariable_sizedc	           '   	   C   s  t | dg |dur|n| j}|d |d }	}
g }g }g }t|jd D ]}g }g }t|jd D ]}|||f r|||f }|jd |jd }}|rt|t|||f |	 |	 }t|t|||f |
 |
 }|ddd|d|f }||}}| j||d}tj	|g|tj
|jd	}| j|dd
d}||jd ksJ |r|d||
 }tj	|jd dg|tj
|jd	}tj||gdd}|d}||g || || q2|tjg tj
|jd	 q2|| || q%g }g }|D ][}d}g }g } |D ]F}!|!|k}"t|"}tj|tj|!jd	|!}#t|!d}$t|!d}%tj|"ddd }&|#| |$|&< |#|%|&< ||$ | |% ||7 }q	|| ||  qt|||||ddS )a  Process images for model input. In particular, variable-sized images are handled here.

        Args:
            image_input (`torch.Tensor` of shape [batch_size, subsequence_size, num_channels, height, width]):
                Tensor of images padded to model input size.
            image_present (`torch.Tensor` of shape [batch_size, subsequence_size, num_images]):
                Tensor of 1s and 0s indicating whether an image is present.
            image_unpadded_h (`torch.Tensor` of shape [batch_size, subsequence_size]):
                Tensor of unpadded image heights.
            image_unpadded_w (`torch.Tensor` of shape [batch_size, subsequence_size]):
                Tensor of unpadded image widths.
            image_placeholder_id (int):
                The id of the image placeholder token. Comes from an associated tokenizer.
            image_newline_id (int):
                The id of the image newline token. Comes from an associated tokenizer.
            variable_sized (bool):
                Whether to process images as variable-sized.
            patch_size (`dict[str, int]`, *optional*, defaults to `self.patch_size`):
                Size of the patches.
        r@   Nrk   rl   r   r   r   )ru   rv   )dtyperA   )r#   r   )dimT)as_tuplerX   )rM   )r   rj   ranger   rt   mathceilr   r@   fullint32rA   r   	unsqueezesqueezer   catrL   tensorcount_nonzeroarangeint64type_as	full_likenonzeror-   )'r=   r   r   r   r   r   r   r   rj   r   r   r   batch_image_patchesbatch_image_input_idsbatch_indexrY   rZ   subseq_indexr#   ru   rv   new_hnew_wr   tensor_of_image_idsr   newline_idsr[   r\   sample_image_input_idsindex_offsetper_batch_indicesper_subsequence_indicessubseq_image_input_idspatches_maskindicesindices_in_stream_per_batch!indices_in_stream_per_subsequencepatches_indsr$   r$   r%   preprocess_with_tokenizer_infoS  s   







z1FuyuImageProcessor.preprocess_with_tokenizer_info)r^   r]   NNr   )rS   rT   rU   rV   model_input_namesr   BILINEARboolr   dictrJ   rK   floatr   r    ro   npndarrayr   r	   r   r   FIRSTr   r   r   r   r-   r   __classcell__r$   r$   rp   r%   rW      s"   -
	
"

>

&	
 (6$#	
rW   )+rV   r   typingr   r   numpyr   image_processing_utilsr   r   r   image_transformsr   r	   r
   image_utilsr   r   r   r   r   r   r   r   r   r   utilsr   r   r   r   r   r   r   r@   
get_loggerrS   r   r    r,   r-   rW   __all__r$   r$   r$   r%   <module>   s.   0$


m    
&