o
    iY                     @   s   d Z ddlmZ ddlmZmZ ddlZddlm	Z	 ddl
mZ ddlmZ dd	lmZmZmZ dd
lmZ eeZe rBddlZeddG dd deZdgZdS )z
Processor class for SAM2.
    deepcopy)OptionalUnionN   )
ImageInput)ProcessorMixin)BatchEncoding)
TensorTypeis_torch_availablelogging)requires)torch)backendsc                       s  e Zd ZdZdgZdZd1dee def fdd	Z							d2d
ee	 dee	 dee
eeeee    ejf  dee
eeee   ejf  dee
eeee   ejf  dee
eee  ejf  dee
eef  defddZ	d3deddddfddZd4ddZd5ddZd6ddZd d! Zd"d# Z	d5d$e
ejejef d%ed&ed'ed(ee defd)d*Zd7d+d,Z	-	.	-	-	d8d/d0Z  ZS )9Sam2Processora  
    Constructs a SAM2 processor which wraps a SAM2 image processor and an 2D points & Bounding boxes processor into a
    single processor.

    [`Sam2Processor`] offers all the functionalities of [`Sam2ImageProcessorFast`] and [`Sam2VideoProcessor`]. See the docstring of
    [`~Sam2ImageProcessorFast.__call__`] and [`~Sam2VideoProcessor.__call__`] for more information.

    Args:
        image_processor (`Sam2ImageProcessorFast`):
            An instance of [`Sam2ImageProcessorFast`].
        target_size (`int`, *optional*):
            The target size (target_size, target_size) to which the image will be resized.
        point_pad_value (`int`, *optional*, defaults to -10):
            The value used for padding input points.
    image_processorSam2ImageProcessorFastNtarget_sizepoint_pad_valuec                    s>   t  j|fi | || _|d ur|| _d S | jjd | _d S )Nheight)super__init__r   r   sizer   )selfr   r   r   kwargs	__class__ e/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/sam2/processing_sam2.pyr   :   s   $zSam2Processor.__init__imagessegmentation_mapsinput_pointsinput_labelsinput_boxesoriginal_sizesreturn_tensorsreturnc                    s:  |dur| j |f||d|}	n|dur*t|tjr!|  }td|i|d}	ntd|	d }|durHt|dkrHt|t|krHtd|dusU|dusU|dur| j	|dd	d
dd}
| j	|dddd}| j	|ddddd}|
dur| 
|
dd }|dur| 
|dd }|dur| 
|dd  |
dur|dur||krtd|durt|dkrt fdd|D rtd|
dur| |
|dg }tj|tjd}| j||dd |	d|i |dur| ||}tj|tjd}|	d|i |durtj|tjd}| j||dd |	d|i |	S )a  
        This method uses [`Sam2ImageProcessorFast.__call__`] method to prepare image(s) for the model. It also prepares 2D
        points and bounding boxes for the model if they are provided.

        Args:
            images (`ImageInput`, *optional*):
                The image(s) to process.
            segmentation_maps (`ImageInput`, *optional*):
                The segmentation maps to process.
            input_points (`list[list[list[list[float]]]]`, `torch.Tensor`, *optional*):
                The points to add to the frame.
            input_labels (`list[list[list[int]]]`, `torch.Tensor`, *optional*):
                The labels for the points.
            input_boxes (`list[list[list[float]]]`, `torch.Tensor`, *optional*):
                The bounding boxes to add to the frame.
            original_sizes (`list[list[float]]`, `torch.Tensor`, *optional*):
                The original sizes of the images.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return.
            **kwargs:
                Additional keyword arguments to pass to the image processor.

        Returns:
            A [`BatchEncoding`] with the following fields:
            - `pixel_values` (`torch.Tensor`): The processed image(s).
            - `original_sizes` (`list[list[float]]`): The original sizes of the images.
            - `reshaped_input_sizes` (`torch.Tensor`): The reshaped input sizes of the images.
            - `labels` (`torch.Tensor`): The processed segmentation maps (if provided).
            - `input_points` (`torch.Tensor`): The processed points.
            - `input_labels` (`torch.Tensor`): The processed labels.
            - `input_boxes` (`torch.Tensor`): The processed bounding boxes.
        N)r!   r&   r%   )tensor_typez0Either images or original_sizes must be provided   z{original_sizes must be of length 1 or len(images). If you are passing a single image, you must pass a single original_size.   pointsz;[image level, object level, point level, point coordinates]   )expected_depth
input_nameexpected_formatexpected_coord_sizer   labelsz([image level, object level, point level])r-   r.   r/   boxesz)[image level, box level, box coordinates]zbInput points and labels have inconsistent dimensions. Please ensure they have the same dimensions.c                 3   s     | ]}t | d  k V  qdS r)   N)len).0	img_boxesboxes_max_dimsr   r   	<genexpr>   s    z)Sam2Processor.__call__.<locals>.<genexpr>zInput boxes have inconsistent dimensions that would require padding, but boxes cannot be padded due to model limitations. Please ensure all images have the same number of boxes.)dtypeT)preserve_paddingr"   r#   is_bounding_boxr$   )r   
isinstancer   Tensorcputolistr	   
ValueErrorr4   _validate_single_input_get_nested_dimensionsany_pad_nested_listtensorfloat32_normalize_tensor_coordinatesupdateint64)r   r    r!   r"   r#   r$   r%   r&   r   encoding_image_processorprocessed_pointsprocessed_labelsprocessed_boxespoints_max_dimslabels_max_dimspadded_pointsfinal_pointspadded_labelsfinal_labelsfinal_boxesr   r7   r   __call__?   s   +$	
zSam2Processor.__call__Fcoordsztorch.Tensorc           	      C   sl   |\}}||}}t | }|r|ddd}|d ||  |d< |d ||  |d< |r4|dd}|S )a  
        Expects a numpy array of length 2 in the final dimension. Requires the original image size in (H, W) format.

        Args:
            target_size (`int`):
                The target size of the image.
            coords (`torch.Tensor`):
                The coordinates to be normalized.
            original_size (`tuple`):
                The original size of the image.
            is_bounding_box (`bool`, *optional*, defaults to `False`):
                Whether the coordinates are bounding boxes.
        r,   ).r   ).r)   r*   )r   floatreshape)	r   r   rX   original_sizer=   old_hold_wnew_hnew_wr   r   r   _normalize_coordinates   s   
z$Sam2Processor._normalize_coordinatesr   c                    s   |du rdS t |tjr* d kst|jdkr|  S  fdd|D S t |tjrL d ks=t|jdkrA| S  fdd|D S t |t	rb krW|S  fdd|D S t |t
tfrk|S tdt| )aS  
        Recursively convert various input formats (tensors, numpy arrays, lists) to nested lists.

        Args:
            data: Input data in any format
            expected_depth: Expected nesting depth
            current_depth: Current depth in recursion

        Returns:
            Nested list representation of the data
        Nr,   c                       g | ]} | d  qS r)   _convert_to_nested_listr5   itemcurrent_depthr-   r   r   r   
<listcomp>       z9Sam2Processor._convert_to_nested_list.<locals>.<listcomp>c                    rb   rc   rd   rf   rh   r   r   rj      rk   c                    rb   rc   rd   rf   rh   r   r   rj     rk   zUnsupported data type: )r>   r   r?   r4   shapenumpyrA   npndarraylistintrZ   rB   type)r   datar-   ri   r   rh   r   re      s"   
z%Sam2Processor._convert_to_nested_listc                 C   s   |du rg }t |ts|S t|dkr|t| nt|d t||d< t|dkr_|D ]0}t |tr^| |}t|D ]\}}|d t|krP|| q>t||d  |||d < q>q.|S )a`  
        Get the maximum dimensions at each level of nesting.

        Args:
            nested_list (`list`):
                Nested list structure.
            max_dims (`list`, *optional*):
                Current maximum dimensions (for recursion).

        Returns:
            `list`: A list of maximum dimensions for each nesting level.
        Nr   r)   )r>   rp   r4   appendmaxrD   	enumerate)r   nested_listmax_dimsrg   sub_dimsidimr   r   r   rD     s"   


z$Sam2Processor._get_nested_dimensionsc           	         sV  |du r| j }|t|kr|S t|ts|g}t|}|| }|t|d kr2||g||   nQ|dkrg|t|d k rM||d d }| || n	|g||d    | fddt|| D  n||d d }| || | fddt|D  |t|d k rtt|D ]}t|| tr| || ||d |||< q|S )a  
        Recursively pad a nested list to match target dimensions.

        Args:
            nested_list (`list`):
                Nested list to pad.
            target_dims (`list`):
                Target dimensions for each level.
            current_level (`int`, *optional*, defaults to 0):
                Current nesting level.
            pad_value (`int`, *optional*):
                Value to use for padding.

        Returns:
            `list`: The padded nested list.
        Nr)   r   r,   c                       g | ]}t  qS r   r   r5   _templater   r   rj   [      z2Sam2Processor._pad_nested_list.<locals>.<listcomp>c                    r|   r   r   r}   r   r   r   rj   `  r   )r   r4   r>   rp   extend_create_empty_nested_structurerangerF   )	r   rw   target_dimscurrent_level	pad_valuecurrent_sizer   template_dimsrz   r   r   r   rF   ,  s2   
"zSam2Processor._pad_nested_listc                    s8   t  dkrg d  S  fddt d D S )a  
        Create an empty nested structure with given dimensions filled with pad_value.

        Args:
            dims (`list`):
                The dimensions of the nested structure.
            pad_value (`int`):
                The value to fill the structure with.
        r)   r   c                    s    g | ]}  d d qS r3   )r   r}   dimsr   r   r   r   rj   w  s     z@Sam2Processor._create_empty_nested_structure.<locals>.<listcomp>)r4   r   )r   r   r   r   r   r   r   j  s   
z,Sam2Processor._create_empty_nested_structurec                 C   sL   t |trt|dkrdS d| |d  S t |tjtjfr$t|jS dS )z
        Get the nesting level of a list structure.

        Args:
            input_list (`list`):
                The list to get the nesting level of.
        r   r)   )	r>   rp   r4   _get_nesting_levelrn   ro   r   r?   rl   )r   
input_listr   r   r   r   y  s   

z Sam2Processor._get_nesting_levelrs   r-   r.   r/   r0   c              
   C   s   |du rdS t |tjtjfrH|j|kr&td| d| d| d|j d	|durB|jd |krBtd| d| d|jd  d	| ||S t |t	rm| 
|}||krgtd| d
| d| d| d	| ||S dS )a  
                Validate a single input by ensuring proper nesting and raising an error if the input is not valid.

                Args:
                    data (`torch.Tensor`, `np.ndarray`, or `list`):
                        Input data to process.
                    expected_depth (`int`):
                        Expected nesting depth.
                    input_name (`str`):
                        Name of the input for error messages.
                    expected_format (`str`):
                        The expected format of the input.
                    expected_coord_size (`int`, *optional*):
                        Expected coordinate size (2 for points, 4 for boxes, None for labels).
        .
        NzInput z must be a tensor/array with z, dimensions. The expected nesting format is z. Got z dimensions.rY   z as the last dimension, got .z must be a nested list with z( levels. The expected nesting format is z levels.)r>   r   r?   rn   ro   ndimrB   rl   re   rp   r   )r   rs   r-   r.   r/   r0   ri   r   r   r   rC     s*   


z$Sam2Processor._validate_single_inputc                 C   s   |r|| j k}|jddd}tt|D ]>}||jd k rR|t|k r'|| n|d }| j| j|| ||d}	|rN|| }
t|
	|| |	|| ||< q|	||< qdS )a  
        Helper method to normalize coordinates in a tensor across multiple images.

        Args:
            tensor (`torch.Tensor`):
                Input tensor with coordinates.
            original_sizes (`list`):
                Original image sizes.
            is_bounding_box (`bool`, *optional*, defaults to `False`):
                Whether coordinates are bounding boxes.
            preserve_padding (`bool`, *optional*, defaults to `False`):
                Whether to preserve padding values (for points).
        rY   T)r{   keepdimr   r<   N)
r   allr   r4   rl   ra   r   r   where	expand_as)r   rG   r%   r=   r;   mask
coord_maskimg_idxr\   normalized_coordsimg_maskr   r   r   rI     s"   

z+Sam2Processor._normalize_tensor_coordinates        Tc           	      K   s    | j j|||||||fi |S )a-  
        Remove padding and upscale masks to the original image size.

        Args:
            masks (`Union[List[torch.Tensor], List[np.ndarray]]`):
                Batched masks from the mask_decoder in (batch_size, num_channels, height, width) format.
            original_sizes (`Union[torch.Tensor, List[Tuple[int,int]]]`):
                The original sizes of each image before it was resized to the model's expected input shape, in (height,
                width) format.
            mask_threshold (`float`, *optional*, defaults to 0.0):
                Threshold for binarization and post-processing operations.
            binarize (`bool`, *optional*, defaults to `True`):
                Whether to binarize the masks.
            max_hole_area (`float`, *optional*, defaults to 0.0):
                The maximum area of a hole to fill.
            max_sprinkle_area (`float`, *optional*, defaults to 0.0):
                The maximum area of a sprinkle to fill.
            apply_non_overlapping_constraints (`bool`, *optional*, defaults to `False`):
                Whether to apply non-overlapping constraints to the masks.

        Returns:
            (`torch.Tensor`): Batched masks in batch_size, num_channels, height, width) format, where (height, width)
            is given by original_size.
        )r   post_process_masks)	r   masksr%   mask_thresholdbinarizemax_hole_areamax_sprinkle_area!apply_non_overlapping_constraintsr   r   r   r   r     s   #z Sam2Processor.post_process_masks)Nr   )NNNNNNN)F)r   )N)r   N)FF)r   Tr   r   F)__name__
__module____qualname____doc__
attributesimage_processor_classr   rq   r   r   r   rp   rZ   r   r?   strr
   r	   rW   ra   re   rD   rF   r   r   rn   ro   rC   rI   r   __classcell__r   r   r   r   r   %   s     

 


&
%>

2'r   )r   copyr   typingr   r   rm   rn   image_utilsr   processing_utilsr   tokenization_utils_baser	   utilsr
   r   r   utils.import_utilsr   
get_loggerr   loggerr   r   __all__r   r   r   r   <module>   s$   
   
k