o
    eil                     @   s   d Z ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
mZmZ ddlmZmZmZmZ dd	lmZ eeZe r@ddlZd
d Zdd Zdd Zdd Zdd Zdd Zdd ZeddeG dd deZdgZ dS )z
Processor class for SAM3.
    deepcopyN   )
ImageInput)ProcessorMixin)BatchEncodingPreTokenizedInput	TextInput)
TensorTypeauto_docstringis_torch_availablelogging)requiresc                 C   sL   |  d\}}}}|d|  |d|  |d|  |d|  g}tj|ddS N      ?dimunbindtorchstackxx_cy_cwhb r   f/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/sam3/processing_sam3.pybox_cxcywh_to_xyxy#   s   ,r!   c                 C   s<   |  d\}}}}|d|  |d|  ||g}tj|ddS r   r   r   r   r   r    box_cxcywh_to_xywh)      r"   c                 C   s4   |  d\} }}}| || | || g}tj|ddS Nr   r   r   r   yr   r   r   r   r   r    box_xywh_to_xyxy/      r'   c                 C   s<   |  d\} }}}| d|  |d|  ||g}tj|ddS r   r   r%   r   r   r    box_xywh_to_cxcywh5   r#   r)   c                 C   s4   |  d\} }}}| |||  || g}tj|ddS r$   r   )r   r&   XYr   r   r   r    box_xyxy_to_xywh;   r(   r,   c                 C   sD   |  d\}}}}|| d || d || || g}tj|ddS )Nr      r   r   )r   x0y0x1y1r   r   r   r    box_xyxy_to_cxcywhA   s   $r2   c                 C   s"   |  d\}}}}|| ||  S )z
    Batched version of box area. Boxes should be in [x0, y0, x1, y1] format.

    Inputs:
    - boxes: Tensor of shape (..., 4)

    Returns:
    - areas: Tensor of shape (...,)
    r   )r   )boxesr.   r/   r0   r1   r   r   r    box_areaG   s   
r4   )r   )backendsc                       sx  e Zd Z	d4dedB def fddZe							d5dedB deeB e	e B e	e B dB d	edB d
e	e	e	e
   ejB dB de	e	e	e   ejB dB de	e	e
  ejB dB deeB dB defddZd6d7ddZd8ddZdd Zd9ddZd:ddZdd  Zd!d" Z	d9d#ejejB e	B d$ed%ed&ed'edB de	fd(d)Zd;d*d+Zd<d-d.Zd=d0d1Z	/	,	d>d2d3Z  ZS )?Sam3ProcessorNtarget_sizepoint_pad_valuec                    s@   t  j||fi | || _|dur|| _dS | jjd | _dS )z
        target_size (`int`, *optional*):
            The target size (target_size, target_size) to which the image will be resized.
        point_pad_value (`int`, *optional*, defaults to -10):
            The value used for padding input boxes.
        Nheight)super__init__r9   image_processorsizer8   )selfr=   	tokenizerr8   r9   kwargs	__class__r   r    r<   X   s   	$zSam3Processor.__init__imagestextsegmentation_mapsinput_boxesinput_boxes_labelsoriginal_sizesreturn_tensorsreturnc                 K   s  d}	|dur| j |f||d|}	n!|dur,t|tjr#|  }td|i|d}	n|dur4td| ||}|durS| j	||ddd}
|	durQ|	
|
 n|
}	|dur|	d }| j|d	d
ddd}| j|dddd}|dur{| |dd }|dur| |dd }|dur|dur||krtd|dur| ||dg }tj|tjd}| j||ddd t|}|	
d|i |dur| ||}tj|tjd}|	
d|i |	S )a}  
        images (`ImageInput`, *optional*):
            The image(s) to process.
        text (`str`, `list[str]`, `list[list[str]]`, *optional*):
            The text to process.
        segmentation_maps (`ImageInput`, *optional*):
            The segmentation maps to process.
        input_boxes (`list[list[list[float]]]`, `torch.Tensor`, *optional*):
            The bounding boxes to process.
        input_boxes_labels (`list[list[int]]`, `torch.Tensor`, *optional*):
            The labels for the bounding boxes.
        original_sizes (`list[list[float]]`, `torch.Tensor`, *optional*):
            The original sizes of the images.

        Returns:
            A [`BatchEncoding`] with the following fields:
            - `pixel_values` (`torch.Tensor`): The processed image(s).
            - `original_sizes` (`list[list[float]]`): The original sizes of the images.
            - `labels` (`torch.Tensor`): The processed segmentation maps (if provided).
            - `input_boxes_labels` (`torch.Tensor`): The processed labels for the bounding boxes.
            - `input_boxes` (`torch.Tensor`): The processed bounding boxes.
        N)rF   rJ   rI   )tensor_typezKEither images or original_sizes must be provided if input_boxes is not None
max_length    )rJ   paddingrM   r   r3   z)[image level, box level, box coordinates]   )expected_depth
input_nameexpected_formatexpected_coord_sizer-   labelsz[image level, box level])rQ   rR   rS   zaInput boxes and labels have inconsistent dimensions. Please ensure they have the same dimensions.)dtypeT)is_bounding_boxpreserve_paddingrG   rH   )r=   
isinstancer   Tensorcputolistr   
ValueError_resolve_text_promptsr@   update_validate_single_input_get_nested_dimensions_pad_nested_listtensorfloat32_normalize_tensor_coordinatesr2   int64)r?   rD   rE   rF   rG   rH   rI   rJ   rA   encodingtext_inputsprocessed_boxesprocessed_boxes_labelsboxes_max_dimsboxes_labels_max_dimspadded_boxesfinal_boxespadded_boxes_labelsfinal_boxes_labelsr   r   r    __call__e   sv   "zSam3Processor.__call__Fcoordstorch.Tensorc                 C   sZ   |\}}t | }|r|ddd}|d | |d< |d | |d< |r+|dd}|S )a  
        Expects a numpy array of length 2 in the final dimension. Requires the original image size in (H, W) format.

        Args:
            target_size (`int`):
                The target size of the image.
            coords (`torch.Tensor`):
                The coordinates to be normalized.
            original_size (`tuple`):
                The original size of the image.
            is_bounding_box (`bool`, *optional*, defaults to `False`):
                Whether the coordinates are bounding boxes.
        r   r-   ).r   ).   rP   )r   floatreshape)r?   rr   original_sizerW   old_hold_wr   r   r    _normalize_coordinates   s   z$Sam3Processor._normalize_coordinatesr   c                    s   |du rdS t |tjr* d kst|jdkr|  S  fdd|D S t |tjrL d ks=t|jdkrA| S  fdd|D S t |t	rb krW|S  fdd|D S t |t
tfrk|S tdt| )a  
        Recursively convert various input formats (tensors, numpy arrays, lists) to nested lists.
        Preserves None values within lists.

        Args:
            data: Input data in any format (may be None or contain None values)
            expected_depth: Expected nesting depth
            current_depth: Current depth in recursion

        Returns:
            Nested list representation of the data (or None)
        Nr-   c                       g | ]} | d  qS rt   _convert_to_nested_list.0itemcurrent_depthrQ   r?   r   r    
<listcomp>       z9Sam3Processor._convert_to_nested_list.<locals>.<listcomp>c                    r{   r|   r}   r   r   r   r    r     r   c                    s*   g | ]}|d ur | d nd qS )Nrt   r}   r   r   r   r    r   
  s    zUnsupported data type: )rY   r   rZ   lenshapenumpyr\   npndarraylistintru   r]   type)r?   datarQ   r   r   r   r    r~      s&   
z%Sam3Processor._convert_to_nested_listc                 C   s   |du r
|rdS dS t |ttfs|S t|}|r0t|t|kr0tdt| dt| dt|D ]\}}|du rH|rH|| durHd||< q4|S )zQ
        Resolve text prompts by setting defaults based on prompt types.
        NvisualzEThe number of text prompts must match the number of input boxes. Got z text prompts and z input boxes.)rY   r   tupler   r]   	enumerate)r?   rE   rG   i
text_valuer   r   r    r^     s$   z#Sam3Processor._resolve_text_promptsc                 C   s   |du rg }t |ts|S t|dkr|t| nt|d t||d< t|dkrd|D ]5}|du r5q.t |trc| |}t|D ]\}}|d t|krU|| qCt||d  |||d < qCq.|S )a  
        Get the maximum dimensions at each level of nesting, skipping None values.

        Args:
            nested_list (`list`):
                Nested list structure (may contain None values).
            max_dims (`list`, *optional*):
                Current maximum dimensions (for recursion).

        Returns:
            `list`: A list of maximum dimensions for each nesting level.
        Nr   rt   )rY   r   r   appendmaxra   r   )r?   nested_listmax_dimsr   sub_dimsr   r   r   r   r    ra   .  s&   


z$Sam3Processor._get_nested_dimensionsc           	         s  |du r| j }|t|kr|S t|ts|g}t|}|| }|t|d kr2||g||   nQ|dkrg|t|d k rM||d d }| || n	|g||d    | fddt|| D  n||d d }| || | fddt|D  |t|d k rtt|D ].}|| du r||d d }| ||||< qt|| tr| || ||d |||< q|S )a3  
        Recursively pad a nested list to match target dimensions. Replaces None values with padded structures.

        Args:
            nested_list (`list`):
                Nested list to pad (may contain None values).
            target_dims (`list`):
                Target dimensions for each level.
            current_level (`int`, *optional*, defaults to 0):
                Current nesting level.
            pad_value (`int`, *optional*):
                Value to use for padding.

        Returns:
            `list`: The padded nested list.
        Nrt   r   r-   c                       g | ]}t  qS r   r   r   _templater   r    r         z2Sam3Processor._pad_nested_list.<locals>.<listcomp>c                    r   r   r   r   r   r   r    r     r   )r9   r   rY   r   extend_create_empty_nested_structurerangerb   )	r?   r   target_dimscurrent_level	pad_valuecurrent_sizer8   template_dimsr   r   r   r    rb   V  s8   
"zSam3Processor._pad_nested_listc                    s8   t  dkrg d  S  fddt d D S )a  
        Create an empty nested structure with given dimensions filled with pad_value.

        Args:
            dims (`list`):
                The dimensions of the nested structure.
            pad_value (`int`):
                The value to fill the structure with.
        rt   r   c                    s    g | ]}  d d qS )rt   N)r   r   dimsr   r?   r   r    r     s     z@Sam3Processor._create_empty_nested_structure.<locals>.<listcomp>)r   r   )r?   r   r   r   r   r    r     s   
z,Sam3Processor._create_empty_nested_structurec                 C   sb   t |tr!t|dkrdS |D ]}|durd| |   S qdS t |tjtjfr/t|jS dS )z
        Get the nesting level of a list structure, skipping None values.

        Args:
            input_list (`list`):
                The list to get the nesting level of.
        r   rt   N)	rY   r   r   _get_nesting_levelr   r   r   rZ   r   )r?   
input_listr   r   r   r    r     s   

z Sam3Processor._get_nesting_levelr   rQ   rR   rS   rT   c              
   C   s   |du rdS t |tjtjfrH|j|kr&td| d| d| d|j d	|durB|jd |krBtd| d| d|jd  d	| ||S t |t	rm| 
|}||krgtd| d
| d| d| d	| ||S dS )a  
                Validate a single input by ensuring proper nesting and raising an error if the input is not valid.

                Args:
                    data (`torch.Tensor`, `np.ndarray`, or `list`):
                        Input data to process.
                    expected_depth (`int`):
                        Expected nesting depth.
                    input_name (`str`):
                        Name of the input for error messages.
                    expected_format (`str`):
                        The expected format of the input.
                    expected_coord_size (`int`, *optional*):
                        Expected coordinate size (4 for boxes, None for labels).
        .
        NzInput z must be a tensor/array with z, dimensions. The expected nesting format is z. Got z dimensions.r   z as the last dimension, got .z must be a nested list with z( levels. The expected nesting format is z levels.)rY   r   rZ   r   r   ndimr]   r   r~   r   r   )r?   r   rQ   rR   rS   rT   r   r   r   r    r`     s*   


z$Sam3Processor._validate_single_inputc                 C   s   |r|| j k}|jddd}tt|D ]<}||jd k rP|t|k r'|| n|d }| j|| ||d}	|rL|| }
t|
|| |	|| ||< q|	||< qdS )a  
        Helper method to normalize coordinates in a tensor across multiple images.

        Args:
            tensor (`torch.Tensor`):
                Input tensor with coordinates.
            original_sizes (`list`):
                Original image sizes.
            is_bounding_box (`bool`, *optional*, defaults to `False`):
                Whether coordinates are bounding boxes.
            preserve_padding (`bool`, *optional*, defaults to `False`):
                Whether to preserve padding values (for boxes).
        r   T)r   keepdimr   )rW   N)	r9   allr   r   r   rz   r   where	expand_as)r?   rc   rI   rW   rX   mask
coord_maskimg_idxrw   normalized_coordsimg_maskr   r   r    re     s"   


z+Sam3Processor._normalize_tensor_coordinatesr   c                 C      | j |||S )a  
        Converts the output of [`Sam3Model`] into semantic segmentation maps.

        Args:
            outputs ([`Sam3ImageSegmentationOutput`]):
                Raw outputs of the model containing semantic_seg.
            target_sizes (`list[tuple]` of length `batch_size`, *optional*):
                List of tuples corresponding to the requested final size (height, width) of each prediction. If unset,
                predictions will not be resized.
            threshold (`float`, *optional*, defaults to 0.5):
                Threshold for binarizing the semantic segmentation masks.

        Returns:
            semantic_segmentation: `list[torch.Tensor]` of length `batch_size`, where each item is a semantic
            segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is
            specified). Each entry is a binary mask (0 or 1).
        )r=   "post_process_semantic_segmentation)r?   outputstarget_sizes	thresholdr   r   r    r     s   z0Sam3Processor.post_process_semantic_segmentation333333?c                 C   r   )a  
        Converts the raw output of [`Sam3Model`] into final bounding boxes in (top_left_x, top_left_y,
        bottom_right_x, bottom_right_y) format. This is a convenience wrapper around the image processor method.

        Args:
            outputs ([`Sam3ImageSegmentationOutput`]):
                Raw outputs of the model containing pred_boxes, pred_logits, and optionally presence_logits.
            threshold (`float`, *optional*, defaults to 0.3):
                Score threshold to keep object detection predictions.
            target_sizes (`list[tuple[int, int]]`, *optional*):
                List of tuples (`tuple[int, int]`) containing the target size `(height, width)` of each image in the
                batch. If unset, predictions will not be resized.

        Returns:
            `list[dict]`: A list of dictionaries, each dictionary containing the following keys:
                - **scores** (`torch.Tensor`): The confidence scores for each predicted box on the image.
                - **boxes** (`torch.Tensor`): Image bounding boxes in (top_left_x, top_left_y, bottom_right_x,
                  bottom_right_y) format.

        Example:

        ```python
        >>> from transformers import AutoModel, AutoProcessor
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> model = AutoModel.from_pretrained("facebook/sam3-base")
        >>> processor = AutoProcessor.from_pretrained("facebook/sam3-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> inputs = processor(images=image, text="cat", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> # Post-process to get bounding boxes
        >>> results = processor.post_process_object_detection(outputs, threshold=0.3, target_sizes=[image.size[::-1]])
        >>> boxes = results[0]["boxes"]
        >>> scores = results[0]["scores"]
        ```
        )r=   post_process_object_detection)r?   r   r   r   r   r   r    r   &  s   +z+Sam3Processor.post_process_object_detectionc                 C   s   | j ||||S )ay	  
        Converts the raw output of [`Sam3Model`] into instance segmentation predictions with bounding boxes and masks.
        This is a convenience wrapper around the image processor method.

        Args:
            outputs ([`Sam3ImageSegmentationOutput`]):
                Raw outputs of the model containing pred_boxes, pred_logits, pred_masks, and optionally
                presence_logits.
            threshold (`float`, *optional*, defaults to 0.3):
                Score threshold to keep instance predictions.
            mask_threshold (`float`, *optional*, defaults to 0.5):
                Threshold for binarizing the predicted masks.
            target_sizes (`list[tuple[int, int]]`, *optional*):
                List of tuples (`tuple[int, int]`) containing the target size `(height, width)` of each image in the
                batch. If unset, predictions will not be resized.

        Returns:
            `list[dict]`: A list of dictionaries, each dictionary containing the following keys:
                - **scores** (`torch.Tensor`): The confidence scores for each predicted instance on the image.
                - **boxes** (`torch.Tensor`): Image bounding boxes in (top_left_x, top_left_y, bottom_right_x,
                  bottom_right_y) format.
                - **masks** (`torch.Tensor`): Binary segmentation masks for each instance, shape (num_instances,
                  height, width).

        Example:

        ```python
        >>> from transformers import AutoModel, AutoProcessor
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> model = AutoModel.from_pretrained("facebook/sam3-base")
        >>> processor = AutoProcessor.from_pretrained("facebook/sam3-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> inputs = processor(images=image, text="cat", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> # Post-process to get instance segmentation
        >>> results = processor.post_process_instance_segmentation(
        ...     outputs, threshold=0.3, target_sizes=[image.size[::-1]]
        ... )
        >>> masks = results[0]["masks"]
        >>> boxes = results[0]["boxes"]
        >>> scores = results[0]["scores"]
        ```
        )r=   "post_process_instance_segmentation)r?   r   r   mask_thresholdr   r   r   r    r   S  s   9z0Sam3Processor.post_process_instance_segmentation)Nr7   )NNNNNNN)F)rr   rs   rK   rs   )r   )N)r   N)FF)Nr   )r   N)r   r   N) __name__
__module____qualname__r   r<   r   r   r	   r   r   ru   r   rZ   strr
   r   rq   rz   r~   r^   ra   rb   r   r   r   r   r`   re   r   r   r   __classcell__r   r   rB   r    r6   U   sx    

h
*

(B

2
#
0r6   )!__doc__copyr   r   r   image_utilsr   processing_utilsr   tokenization_utils_baser   r   r	   utilsr
   r   r   r   utils.import_utilsr   
get_loggerr   loggerr   r!   r"   r'   r)   r,   r2   r4   r6   __all__r   r   r   r    <module>   s4   
    
>