o
    i)                     @   s   d dl mZmZmZmZ ddlmZmZmZm	Z	m
Z
 ddlmZmZ e r3d dlmZ ddlmZmZ e rFd dlZd d	lmZ dd
lmZ e	eZeeddG dd deZdS )    )AnyOptionalUnionoverload   )add_end_docstringsis_torch_availableis_vision_availableloggingrequires_backends   )ChunkPipelinebuild_pipeline_init_args)Image)
load_imagevalid_imagesN)BaseModelOutput)2MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMEST)has_image_processorc                       sN  e Zd ZdZdZdZdZdZ fddZe	de
edf de
eee f d	ed
eeeef  fddZe	deeeef  d	ed
eeeeef   fddZ	dde
edeeeef  f dee
eee f  d	ed
e
eeeef  eeeeef   f f fddZdd ZdddZdd ZdddZddd
eeef fddZ  ZS )ZeroShotObjectDetectionPipelinea  
    Zero shot object detection pipeline using `OwlViTForObjectDetection`. This pipeline predicts bounding boxes of
    objects when you provide an image and a set of `candidate_labels`.

    Example:

    ```python
    >>> from transformers import pipeline

    >>> detector = pipeline(model="google/owlvit-base-patch32", task="zero-shot-object-detection")
    >>> detector(
    ...     "http://images.cocodataset.org/val2017/000000039769.jpg",
    ...     candidate_labels=["cat", "couch"],
    ... )
    [{'score': 0.287, 'label': 'cat', 'box': {'xmin': 324, 'ymin': 20, 'xmax': 640, 'ymax': 373}}, {'score': 0.254, 'label': 'cat', 'box': {'xmin': 1, 'ymin': 55, 'xmax': 315, 'ymax': 472}}, {'score': 0.121, 'label': 'couch', 'box': {'xmin': 4, 'ymin': 0, 'xmax': 642, 'ymax': 476}}]

    >>> detector(
    ...     "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
    ...     candidate_labels=["head", "bird"],
    ... )
    [{'score': 0.119, 'label': 'bird', 'box': {'xmin': 71, 'ymin': 170, 'xmax': 410, 'ymax': 508}}]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This object detection pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"zero-shot-object-detection"`.

    See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=zero-shot-object-detection).
    FTc                    sF   t  jdi | | jdkrtd| j dt| d | t d S )NtfzThe z is only available in PyTorch.vision )super__init__	framework
ValueError	__class__r   check_model_typer   )selfkwargsr   r   e/home/ubuntu/.local/lib/python3.10/site-packages/transformers/pipelines/zero_shot_object_detection.pyr   =   s
   

z(ZeroShotObjectDetectionPipeline.__init__imagezImage.Imagecandidate_labelsr    returnc                 K      d S Nr   )r   r#   r$   r    r   r   r"   __call__F   s   z(ZeroShotObjectDetectionPipeline.__call__c                 K   r&   r'   r   )r   r#   r    r   r   r"   r(   K   s   Nc                    s   d|v r	| d}t|ttjfr||d}n"t|ttfr6t|r6tt jdd t	||D fi |S 	 |}t j|fi |}|S )a|  
        Detect objects (bounding boxes & classes) in the image(s) passed as inputs.

        Args:
            image (`str`, `PIL.Image` or `list[dict[str, Any]]`):
                The pipeline handles three types of images:

                - A string containing an http url pointing to an image
                - A string containing a local path to an image
                - An image loaded in PIL directly

                You can use this parameter to send directly a list of images, or a dataset or a generator like so:

                ```python
                >>> from transformers import pipeline

                >>> detector = pipeline(model="google/owlvit-base-patch32", task="zero-shot-object-detection")
                >>> detector(
                ...     [
                ...         {
                ...             "image": "http://images.cocodataset.org/val2017/000000039769.jpg",
                ...             "candidate_labels": ["cat", "couch"],
                ...         },
                ...         {
                ...             "image": "http://images.cocodataset.org/val2017/000000039769.jpg",
                ...             "candidate_labels": ["cat", "couch"],
                ...         },
                ...     ]
                ... )
                [[{'score': 0.287, 'label': 'cat', 'box': {'xmin': 324, 'ymin': 20, 'xmax': 640, 'ymax': 373}}, {'score': 0.25, 'label': 'cat', 'box': {'xmin': 1, 'ymin': 55, 'xmax': 315, 'ymax': 472}}, {'score': 0.121, 'label': 'couch', 'box': {'xmin': 4, 'ymin': 0, 'xmax': 642, 'ymax': 476}}], [{'score': 0.287, 'label': 'cat', 'box': {'xmin': 324, 'ymin': 20, 'xmax': 640, 'ymax': 373}}, {'score': 0.254, 'label': 'cat', 'box': {'xmin': 1, 'ymin': 55, 'xmax': 315, 'ymax': 472}}, {'score': 0.121, 'label': 'couch', 'box': {'xmin': 4, 'ymin': 0, 'xmax': 642, 'ymax': 476}}]]
                ```


            candidate_labels (`str` or `list[str]` or `list[list[str]]`):
                What the model should recognize in the image.

            threshold (`float`, *optional*, defaults to 0.1):
                The probability necessary to make a prediction.

            top_k (`int`, *optional*, defaults to None):
                The number of top predictions that will be returned by the pipeline. If the provided number is `None`
                or higher than the number of predictions available, it will default to the number of predictions.

            timeout (`float`, *optional*, defaults to None):
                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
                the call may block forever.


        Return:
            A list of lists containing prediction results, one list per input image. Each list contains dictionaries
            with the following keys:

            - **label** (`str`) -- Text query corresponding to the found object.
            - **score** (`float`) -- Score corresponding to the object (between 0 and 1).
            - **box** (`dict[str,int]`) -- Bounding box of the detected object in image's original size. It is a
              dictionary with `x_min`, `x_max`, `y_min`, `y_max` keys.
        text_queriesr#   r$   c                 s   s    | ]
\}}||d V  qdS )r*   Nr   ).0imglabelsr   r   r"   	<genexpr>   s    z;ZeroShotObjectDetectionPipeline.__call__.<locals>.<genexpr>)
pop
isinstancestrr   listtupler   r   r(   zip)r   r#   r$   r    inputsresultsr!   r   r"   r(   N   s    ?
c                 K   sN   i }d|v r|d |d< i }d|v r|d |d< d|v r"|d |d< |i |fS )Ntimeout	thresholdtop_kr   )r   r    preprocess_paramspostprocess_paramsr   r   r"   _sanitize_parameters   s   
z4ZeroShotObjectDetectionPipeline._sanitize_parametersc           
      c   s    t |d |d}|d }t|tr|d}tj|j|jggtjd}t	|D ]0\}}| j
|| jd}| j|| jd}	| jdkrG|	| j}	|t|d k||d	||	V  q(d S )
Nr#   )r7   r$   ,)dtype)return_tensorsptr   )is_lasttarget_sizecandidate_label)r   r0   r1   splittorchtensorheightwidthint32	enumerate	tokenizerr   image_processortor>   len)
r   r5   r7   r#   r$   rB   irC   text_inputsimage_featuresr   r   r"   
preprocess   s(   


z*ZeroShotObjectDetectionPipeline.preprocessc                 C   sB   | d}| d}| d}| jdi |}|||d|}|S )NrB   rC   rA   )rB   rC   rA   r   )r/   model)r   model_inputsrB   rC   rA   outputsmodel_outputsr   r   r"   _forward   s   


z(ZeroShotObjectDetectionPipeline._forward皙?c                 C   s   g }|D ]>}|d }t |}| jj|||d dd }|d  D ] }|d |  }	| |d | d }
|	||
d}|| q!qt|dd	 d
d}|rT|d | }|S )NrC   rB   )rU   r8   target_sizesr   scoresboxes)scorelabelboxc                 S   s   | d S )Nr\   r   )xr   r   r"   <lambda>   s    z=ZeroShotObjectDetectionPipeline.postprocess.<locals>.<lambda>T)keyreverse)r   rL   post_process_object_detectionnonzeroitem_get_bounding_boxappendsorted)r   rV   r8   r9   r6   model_outputr]   rU   indexr\   r^   resultr   r   r"   postprocess   s&   
z+ZeroShotObjectDetectionPipeline.postprocessr^   ztorch.Tensorc                 C   s8   | j dkr	td|  \}}}}||||d}|S )a%  
        Turns list [xmin, xmax, ymin, ymax] into dict { "xmin": xmin, ... }

        Args:
            box (`torch.Tensor`): Tensor containing the coordinates in corners format.

        Returns:
            bbox (`dict[str, int]`): Dict containing the coordinates in corners format.
        r@   zAThe ZeroShotObjectDetectionPipeline is only available in PyTorch.)xminyminxmaxymax)r   r   inttolist)r   r^   rm   rn   ro   rp   bboxr   r   r"   rf      s   

z1ZeroShotObjectDetectionPipeline._get_bounding_boxr'   )rX   N)__name__
__module____qualname____doc___load_processor_load_image_processor_load_feature_extractor_load_tokenizerr   r   r   r1   r2   r   dictr(   r   r<   rR   rW   rl   rq   rf   __classcell__r   r   r!   r"   r      sD     	
4&X


"r   )typingr   r   r   r   utilsr   r   r	   r
   r   baser   r   PILr   image_utilsr   r   rE   transformers.modeling_outputsr   models.auto.modeling_autor   
get_loggerrt   loggerr   r   r   r   r"   <module>   s    
