"""PyTorch OWLv2 model."""

from dataclasses import dataclass
from typing import Any

import torch
from torch import Tensor, nn

from ... import initialization
from ...activations import ACT2FN
from ...masking_utils import create_causal_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...processing_utils import Unpack
from ...utils import (
    ModelOutput,
    TransformersKwargs,
    auto_docstring,
    can_return_tuple,
    is_vision_available,
    logging,
    torch_int,
)
from .configuration_owlv2 import Owlv2Config, Owlv2TextConfig, Owlv2VisionConfig


if is_vision_available():
    from transformers.image_transforms import center_to_corners_format


logger = logging.get_logger(__name__)


def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))


def owlv2_loss(similarity: torch.Tensor) -> torch.Tensor:
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(similarity.t())
    return (caption_loss + image_loss) / 2.0
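
# Illustrative sketch (not part of the original module): `owlv2_loss` expects a square
# text-to-image similarity matrix such as `logits_per_text` of shape [batch, batch], where the
# i-th text query matches the i-th image, and averages the caption and image cross-entropy terms:
#
#     similarity = torch.randn(4, 4)  # hypothetical logits for 4 matched text-image pairs
#     loss = owlv2_loss(similarity)   # scalar tensor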


@dataclass
@auto_docstring
class Owlv2Output(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size * num_max_text_queries, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`Owlv2TextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of
        [`Owlv2VisionModel`].
    text_model_output (tuple[`BaseModelOutputWithPooling`]):
        The output of the [`Owlv2TextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`Owlv2VisionModel`].
    Nlosslogits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputr   c                       t  fdd  D S )Nc                 3   .    | ]}|d vr | nt  | V  qdS )r2   r3   Ngetattrto_tuple.0kselfr$   r%   	<genexpr>_   
    
z'Owlv2Output.to_tuple.<locals>.<genexpr>tuplekeysr=   r$   r=   r%   r9   ^      zOwlv2Output.to_tuple)__name__
__module____qualname____doc__r-   r!   FloatTensor__annotations__r.   r/   r0   r1   r2   r   r3   rB   r   r9   r$   r$   r$   r%   r,   ?   s   
 r,   r(   c                 C   sD   |   r| jtjtjfv r| S |  S | jtjtjfv r| S |  S N)	is_floating_pointdtyper!   float32float64floatint32int64int)r(   r$   r$   r%   _upcastf   s   rT   boxesc                 C   sH   t | } | dddf | dddf  | dddf | dddf   S )a  
    Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates.

    Args:
        boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
            Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1
            < x2` and `0 <= y1 < y2`.

    Returns:
        `torch.FloatTensor`: a tensor containing the area for each box.
    N   r   r   r   )rT   )rU   r$   r$   r%   box_areao   s   @rW   c           
      C   s   t | }t |}t| d d d d df |d d d df }t| d d d dd f |d d dd f }|| jdd}|d d d d df |d d d d df  }|d d d f | | }|| }	|	|fS )NrV   r   minr   )rW   r!   maxrY   clamp)
boxes1boxes2area1area2left_topright_bottomwidth_heightinterunioniour$   r$   r%   box_iou   s   ..,rf   c                 C   s*  | ddddf | ddddf k  std|  |ddddf |ddddf k  s:td| t| |\}}t| dddddf |ddddf }t| dddddf |ddddf }|| jdd}|dddddf |dddddf  }||| |  S )z
    Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format.

    Returns:
        `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2)
    """
    # Degenerate boxes give inf / nan results, so do an early sanity check.
    if not (boxes1[:, 2:] >= boxes1[:, :2]).all():
        raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}")
    if not (boxes2[:, 2:] >= boxes2[:, :2]).all():
        raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}")
    iou, union = box_iou(boxes1, boxes2)

    top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2])
    bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])

    width_height = (bottom_right - top_left).clamp(min=0)  # [N, M, 2]
    area = width_height[:, :, 0] * width_height[:, :, 1]

    return iou - (area - union) / area
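
# Illustrative usage sketch (not part of the original module): the box utilities above take
# corner-format (x0, y0, x1, y1) boxes and return pairwise [N, M] matrices:
#
#     boxes_a = torch.tensor([[0.0, 0.0, 1.0, 1.0]])
#     boxes_b = torch.tensor([[0.5, 0.5, 1.5, 1.5]])
#     iou, union = box_iou(boxes_a, boxes_b)        # each of shape [1, 1]
#     giou = generalized_box_iou(boxes_a, boxes_b)  # shape [1, 1], values in [-1, 1]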


@dataclass
@auto_docstring(
    custom_intro="""
    Output type of [`Owlv2ForObjectDetection`].
    """
)
class Owlv2ObjectDetectionOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
        Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
        bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
        scale-invariant IoU loss.
    loss_dict (`Dict`, *optional*):
        A dictionary containing the individual losses. Useful for logging.
    logits (`torch.FloatTensor` of shape `(batch_size, num_patches, num_queries)`):
        Classification logits (including no-object) for all queries.
    objectness_logits (`torch.FloatTensor` of shape `(batch_size, num_patches, 1)`):
        The objectness logits of all image patches. OWL-ViT represents images as a set of image patches where the
        total number of patches is (image_size / patch_size)**2.
    pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
        Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
        values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
        possible padding). You can use [`~Owlv2ImageProcessor.post_process_object_detection`] to retrieve the
        unnormalized bounding boxes.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, num_max_text_queries, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`Owlv2TextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim`):
        Pooled output of [`Owlv2VisionModel`]. OWLv2 represents images as a set of image patches and computes image
        embeddings for each patch.
    class_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
        Class embeddings of all image patches. OWLv2 represents images as a set of image patches where the total
        number of patches is (image_size / patch_size)**2.
    text_model_output (tuple[`BaseModelOutputWithPooling`]):
        The output of the [`Owlv2TextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`Owlv2VisionModel`].
    Nr-   	loss_dictr   objectness_logits
pred_boxesr0   r1   class_embedsr2   r3   r   c                    r4   )Nc                 3   r5   r6   r7   r:   r=   r$   r%   r?      r@   z6Owlv2ObjectDetectionOutput.to_tuple.<locals>.<genexpr>rA   r=   r$   r=   r%   r9      rD   z#Owlv2ObjectDetectionOutput.to_tuple)rE   rF   rG   rH   r-   r!   rI   rJ   ro   dictr   rp   rq   r0   r1   rr   r2   r   r3   rB   r   r9   r$   r$   r$   r%   rn      s   
 rn   zL
    Output type of [`Owlv2ForObjectDetection.image_guided_detection`].
    c                   @   s   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	ejdB ed< dZ
ejdB ed< dZejdB ed< dZejdB ed< dZeed	< dZeed
< dee fddZdS )%Owlv2ImageGuidedObjectDetectionOutputa  
    logits (`torch.FloatTensor` of shape `(batch_size, num_patches, num_queries)`):
        Classification logits (including no-object) for all queries.
    image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim`):
        Pooled output of [`Owlv2VisionModel`]. OWLv2 represents images as a set of image patches and computes
        image embeddings for each patch.
    query_image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim`):
        Pooled output of [`Owlv2VisionModel`]. OWLv2 represents images as a set of image patches and computes
        image embeddings for each patch.
    target_pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
        Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
        values are normalized in [0, 1], relative to the size of each individual target image in the batch
        (disregarding possible padding). You can use [`~Owlv2ImageProcessor.post_process_object_detection`] to
        retrieve the unnormalized bounding boxes.
    query_pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
        Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
        values are normalized in [0, 1], relative to the size of each individual query image in the batch
        (disregarding possible padding). You can use [`~Owlv2ImageProcessor.post_process_object_detection`] to
        retrieve the unnormalized bounding boxes.
    class_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
        Class embeddings of all image patches. OWLv2 represents images as a set of image patches where the total
        number of patches is (image_size / patch_size)**2.
    text_model_output (tuple[`BaseModelOutputWithPooling`]):
        The output of the [`Owlv2TextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`Owlv2VisionModel`].
    """

    logits: torch.FloatTensor | None = None
    image_embeds: torch.FloatTensor | None = None
    query_image_embeds: torch.FloatTensor | None = None
    target_pred_boxes: torch.FloatTensor | None = None
    query_pred_boxes: torch.FloatTensor | None = None
    class_embeds: torch.FloatTensor | None = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


class Owlv2VisionEmbeddings(nn.Module):
    def __init__(self, config: Owlv2VisionConfig):
        super().__init__()
        self.patch_size = config.patch_size
        self.config = config
        self.embed_dim = config.hidden_size
        self.class_embedding = nn.Parameter(torch.randn(config.hidden_size))
        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=config.patch_size,
            stride=config.patch_size,
            bias=False,
        )

        self.num_patches = (config.image_size // config.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   Nr   g      ?r   rV   bicubicF)sizemodealign_cornersdim)shaper   weight	unsqueezer!   jit
        """
        num_patches = embeddings.shape[1] - 1
        position_embedding = self.position_embedding.weight.unsqueeze(0)
        num_positions = position_embedding.shape[1] - 1

        # Always interpolate when tracing so the exported model works for dynamic input shapes.
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embedding(self.position_ids)

        class_pos_embed = position_embedding[:, :1]
        patch_pos_embed = position_embedding[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        batch_size, _, height, width = pixel_values.shape
        patch_embeds = self.patch_embedding(pixel_values)  # [batch_size, embed_dim, grid_h, grid_w]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)

        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embedding(self.position_ids)

        return embeddings


class Owlv2TextEmbeddings(nn.Module):
    def __init__(self, config: Owlv2TextConfig):
        super().__init__()
        self.token_embedding = nn.Embedding(config.vocab_size, config.hidden_size)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, config.hidden_size)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        position_ids: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
    ) -> torch.Tensor:
        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        position_embeddings = self.position_embedding(position_ids)
        embeddings = inputs_embeds + position_embeddings

        return embeddings
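
# Illustrative shape walk-through (not part of the original module), assuming a hypothetical
# vision config with image_size=960, patch_size=16, hidden_size=768:
#
#     pixel_values: [batch, 3, 960, 960]
#     Owlv2VisionEmbeddings()(pixel_values)      -> [batch, (960 // 16) ** 2 + 1, 768]  # CLS + 3600 patch tokens
#     input_ids: [batch * num_queries, seq_len]
#     Owlv2TextEmbeddings()(input_ids=input_ids) -> [batch * num_queries, seq_len, hidden_size]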
ddejdejd	B de	d	B de
ejejd	B e
ej d	B f fddZ  ZS )Owlv2Attentionz=Multi-headed attention from 'Attention Is All You Need' paperc                    s   t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	|j
| _t| j| j| _t| j| j| _t| j| j| _t| j| j| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).      )r   r   ry   r   r   num_attention_heads	num_headshead_dimrh   scaleattention_dropoutdropoutr   Lineark_projv_projq_projout_projr   r   r$   r%   r     s"   

zOwlv2Attention.__init__tensorseq_lenbszc                 C   s    | ||| j| jdd S )Nr   rV   )r   r   r   r   
contiguous)r>   r   r   r   r$   r$   r%   _shape  s    zOwlv2Attention._shapeNFhidden_statesattention_maskoutput_attentionsr   c                 K   s,  |  \}}}| || j }| | |d|}	| | |d|}
|| j d| jf}| |||j| }|	j| }	|
j| }
|	 d}t	
||	dd}|  || j ||fkrmtd|| j ||f d|   |dur|  |d||fkrtd|d||f d|   ||| j||| }||| j ||}tjj|dd}|r||| j||}||| j ||}nd}tjj|| j| jd	}||
j}t	
||
}|  || j || jfkrtd
|| j|| jf d|   ||| j|| j}|dd}||||}| |}||fS )z#Input shape: Batch x Time x Channelr   r   rV   z$Attention weights should be of size z	, but is Nz!Attention mask should be of size r   )ptrainingz `attn_output` should be of size )r   r   r   r   r   r   r   r   r   r!   bmmr   rh   r   r   softmaxr   r   torM   r   r   )r>   r   r   r   kwargsr   tgt_lenr   query_states
key_statesvalue_states
proj_shapesrc_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputr$   r$   r%   r     sT   	



zOwlv2Attention.forward)NF)rE   rF   rG   rH   r   r!   r   rS   r   r   rB   r   r   r$   r$   r   r%   r     s    r   c                       s2   e Zd Z fddZdejdejfddZ  ZS )Owlv2MLPc                    sD   t    || _t|j | _t|j|j	| _
t|j	|j| _d S rK   )r   r   ry   r   
hidden_actactivation_fnr   r   r   intermediate_sizefc1fc2r   r   r$   r%   r     s
   
zOwlv2MLP.__init__r   r   c                 C   s"   |  |}| |}| |}|S rK   )r   r   r   )r>   r   r$   r$   r%   r     s   


zOwlv2MLP.forward)rE   rF   rG   r   r!   r   r   r   r$   r$   r   r%   r     s    r   c                       sV   e Zd Zdef fddZ	ddejdejdedB d	ee	 d
e
ej f
ddZ  ZS )Owlv2EncoderLayerry   c                    sR   t    |j| _t|| _tj| j|jd| _	t
|| _tj| j|jd| _d S Neps)r   r   r   r   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2r   r   r$   r%   r     s   


zOwlv2EncoderLayer.__init__Fr   r   r   Nr   r   c                 K   sj   |}|  |}| jd|||d|\}}|| }|}| |}| |}|| }|f}|r3||f7 }|S )aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r   r   r   Nr$   )r   r   r   r   )r>   r   r   r   r   residualr   outputsr$   r$   r%   r     s$   




zOwlv2EncoderLayer.forwardr   )rE   rF   rG   r   r   r!   r   r   r   r   rB   rI   r   r   r$   r$   r   r%   r     s    r   c                   @   s@   e Zd ZU eed< dZdZdZdgZe	
 dejfddZd	S )
Owlv2PreTrainedModelry   owlv2)imagetextTr   modulec                 C   s  | j j}t|tr3tj|jjd|d d tj|jjd|d d t	|j
t|j
jd d nt|trrtj|jd|jd | d tj|jj|j j| d tj|jj|j j| d t	|j
t|j
jd d nt|tr|jd d|j j d  | }|jd | }tj|jj|d tj|jj|d tj|jj|d tj|jj|d nqt|tr|j jd d|j j d  | }d|j j d | }tj|jj|d tj|jj|d n?t|trtj|jj|j d | d tj|j!j|j"d | d t#|j$| j j% nt|t&r#t	|j'|(|j)|j* t|t+j,r6t-|j. t/|j t|t+j0rTtj|jd|d |j.d	urVt-|j. d	S d	S d	S )
zInitialize the weights        g{Gz?)meanstdr   r   r   )r  rV   N)1ry   initializer_factor
isinstancer   initnormal_r   r   r   copy_r   r!   r"   r   r   rx   r   r   r   initializer_ranger   num_hidden_layersr   r   r   r   r   r   r   r   
Owlv2Modeltext_projectiontext_embed_dimvisual_projectionvision_embed_dim	constant_logit_scalelogit_scale_init_valueOwlv2ForObjectDetectionbox_biascompute_box_biasnum_patches_heightnum_patches_widthr   r   zeros_r~   ones_r   )r>   r  factorin_proj_stdout_proj_stdfc_stdr$   r$   r%   _init_weights1  sV   
$
$

 z"Owlv2PreTrainedModel._init_weightsN)rE   rF   rG   r   rJ   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modulesr!   no_gradr   Moduler  r$   r$   r$   r%   r   (  s   
 r   c                       sd   e Zd ZdZdef fddZ				ddejdB dedB dedB d	edB d
e	e
B f
ddZ  ZS )Owlv2Encoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Owlv2EncoderLayer`].

    Args:
        config: Owlv2Config
    ry   c                    s4   t    t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r$   )r   )r;   r   ry   r$   r%   
<listcomp>k  s    z)Owlv2Encoder.__init__.<locals>.<listcomp>F)r   r   r   
ModuleListranger  layersgradient_checkpointingr   r   r'  r%   r   i  s   
 
zOwlv2Encoder.__init__Nr   r   output_hidden_statesreturn_dictr   c                 K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|r"dnd}|r(dnd}|}	| jD ]!}
|r8||	f }|
|	|fd|i|}|d }	|rP||d f }q/|rX||	f }|sftdd |	||fD S t|	||dS )	a0  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`).
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr$   r   r   r   c                 s       | ]	}|d ur|V  qd S rK   r$   )r;   vr$   r$   r%   r?         z'Owlv2Encoder.forward.<locals>.<genexpr>)last_hidden_stater   
attentions)ry   r   r-  use_return_dictr+  rB   r   )r>   r   r   r   r-  r.  r   encoder_statesall_attentionsr   encoder_layerlayer_outputsr$   r$   r%   r   n  s:   


zOwlv2Encoder.forwardNNNN)rE   rF   rG   rH   r   r   r!   r   r   rB   r   r   r   r$   r$   r   r%   r&  `  s$    r&  c                       sv   e Zd Zdef fddZe					ddejdejdB dejdB dedB d	edB d
edB de	e
B fddZ  ZS )Owlv2TextTransformerry   c                    sD   t  | |j}t|| _t|| _tj||j	d| _
|   d S r   )r   r   r   r   r   r&  encoderr   r   r   final_layer_norm	post_init)r>   ry   r   r   r$   r%   r     s   

zOwlv2TextTransformer.__init__Nr   r   r   r   r-  r.  r   c              	   K   s&  |dur|n| j j}|dur|n| j j}|dur|n| j j}| }|d|d }| j||d}	t| j |	|tj	|	j
d |	jddd}|dd | jd|	||||dd	|}
|
d
 }| |}|tj	|j
d
 |jd|tjjdd|jf }|s||f|
dd  S t|||
j|
jdS )a|  
        input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)
        Nr   )r   r   r   r   )ry   r   r   cache_positionpast_key_values	is_causalT)r   r   r   r-  r.  r@  r   r   r2  pooler_outputr   r3  r$   )ry   r   r-  r4  r   r   r   r	   r!   r"   r   r   popr;  r<  r   rS   argmaxr   r   r3  )r>   r   r   r   r   r-  r.  r   input_shaper   encoder_outputsr2  pooled_outputr$   r$   r%   r     sP   

zOwlv2TextTransformer.forward)NNNNN)rE   rF   rG   r   r   r   r!   r   r   rB   r   r   r   r$   r$   r   r%   r:    s.    	r:  c                       s   e Zd ZU eed< dZdef fddZdejfddZ	dd	 Z
e	
	
	
	
ddejdejd
B ded
B ded
B ded
B deeB fddZ  ZS )Owlv2TextModelry   )r   c                    "   t  | t|| _|   d S rK   )r   r   r:  
text_modelr=  r   r   r$   r%   r         
zOwlv2TextModel.__init__r   c                 C   
   | j jjS rK   rJ  r   r   r=   r$   r$   r%   get_input_embeddings     
z#Owlv2TextModel.get_input_embeddingsc                 C   s   || j j_d S rK   rM  )r>   valuer$   r$   r%   set_input_embeddings	  s   z#Owlv2TextModel.set_input_embeddingsNr   r   r   r-  r.  c                 K      | j |||||dS )a  
        input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)

        Examples:
        ```python
        >>> from transformers import AutoProcessor, Owlv2TextModel

        >>> model = Owlv2TextModel.from_pretrained("google/owlv2-base-patch16")
        >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16")
        >>> inputs = processor(
        ...     text=[["a photo of a cat", "a photo of a dog"], ["photo of a astranaut"]], return_tensors="pt"
        ... )
        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```r   r   r   r-  r.  )rJ  )r>   r   r   r   r-  r.  r   r$   r$   r%   r     s   zOwlv2TextModel.forwardr9  )rE   rF   rG   r   rJ   r!  r   r   r%  rN  rQ  r   r!   r   r   rB   r   r   r   r$   r$   r   r%   rH    s0   
 rH  c                       sh   e Zd Zdef fddZe				ddejdedB dedB d	edB d
edB de	e
B fddZ  ZS )Owlv2VisionTransformerry   c                    sT   t  | t|| _tj|j|jd| _t	|| _
tj|j|jd| _|   d S r   )r   r   rx   r   r   r   r   r   pre_layernormr&  r;  post_layernormr=  r   r   r$   r%   r   6  s   

zOwlv2VisionTransformer.__init__NFr   r   r-  r   r.  r   c                 K   s   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}| jjjj}||}| j||d}| 	|}| j
d||||d|}	|	d }
|
d d dd d f }| |}|sb|
|f|	dd   S t|
||	j|	jdS )N)r   )r   r   r-  r.  r   r   rA  r$   )ry   r   r-  r4  r   r   r   rM   r   rU  r;  rV  r   r   r3  )r>   r   r   r-  r   r.  r   expected_input_dtyper   rF  r2  rG  r$   r$   r%   r   A  s6   



zOwlv2VisionTransformer.forward)NNFN)rE   rF   rG   r   r   r   r!   rI   r   rB   r   r   r   r$   r$   r   r%   rT  5  s(    rT  c                       s   e Zd ZU eed< dZdZdef fddZdej	fddZ
e							
		ddejd	B ded	B ded	B deded	B deeB fddZ  ZS )Owlv2VisionModelry   r   )r   c                    rI  rK   )r   r   rT  vision_modelr=  r   r   r$   r%   r   v  rK  zOwlv2VisionModel.__init__r   c                 C   rL  rK   )rY  r   r   r=   r$   r$   r%   rN  |  rO  z%Owlv2VisionModel.get_input_embeddingsNFr   r-  r   r.  c                 K   rR  )a#  
        Examples:
        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Owlv2VisionModel

        >>> model = Owlv2VisionModel.from_pretrained("google/owlv2-base-patch16")
        >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```r   r   r-  r   r.  )rY  )r>   r   r   r-  r   r.  r   r$   r$   r%   r     s   zOwlv2VisionModel.forwardNNNFN)rE   rF   rG   r   rJ   main_input_namer!  r   r   r%  rN  r   r!   rI   r   rB   r   r   r   r$   r$   r   r%   rX  q  s2   
 rX  c                       s  e Zd ZU eed< def fddZee	ddej	dej	dB de
e deeB fd	d
Zee	ddej	dede
e deeB fddZe									ddejdB dejdB dej	dB dedB dedB dedB dededB dedB deeB fddZ  ZS )r  ry   c                    s   t  | t|jtstdt|j dt|jts(tdt|j d|j}|j}|j	| _	|j
| _|j
| _t|| _t|| _tj| j| j	dd| _tj| j| j	dd| _tt|j| _|   d S )NzLconfig.text_config is expected to be of type Owlv2TextConfig but is of type .zPconfig.vision_config is expected to be of type Owlv2VisionConfig but is of type F)r~   )r   r   r  text_configr   	TypeErrortypevision_configr   projection_dimr   r  r  r:  rJ  rT  rY  r   r   r  r  r   r!   r   r  r  r=  )r>   ry   r^  ra  r   r$   r%   r     s0   

zOwlv2Model.__init__Nr   r   r   r   c                 K   s.   | j d||dd|}|j}| ||_|S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)

        Examples:
        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, Owlv2Model

        >>> model = Owlv2Model.from_pretrained("google/owlv2-base-patch16-ensemble")
        >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
        >>> inputs = processor(
        ...     text=[["a photo of a cat", "a photo of a dog"], ["photo of a astranaut"]], return_tensors="pt"
        ... )
        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```T)r   r   r.  Nr$   )rJ  rB  r  )r>   r   r   r   text_outputsrG  r$   r$   r%   get_text_features  s   zOwlv2Model.get_text_featuresFr   r   c                 K   s*   | j d||dd|}| |j|_|S )a  
        Examples:
        ```python
        >>> import torch
        >>> from transformers.image_utils import load_image
        >>> from transformers import AutoProcessor, Owlv2Model

        >>> model = Owlv2Model.from_pretrained("google/owlv2-base-patch16-ensemble")
        >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```Tr   r   r.  Nr$   )rY  r  rB  )r>   r   r   r   vision_outputsr$   r$   r%   get_image_features  s   zOwlv2Model.get_image_featuresreturn_lossr   r-  return_base_image_embedsr.  c
              	   K   s:  |dur|n| j j}|dur|n| j j}|	dur|	n| j j}	| j|||||	d}| j|||||	d}|d }| |}|d }| |}|tj	j
|dddd }|tj	j
|dddd }| j |j}t|| | }| }d}|r{t|}|}|	s||||||f}|dur|f| S |S t|||||||d	S )
a  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        return_base_image_embeds (`bool`, *optional*):
            Whether or not to return the base image embeddings.

        Examples:
        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Owlv2Model

        >>> model = Owlv2Model.from_pretrained("google/owlv2-base-patch16-ensemble")
        >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> inputs = processor(text=[["a photo of a cat", "a photo of a dog"]], images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```NrZ  rS  r   rV   r   T)ordr   keepdim)r-   r.   r/   r0   r1   r2   r3   )ry   r   r-  r4  rY  rJ  r  r  r!   linalgnormr  expr   r   matmulr(   r+   r,   )r>   r   r   r   rh  r   r-  r   ri  r.  r   rf  rc  r0   r1   text_embeds_normr  r/   r.   r-   outputr$   r$   r%   r     sV   &	

zOwlv2Model.forwardrK   r   )	NNNNNNFNN)rE   rF   rG   r   rJ   r   r   r   r!   r   r   r   rB   r   rd  r   rg  r   rI   r,   r   r   r$   r$   r   r%   r    st   
  %!	
r  c                       s>   e Zd Zd
dedef fddZdejdejfdd	Z	  Z
S )Owlv2BoxPredictionHead   ry   out_dimc                    sJ   t    |jj}t||| _t||| _t | _	t||| _
d S rK   )r   r   ra  r   r   r   dense0dense1GELUgeludense2)r>   ry   rt  r   r   r$   r%   r   x  s   

zOwlv2BoxPredictionHead.__init__image_featuresr   c                 C   s6   |  |}| |}| |}| |}| |}|S rK   )ru  rx  rv  ry  )r>   rz  rq  r$   r$   r%   r     s   




zOwlv2BoxPredictionHead.forward)rs  )rE   rF   rG   r   rS   r   r!   r   rI   r   r   r$   r$   r   r%   rr  w  s    	rr  c                	       sP   e Zd Zdef fddZdejdejdB dejdB deej fd	d
Z	  Z
S )Owlv2ClassPredictionHeadry   c                    sZ   t    |jj}|jj| _t| j|| _t| jd| _	t| jd| _
t | _d S )Nr   )r   r   r^  r   ra  	query_dimr   r   ru  logit_shiftr  ELUelu)r>   ry   rt  r   r$   r%   r     s   

z!Owlv2ClassPredictionHead.__init__r1   query_embedsN
query_maskr   c                 C   s
  |  |}|d u r%|j}|jd d \}}t||| jf|}||fS |tjj|dddd  }|tjj|dddd  }t	d||}| 
|}	| |}
| |
d }
||	 |
 }|d ur|jdkrmtj|dd	}t|d
kt|jj|}|tj}||fS )NrV   r   T)r   rk  gư>z...pd,...qd->...pqr   r   r   r   )ru  r   r   r!   zerosr|  r   rl  rm  einsumr}  r  r  ndimr   wherefinforM   rY   rN   )r>   r1   r  r  image_class_embedsr   r   r   pred_logitsr}  r  r$   r$   r%   r     s&   



z Owlv2ClassPredictionHead.forward)rE   rF   rG   r   r   r!   rI   r   rB   r   r   r$   r$   r   r%   r{    s    r{  c                       s  e Zd ZU eed< def fddZedededej	fddZ
d	ejdejfd
dZdededej	fddZ	d,dejdejdedejfddZ		d-dejdejdB dej	dB deej fddZ			d.dej	dejdej	dedB dedB dedeej fddZ			d.dejdedB dedB dedeej f
d d!Z	d,d"ejd#ejdedejfd$d%Ze					d/dejd&ejdB dedB dedB ded'edB defd(d)Ze					d/dej	dejdej	dB dedB dedB ded'edB defd*d+Z  ZS )0r  ry   c                    s   t  | t|| _t|| _t|| _t|dd| _t	j
|jj|jjd| _t	 | _|| _| jjj| jjj | _| jjj| jjj | _| jd| | j| jdd |   d S )Nr   )rt  r   r  Fr   )r   r   r  r   r{  
class_headrr  box_headobjectness_headr   r   ra  r   r   
layer_normSigmoidsigmoidry   r   r   r  r  r   r  r=  r   r   r$   r%   r     s   



z Owlv2ForObjectDetection.__init__r  r  r   c                 C   s   t jd|d t jd}t jd| d t jd}t j||dd\}}t j||fdd}|d  |  < |d  |   < |dd	}|S )
Nr   )rM   xy)indexingr   r   .r   .r   rV   )r!   r"   rN   meshgridstackr   )r  r  x_coordinatesy_coordinatesxxyybox_coordinatesr$   r$   r%   !normalize_grid_corner_coordinates  s   z9Owlv2ForObjectDetection.normalize_grid_corner_coordinatesrz  c                 C   s   |  }| |}|d }|S )a#  Predicts the probability that each image feature token is an object.

        Args:
            image_features (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_dim)`)):
                Features extracted from the image.
        Returns:
            Objectness scores.
        r  )detachr  )r>   rz  rp   r$   r$   r%   objectness_predictor  s   	
z,Owlv2ForObjectDetection.objectness_predictorc                 C   s   |  ||}t|dd}t|d t| d  }t|d}|d  |  < |d  |  < t|d t| d  }tj||gdd}|S )Nr  g      ?g-C6?r  r  r   r   )r  r!   cliploglog1p	full_liker   )r>   r  r  r  box_coord_biasbox_sizebox_size_biasr  r$   r$   r%   r    s   z(Owlv2ForObjectDetection.compute_box_biasFimage_featsfeature_mapr   c           	      C   sR   |  |}|r|j\}}}}| ||}n| j}||j}||7 }| |}|S )a  
        Args:
            image_feats:
                Features extracted from the image, returned by the `image_text_embedder` method.
            feature_map:
                A spatial re-arrangement of image_features, also returned by the `image_text_embedder` method.
            interpolate_pos_encoding:
                Whether to interpolate the pre-trained position encodings.
        Returns:
            pred_boxes:
                List of predicted boxes (cxcywh normalized to 0, 1) nested within a dictionary.
        )r  r   r  r  r   r   r  )	r>   r  r  r   rq   r   r  r  r  r$   r$   r%   box_predictor  s   

z%Owlv2ForObjectDetection.box_predictorNr  r  c                 C   s   |  |||\}}||fS )a8  
        Args:
            image_feats:
                Features extracted from the `image_text_embedder`.
            query_embeds:
                Text query embeddings.
            query_mask:
                Must be provided with query_embeddings. A mask indicating which query embeddings are valid.
        )r  )r>   r  r  r  r  r  r$   r$   r%   class_predictor(  s   z'Owlv2ForObjectDetection.class_predictorr   r   r   r   r-  c              	   C   s   | j ||||||dd}|r$|j\}}}	}
|	| jjj }|
| jjj }n| j}| j}|jd }| j j	|}t
|d d d dd d f |d d d df j}|d d dd d d f | }| |}|jd |||jd f}||}|d }|||fS )NT)r   r   r   r   r-  r   r.  r   r   r   )r   r   ry   ra  r   r  r  r3   rY  rV  r!   broadcast_tor  r   )r>   r   r   r   r   r-  r   r   r   r   r   r  r  r2  r1   class_token_outnew_sizer0   r$   r$   r%   image_text_embedder<  s8   


4


z+Owlv2ForObjectDetection.image_text_embedderc                 C   s   | j j||dd}|r!|j\}}}}|| jjj }	|| jjj }
n| j}	| j}
|d }| j j|}t	
|d d d dd d f |d d d df j}|d d dd d d f | }| |}|jd |	|
|jd f}||}||fS )NTre  r   r   r   )r   rY  r   ry   ra  r   r  r  rV  r!   r  r  r   )r>   r   r   r-  r   rf  r   r   r   r  r  r2  r1   r  r  r$   r$   r%   image_embedderp  s*   4

z&Owlv2ForObjectDetection.image_embedderquery_image_featuresquery_feature_mapc                 C   s:  |  |\}}| |||}t|}g }g }	|j}
t|jd D ]f}tjg dg|
d}|| }t||\}}t	|d dkrEt
||}t|d }|d |k }| r|| |d }tj|| dd}td||}|t| }||| |  |	| q |rt|}t|	}nd	\}}|||fS )
Nr   )r   r   r   r   r   r  g?r   )axiszd,id->iNN)r  r  r   r   r*  r   r!   r   rf   rg   rl   rZ   nonzeronumelsqueezer  r  argminappendr  )r>   r  r  r   r   rr   rq   pred_boxes_as_cornersbest_class_embedsbest_box_indicespred_boxes_deviceieach_query_boxeach_query_pred_boxesiousiou_thresholdselected_indsselected_embeddingsmean_embedsmean_simbest_box_indr  box_indicesr$   r$   r%   embed_image_query  s6   



z)Owlv2ForObjectDetection.embed_image_queryquery_pixel_valuesr.  c              
   C   s(  |dur|n| j j}|dur|n| j j}|dur|n| j j}| j||dd }| j||||d\}}	|j\}
}}}t||
|| |f}|j\}
}}}t||
|| |f}| |||\}}}| j	||d\}}| 
|||}|s|||||||	 f}tdd |D }|S t||||||d|	dS )	a  
        query_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values of query image(s) to be detected. Pass in one query image per target image.

        Examples:
        ```python
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image
        >>> import torch
        >>> from transformers import AutoProcessor, Owlv2ForObjectDetection

        >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
        >>> model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> query_url = "http://images.cocodataset.org/val2017/000000001675.jpg"
        >>> with httpx.stream("GET", query_url) as response:
        ...     query_image = Image.open(BytesIO(response.read()))
        >>> inputs = processor(images=image, query_images=query_image, return_tensors="pt")

        >>> # forward pass
        >>> with torch.no_grad():
        ...     outputs = model.image_guided_detection(**inputs)

        >>> target_sizes = torch.Tensor([image.size[::-1]])

        >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
        >>> results = processor.post_process_image_guided_detection(
        ...     outputs=outputs, threshold=0.9, nms_threshold=0.3, target_sizes=target_sizes
        ... )
        >>> i = 0  # Retrieve predictions for the first image
        >>> boxes, scores = results[i]["boxes"], results[i]["scores"]
        >>> for box, score in zip(boxes, scores):
        ...     box = [round(i, 2) for i in box.tolist()]
        ...     print(f"Detected similar object with confidence {round(score.item(), 3)} at location {box}")
        Detected similar object with confidence 0.938 at location [327.31, 54.94, 547.39, 268.06]
        Detected similar object with confidence 0.959 at location [5.78, 360.65, 619.12, 366.39]
        Detected similar object with confidence 0.902 at location [2.85, 360.01, 627.63, 380.8]
        Detected similar object with confidence 0.985 at location [176.98, -29.45, 672.69, 182.83]
        Detected similar object with confidence 1.0 at location [6.53, 14.35, 624.87, 470.82]
        Detected similar object with confidence 0.998 at location [579.98, 29.14, 615.49, 489.05]
        Detected similar object with confidence 0.985 at location [206.15, 10.53, 247.74, 466.01]
        Detected similar object with confidence 0.947 at location [18.62, 429.72, 646.5, 457.72]
        Detected similar object with confidence 0.996 at location [523.88, 20.69, 586.84, 483.18]
        Detected similar object with confidence 0.998 at location [3.39, 360.59, 617.29, 499.21]
        Detected similar object with confidence 0.969 at location [4.47, 449.05, 614.5, 474.76]
        Detected similar object with confidence 0.966 at location [31.44, 463.65, 654.66, 471.07]
        Detected similar object with confidence 0.924 at location [30.93, 468.07, 635.35, 475.39]
        ```N)r   r   r   )r   r   r-  r   )r  r  c                 s   r/  rK   r$   r;   xr$   r$   r%   r?   2  r1  zAOwlv2ForObjectDetection.image_guided_detection.<locals>.<genexpr>)r1   ru   rv   rw   r   rr   r2   r3   )ry   r   r-  r.  r  r   r!   r   r  r  r  r9   rB   rt   )r>   r   r  r   r-  r   r.  r  r  rf  r   r  r  
hidden_dimr  query_image_featsr  r  rw   r  rr   rv   rq  r$   r$   r%   image_guided_detection  s^   >

	z.Owlv2ForObjectDetection.image_guided_detectionc              
   K   sB  |dur|n| j j}|dur|n| j j}|dur|n| j j}| j||||||d\}	}
}|j}|j}|
j\}}}}t	|
||| |f}|jd | }|		|||	jd }	|	|||jd }|d dk}| 
||	|\}}| |}| ||
|}|s||||	|
|| | f}tdd |D }|S t|
|	||||||dS )	a	  
        input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`, *optional*):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids).
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the last hidden state. See `text_model_last_hidden_state` and
            `vision_model_last_hidden_state` under returned tensors for more detail.

        Examples:
        ```python
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image
        >>> import torch

        >>> from transformers import Owlv2Processor, Owlv2ForObjectDetection

        >>> processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
        >>> model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> text_labels = [["a photo of a cat", "a photo of a dog"]]
        >>> inputs = processor(text=text_labels, images=image, return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
        >>> target_sizes = torch.tensor([(image.height, image.width)])
        >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
        >>> results = processor.post_process_grounded_object_detection(
        ...     outputs=outputs, target_sizes=target_sizes, threshold=0.1, text_labels=text_labels
        ... )
        >>> # Retrieve predictions for the first image for the corresponding text queries
        >>> result = results[0]
        >>> boxes, scores, text_labels = result["boxes"], result["scores"], result["text_labels"]
        >>> for box, score, text_label in zip(boxes, scores, text_labels):
        ...     box = [round(i, 2) for i in box.tolist()]
        ...     print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}")
        Detected a photo of a cat with confidence 0.614 at location [341.67, 23.39, 642.32, 371.35]
        Detected a photo of a cat with confidence 0.665 at location [6.75, 51.96, 326.62, 473.13]
        ```N)r   r   r   r   r-  r   r   r   r  c                 s   r/  rK   r$   r  r$   r$   r%   r?     r1  z2Owlv2ForObjectDetection.forward.<locals>.<genexpr>)r1   r0   rq   r   rp   rr   r2   r3   )ry   r   r-  r.  r  r2   r3   r   r!   r   r  r  r  r9   rB   rn   )r>   r   r   r   r   r-  r   r.  r   r  r  r   rc  rf  r   r  r  r  r  max_text_queriesr  r  rr   rp   rq   rq  r$   r$   r%   r   @  sZ   7


zOwlv2ForObjectDetection.forwardr   r  )NNFr[  )rE   rF   rG   r   rJ   r   staticmethodrS   r!   r   r  rI   r  r  r   r  rB   r  r  r  r  r   rt   r  rn   r   r   r$   r$   r   r%   r    s   
 
%

7
/
,x
r  )r  r   rH  rX  r  )CrH   dataclassesr   typingr   r!   r   r    r   r  activationsr   masking_utilsr	   modeling_layersr
   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   r   r   r   configuration_owlv2r   r   r   transformers.image_transformsr   
get_loggerrE   loggerr&   r+   r,   rT   rW   rf   rl   rn   rt   r%  rx   r   r   r   r   r   r&  r:  rH  rT  rX  r  rr  r{  r  __all__r$   r$   r$   r%   <module>   sv   $	
%	2.N b26LP9<5 P0    