o
    eij                     @   s`  d Z ddlZddlmZ ddlmZ ddlZddlmZ ddlm	Z	 ddl
mZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZmZmZmZ ddlmZmZ ddlmZ ddlmZ e e!Z"eeddG dd deZ#G dd dej$Z%G dd dej$Z&G dd dej$Z'G dd dej$Z(		dBdej$dej)d ej)d!ej)d"ej)dB d#e*dB d$e*d%ee fd&d'Z+G d(d) d)ej$Z,G d*d+ d+ej$Z-G d,d- d-ej$Z.G d.d/ d/ej$Z/G d0d1 d1ej$Z0G d2d3 d3eZ1G d4d5 d5ej$Z2eG d6d7 d7eZ3eG d8d9 d9e3Z4G d:d; d;ej$Z5G d<d= d=ej$Z6ed>dG d?d@ d@e3Z7g dAZ8dS )CzPyTorch YOLOS model.    N)Callable)	dataclass)nn   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringlogging)can_return_tuplemerge_with_config_defaults)capture_outputs   )YolosConfigz5
    Output type of [`YolosForObjectDetection`].
    )custom_introc                   @   s   e Zd ZU dZdZejdB ed< dZe	dB ed< dZ
ejdB ed< dZejdB ed< dZee	 dB ed< dZejdB ed< dZeej dB ed	< dZeej dB ed
< dS )YolosObjectDetectionOutputa0  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
        Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
        bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
        scale-invariant IoU loss.
    loss_dict (`Dict`, *optional*):
        A dictionary containing the individual losses. Useful for logging.
    logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
        Classification logits (including no-object) for all queries.
    pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
        Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
        values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
        possible padding). You can use [`~YolosImageProcessor.post_process`] to retrieve the unnormalized bounding
        boxes.
    auxiliary_outputs (`list[Dict]`, *optional*):
        Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
        and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
        `pred_boxes`) for each decoder layer.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the decoder of the model.
    Nloss	loss_dictlogits
pred_boxesauxiliary_outputslast_hidden_statehidden_states
attentions)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   dictr   r   r   listr   r   tupler    r*   r*   f/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/yolos/modeling_yolos.pyr   %   s   
 r   c                       s@   e Zd ZdZdeddf fddZdejdejfdd	Z  Z	S )
YolosEmbeddingszT
    Construct the CLS token, detection tokens, position and patch embeddings.

    configreturnNc                    s   t    ttdd|j| _ttd|j|j| _	t
|| _| jj}ttd||j d |j| _t|j| _t|| _|| _d S Nr   )super__init__r   	Parameterr$   zeroshidden_size	cls_tokennum_detection_tokensdetection_tokensYolosPatchEmbeddingspatch_embeddingsnum_patchesposition_embeddingsDropouthidden_dropout_probdropout$InterpolateInitialPositionEmbeddingsinterpolationr-   )selfr-   r:   	__class__r*   r+   r1   R   s   



zYolosEmbeddings.__init__pixel_valuesc                 C   s   |j \}}}}| |}| \}}}| j|dd}	| j|dd}
tj|	||
fdd}| | j	||f}|| }| 
|}|S )Nr   dim)shaper9   sizer5   expandr7   r$   catr@   r;   r>   )rA   rD   
batch_sizenum_channelsheightwidth
embeddingsseq_len_
cls_tokensr7   r;   r*   r*   r+   forwarda   s   

zYolosEmbeddings.forward
r    r!   r"   r#   r   r1   r$   TensorrT   __classcell__r*   r*   rB   r+   r,   L   s    r,   c                       0   e Zd Zd fddZd	dejfddZ  ZS )
r?   r.   Nc                       t    || _d S Nr0   r1   r-   rA   r-   rB   r*   r+   r1   w      

z-InterpolateInitialPositionEmbeddings.__init__i   i@  c                 C   s  |d d dd d f }|d d d f }|d d | j j d d d f }|d d d| j j d d f }|dd}|j\}}}| j jd | j j | j jd | j j }	}
||||	|
}|\}}|| j j || j j }}tjj	|||fddd}|
ddd}tj|||fdd}|S )Nr   r      bicubicFrI   modealign_cornersrF   )r-   r6   	transposerH   
image_size
patch_sizeviewr   
functionalinterpolateflattenr$   rK   )rA   	pos_embedimg_sizecls_pos_embeddet_pos_embedpatch_pos_embedrL   r4   rQ   patch_heightpatch_widthrN   rO   new_patch_heightnew_patch_widthscale_pos_embedr*   r*   r+   rT   {   s$     z,InterpolateInitialPositionEmbeddings.forwardr.   Nr^   r    r!   r"   r1   r$   rV   rT   rW   r*   r*   rB   r+   r?   v       r?   c                       rX   )
 InterpolateMidPositionEmbeddingsr.   Nc                    rY   rZ   r[   r\   rB   r*   r+   r1      r]   z)InterpolateMidPositionEmbeddings.__init__r^   c                 C   sH  |d d d d dd d f }|d d d f }|d d d d | j j d d d f }|d d d d d| j j d d f }|dd}|j\}}}}	| j jd | j j | j jd | j j }
}||| ||
|}|\}}|| j j || j j }}tjj	|||fddd}|
ddd |||| |}tj|||fdd}|S )	Nr   r   r_   r   r`   Fra   rF   )r-   r6   rd   rH   re   rf   rg   r   rh   ri   rj   
contiguousr$   rK   )rA   rk   rl   rm   rn   ro   depthrL   r4   rQ   rp   rq   rN   rO   rr   rs   rt   r*   r*   r+   rT      s,   &&z(InterpolateMidPositionEmbeddings.forwardru   rv   rw   r*   r*   rB   r+   ry      rx   ry   c                       s6   e Zd ZdZ fddZdejdejfddZ  ZS )r8   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    s   t    |j|j}}|j|j}}t|tjj	r|n||f}t|tjj	r)|n||f}|d |d  |d |d   }|| _|| _|| _|| _
tj||||d| _d S )Nr   r   )kernel_sizestride)r0   r1   re   rf   rM   r4   
isinstancecollectionsabcIterabler:   r   Conv2d
projection)rA   r-   re   rf   rM   r4   r:   rB   r*   r+   r1      s   
 zYolosPatchEmbeddings.__init__rD   r.   c                 C   s<   |j \}}}}|| jkrtd| |ddd}|S )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r_   r   )rH   rM   
ValueErrorr   rj   rd   )rA   rD   rL   rM   rN   rO   rP   r*   r*   r+   rT      s   
zYolosPatchEmbeddings.forward)	r    r!   r"   r#   r1   r$   rV   rT   rW   r*   r*   rB   r+   r8      s    r8           modulequerykeyvalueattention_maskscalingr>   kwargsc           
      K   s   |d u r| dd }t||dd| }|d ur|| }tjj|dd}tjj||| jd}t||}	|	dd	 }	|	|fS )NrE         r_   r   rF   )ptrainingr   )
rI   r$   matmulrd   r   rh   softmaxr>   r   rz   )
r   r   r   r   r   r   r>   r   attn_weightsattn_outputr*   r*   r+   eager_attention_forward   s   
r   c                       sB   e Zd Zdef fddZdejdeejejf fddZ  Z	S )YolosSelfAttentionr-   c                    s   t    |j|j dkrt|dstd|j d|j d|| _|j| _t|j|j | _| j| j | _	|j
| _| jd | _d| _tj|j| j	|jd| _tj|j| j	|jd| _tj|j| j	|jd| _d S )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .r   F)bias)r0   r1   r4   num_attention_headshasattrr   r-   intattention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probr   	is_causalr   Linearqkv_biasr   r   r   r\   rB   r*   r+   r1      s"   

zYolosSelfAttention.__init__r   r.   c              
   C   s   |j d }|d| j| jf}| |j| dd}| |j| dd}| |j| dd}t	| j
jt}|| |||d | j| j| jsHdn| jd\}}	| d d | jf }
||
}||	fS )Nr   rE   r   r_   r   )r   r   r>   )rH   r   r   r   rg   rd   r   r   r
   get_interfacer-   _attn_implementationr   r   r   r   r   rI   r   reshape)rA   r   rL   	new_shape	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shaper*   r*   r+   rT     s*   


zYolosSelfAttention.forward)
r    r!   r"   r   r1   r$   rV   r)   rT   rW   r*   r*   rB   r+   r      s    (r   c                       sB   e Zd ZdZdef fddZdejdejdejfdd	Z  Z	S )
YolosSelfOutputz
    The residual connection is defined in YolosLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    r-   c                    s.   t    t|j|j| _t|j| _d S rZ   )	r0   r1   r   r   r4   denser<   r=   r>   r\   rB   r*   r+   r1   ,     
zYolosSelfOutput.__init__r   input_tensorr.   c                 C      |  |}| |}|S rZ   r   r>   rA   r   r   r*   r*   r+   rT   1     

zYolosSelfOutput.forwardrU   r*   r*   rB   r+   r   &  s    $r   c                       8   e Zd Zdef fddZdejdejfddZ  ZS )YolosAttentionr-   c                    s"   t    t|| _t|| _d S rZ   )r0   r1   r   	attentionr   outputr\   rB   r*   r+   r1   9  s   

zYolosAttention.__init__r   r.   c                 C   s   |  |\}}| ||}|S rZ   )r   r   )rA   r   self_attn_outputrR   r   r*   r*   r+   rT   >  s   zYolosAttention.forward	r    r!   r"   r   r1   r$   rV   rT   rW   r*   r*   rB   r+   r   8      r   c                       r   )YolosIntermediater-   c                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S rZ   )r0   r1   r   r   r4   intermediate_sizer   r~   
hidden_actstrr   intermediate_act_fnr\   rB   r*   r+   r1   F  s
   
zYolosIntermediate.__init__r   r.   c                 C   r   rZ   )r   r   )rA   r   r*   r*   r+   rT   N  r   zYolosIntermediate.forwardr   r*   r*   rB   r+   r   E  s    r   c                       s>   e Zd Zdef fddZdejdejdejfddZ  ZS )	YolosOutputr-   c                    s.   t    t|j|j| _t|j| _	d S rZ   )
r0   r1   r   r   r   r4   r   r<   r=   r>   r\   rB   r*   r+   r1   V  r   zYolosOutput.__init__r   r   r.   c                 C   s    |  |}| |}|| }|S rZ   r   r   r*   r*   r+   rT   [  s   

zYolosOutput.forwardr   r*   r*   rB   r+   r   U  s    $r   c                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	
YolosLayerz?This corresponds to the Block class in the timm implementation.r-   c                    sb   t    |j| _d| _t|| _t|| _t|| _	t
j|j|jd| _t
j|j|jd| _d S )Nr   eps)r0   r1   chunk_size_feed_forwardseq_len_dimr   r   r   intermediater   r   r   	LayerNormr4   layer_norm_epslayernorm_beforelayernorm_afterr\   rB   r*   r+   r1   f  s   



zYolosLayer.__init__r   r.   c                 C   s@   |  |}| |}|| }| |}| |}| ||}|S rZ   )r   r   r   r   r   )rA   r   hidden_states_normattention_outputlayer_outputr*   r*   r+   rT   p  s   



zYolosLayer.forwardrU   r*   r*   rB   r+   r   c  s    
r   c                       sB   e Zd Zdeddf fddZdejdededefd	d
Z	  Z
S )YolosEncoderr-   r.   Nc                    s   t     | _t fddt jD | _d| _d j	d  j	d   j
d    j } jrAtt jd d| jnd | _ jrNt | _d S d | _d S )Nc                    s   g | ]}t  qS r*   )r   ).0rR   r-   r*   r+   
<listcomp>  s    z)YolosEncoder.__init__.<locals>.<listcomp>Fr   r   r_   )r0   r1   r-   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingre   rf   r6   use_mid_position_embeddingsr2   r$   r3   r4   mid_position_embeddingsry   r@   )rA   r-   
seq_lengthrB   r   r+   r1     s$   
 &	zYolosEncoder.__init__r   rN   rO   c                 C   sd   | j jr| | j||f}t| jD ]\}}||}| j jr,|| j jd k r,|||  }qt|dS )Nr   )r   )r-   r   r@   r   	enumerater   r   r   )rA   r   rN   rO   $interpolated_mid_position_embeddingsilayer_moduler*   r*   r+   rT     s   
zYolosEncoder.forward)r    r!   r"   r   r1   r$   rV   r   r   rT   rW   r*   r*   rB   r+   r     s    r   c                   @   sD   e Zd ZU eed< dZdZdZdZg Z	dZ
dZdZdZeedZdS )YolosPreTrainedModelr-   vitrD   )imageT)r   r   N)r    r!   r"   r   r&   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsr*   r*   r*   r+   r     s   
 
r   c                
       sl   e Zd Zddedef fddZdefddZee	d	d
e
	ddejdB dee defddZ  ZS )
YolosModelTr-   add_pooling_layerc                    sX   t  | || _t|| _t|| _tj|j	|j
d| _|r#t|nd| _|   dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        r   N)r0   r1   r-   r,   rP   r   encoderr   r   r4   r   	layernormYolosPoolerpooler	post_init)rA   r-   r   rB   r*   r+   r1     s   

zYolosModel.__init__r.   c                 C   s   | j jS rZ   )rP   r9   )rA   r*   r*   r+   get_input_embeddings  s   zYolosModel.get_input_embeddingsF)tie_last_hidden_statesNrD   r   c           	      K   sp   |d u rt d| |}|jdd  \}}| j|||d}|j}| |}| jd ur0| |nd }t||dS )Nz You have to specify pixel_valuesr   )rN   rO   )r   pooler_output)r   rP   rH   r   r   r   r   r	   )	rA   rD   r   embedding_outputrN   rO   encoder_outputssequence_outputpooled_outputr*   r*   r+   rT     s   

zYolosModel.forward)TrZ   )r    r!   r"   r   boolr1   r8   r   r   r   r   r$   rV   r   r   r	   rT   rW   r*   r*   rB   r+   r     s    r   c                       r   )r   r-   c                    s*   t    t|j|j| _t | _d S rZ   )r0   r1   r   r   r4   r   Tanh
activationr\   rB   r*   r+   r1     s   
zYolosPooler.__init__r   r.   c                 C   s(   |d d df }|  |}| |}|S )Nr   )r   r   )rA   r   first_token_tensorr   r*   r*   r+   rT     s   

zYolosPooler.forwardr   r*   r*   rB   r+   r     r   r   c                       s(   e Zd ZdZ fddZdd Z  ZS )YolosMLPPredictionHeadz
    Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
    height and width of a bounding box w.r.t. an image.

    c                    sJ   t    || _|g|d  }tdd t|g| ||g D | _d S )Nr   c                 s   s     | ]\}}t ||V  qd S rZ   )r   r   )r   nkr*   r*   r+   	<genexpr>  s    z2YolosMLPPredictionHead.__init__.<locals>.<genexpr>)r0   r1   
num_layersr   r   ziplayers)rA   	input_dim
hidden_dim
output_dimr  hrB   r*   r+   r1     s   
,zYolosMLPPredictionHead.__init__c                 C   s>   t | jD ]\}}|| jd k rtj||n||}q|S r/   )r   r  r  r   rh   relu)rA   xr   r   r*   r*   r+   rT     s   (zYolosMLPPredictionHead.forward)r    r!   r"   r#   r1   rT   rW   r*   r*   rB   r+   r     s    r   zy
    YOLOS Model (consisting of a ViT encoder) with object detection heads on top, for tasks such as COCO detection.
    c                       s^   e Zd Zdef fddZdd Zee	ddej	de
e dB d	ee d
efddZ  ZS )YolosForObjectDetectionr-   c                    sX   t  | t|dd| _t|j|j|jd dd| _t|j|jddd| _| 	  d S )NF)r   r   r   )r  r  r	  r     )
r0   r1   r   r   r   r4   
num_labelsclass_labels_classifierbbox_predictorr   r\   rB   r*   r+   r1     s   z YolosForObjectDetection.__init__c                 C   s$   dd t |d d |d d D S )Nc                 S   s   g | ]	\}}||d qS ))r   r   r*   )r   abr*   r*   r+   r   (  s    z9YolosForObjectDetection._set_aux_loss.<locals>.<listcomp>rE   )r  )rA   outputs_classoutputs_coordr*   r*   r+   _set_aux_loss'  s   $z%YolosForObjectDetection._set_aux_lossNrD   labelsr   r.   c              
   K   s   | j |fi |}|j}|dd| jj dddf }| |}| | }d\}}	}
|durXd\}}| jjrH|j}| |}| | }| 	||| j
|| j||\}}	}
t||	|||
|j|j|jdS )a`	  
        labels (`list[Dict]` of len `(batch_size,)`, *optional*):
            Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
            following 2 keys: `'class_labels'` and `'boxes'` (the class labels and bounding boxes of an image in the
            batch respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding
            boxes in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image,
            4)`.

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoModelForObjectDetection
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> image_processor = AutoImageProcessor.from_pretrained("hustvl/yolos-tiny")
        >>> model = AutoModelForObjectDetection.from_pretrained("hustvl/yolos-tiny")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
        >>> target_sizes = torch.tensor([image.size[::-1]])
        >>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[
        ...     0
        ... ]

        >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
        ...     box = [round(i, 2) for i in box.tolist()]
        ...     print(
        ...         f"Detected {model.config.id2label[label.item()]} with confidence "
        ...         f"{round(score.item(), 3)} at location {box}"
        ...     )
        Detected remote with confidence 0.991 at location [46.48, 72.78, 178.98, 119.3]
        Detected remote with confidence 0.908 at location [336.48, 79.27, 368.23, 192.36]
        Detected cat with confidence 0.934 at location [337.18, 18.06, 638.14, 373.09]
        Detected cat with confidence 0.979 at location [10.93, 53.74, 313.41, 470.67]
        Detected remote with confidence 0.974 at location [41.63, 72.23, 178.09, 119.99]
        ```N)NNN)NN)r   r   r   r   r   r   r   r   )r   r   r-   r6   r  r  sigmoidauxiliary_lossr   loss_functiondevicer   r   )rA   rD   r  r   outputsr   r   r   r   r   r   r  r  r   r*   r*   r+   rT   *  s2   7 



zYolosForObjectDetection.forwardrZ   )r    r!   r"   r   r1   r  r   r   r$   r%   r(   r'   r   r   r   rT   rW   r*   r*   rB   r+   r    s    
r  )r  r   r   )Nr   )9r#   collections.abcr   r   dataclassesr   r$   r   activationsr   modeling_layersr   modeling_outputsr   r	   modeling_utilsr
   r   processing_utilsr   utilsr   r   r   r   utils.genericr   r   utils.output_capturingr   configuration_yolosr   
get_loggerr    loggerr   Moduler,   r?   ry   r8   rV   floatr   r   r   r   r   r   r   r   r   r   r   r   r  __all__r*   r*   r*   r+   <module>   s|   
!*!(
3,+o