o
    iw                     @   s^  d Z ddlZddlmZ ddlmZmZmZ ddl	Z	ddl	m
Z
 ddlmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZmZ ddlmZmZmZmZ ddlm Z m!Z! ddl"m#Z# e$e%Z&eeddG dd deZ'G dd de
j(Z)G dd de
j(Z*G dd de
j(Z+G dd de
j(Z,	dAde
j(de	j-d e	j-d!e	j-d"ee	j- d#e.d$e.fd%d&Z/G d'd( d(e
j(Z0G d)d* d*e
j(Z1G d+d, d,e
j(Z2G d-d. d.e
j(Z3G d/d0 d0e
j(Z4G d1d2 d2eZ5G d3d4 d4e
j(Z6eG d5d6 d6eZ7eG d7d8 d8e7Z8G d9d: d:e
j(Z9G d;d< d<e
j(Z:ed=dG d>d? d?e7Z;g d@Z<dS )BzPyTorch YOLOS model.    N)	dataclass)CallableOptionalUnion)nn   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack) find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputTransformersKwargsauto_docstringlogging)can_return_tuplecheck_model_inputs   )YolosConfigz5
    Output type of [`YolosForObjectDetection`].
    )custom_introc                   @   s   e Zd ZU dZdZeej ed< dZ	ee
 ed< dZeej ed< dZeej ed< dZeee
  ed< dZeej ed< dZeeej  ed	< dZeeej  ed
< dS )YolosObjectDetectionOutputa0  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
        Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
        bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
        scale-invariant IoU loss.
    loss_dict (`Dict`, *optional*):
        A dictionary containing the individual losses. Useful for logging.
    logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
        Classification logits (including no-object) for all queries.
    pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
        Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
        values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
        possible padding). You can use [`~YolosImageProcessor.post_process`] to retrieve the unnormalized bounding
        boxes.
    auxiliary_outputs (`list[Dict]`, *optional*):
        Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
        and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
        `pred_boxes`) for each decoder layer.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the decoder of the model.
    Nloss	loss_dictlogits
pred_boxesauxiliary_outputslast_hidden_statehidden_states
attentions)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   dictr   r   r   listr    r!   tupler"    r-   r-   \/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/yolos/modeling_yolos.pyr   &   s   
 r   c                       s@   e Zd ZdZdeddf fddZdejdejfdd	Z  Z	S )
YolosEmbeddingszT
    Construct the CLS token, detection tokens, position and patch embeddings.

    configreturnNc                    s   t    ttdd|j| _ttd|j|j| _	t
|| _| jj}ttd||j d |j| _t|j| _t|| _|| _d S Nr   )super__init__r   	Parameterr'   zeroshidden_size	cls_tokennum_detection_tokensdetection_tokensYolosPatchEmbeddingspatch_embeddingsnum_patchesposition_embeddingsDropouthidden_dropout_probdropout$InterpolateInitialPositionEmbeddingsinterpolationr0   )selfr0   r=   	__class__r-   r.   r4   S   s   



zYolosEmbeddings.__init__pixel_valuesc                 C   s   |j \}}}}| |}| \}}}| j|dd}	| j|dd}
tj|	||
fdd}| | j	||f}|| }| 
|}|S )Nr   dim)shaper<   sizer8   expandr:   r'   catrC   r>   rA   )rD   rG   
batch_sizenum_channelsheightwidth
embeddingsseq_len_
cls_tokensr:   r>   r-   r-   r.   forwardb   s   

zYolosEmbeddings.forward
r#   r$   r%   r&   r   r4   r'   TensorrW   __classcell__r-   r-   rE   r.   r/   M   s    r/   c                       0   e Zd Zd fddZd	dejfddZ  ZS )
rB   r1   Nc                       t    || _d S Nr3   r4   r0   rD   r0   rE   r-   r.   r4   x      

z-InterpolateInitialPositionEmbeddings.__init__i   i@  c                 C   s  |d d dd d f }|d d d f }|d d | j j d d d f }|d d d| j j d d f }|dd}|j\}}}| j jd | j j | j jd | j j }	}
||||	|
}|\}}|| j j || j j }}tjj	|||fddd}|
ddd}tj|||fdd}|S )Nr   r      bicubicFrL   modealign_cornersrI   )r0   r9   	transposerK   
image_size
patch_sizeviewr   
functionalinterpolateflattenr'   rN   )rD   	pos_embedimg_sizecls_pos_embeddet_pos_embedpatch_pos_embedrO   r7   rT   patch_heightpatch_widthrQ   rR   new_patch_heightnew_patch_widthscale_pos_embedr-   r-   r.   rW   |   s$     z,InterpolateInitialPositionEmbeddings.forwardr1   Nra   r#   r$   r%   r4   r'   rY   rW   rZ   r-   r-   rE   r.   rB   w       rB   c                       r[   )
 InterpolateMidPositionEmbeddingsr1   Nc                    r\   r]   r^   r_   rE   r-   r.   r4      r`   z)InterpolateMidPositionEmbeddings.__init__ra   c                 C   sH  |d d d d dd d f }|d d d f }|d d d d | j j d d d f }|d d d d d| j j d d f }|dd}|j\}}}}	| j jd | j j | j jd | j j }
}||| ||
|}|\}}|| j j || j j }}tjj	|||fddd}|
ddd |||| |}tj|||fdd}|S )	Nr   r   rb   r   rc   Frd   rI   )r0   r9   rg   rK   rh   ri   rj   r   rk   rl   rm   
contiguousr'   rN   )rD   rn   ro   rp   rq   rr   depthrO   r7   rT   rs   rt   rQ   rR   ru   rv   rw   r-   r-   r.   rW      s,   &&z(InterpolateMidPositionEmbeddings.forwardrx   ry   rz   r-   r-   rE   r.   r|      r{   r|   c                       s6   e Zd ZdZ fddZdejdejfddZ  ZS )r;   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    s   t    |j|j}}|j|j}}t|tjj	r|n||f}t|tjj	r)|n||f}|d |d  |d |d   }|| _|| _|| _|| _
tj||||d| _d S )Nr   r   )kernel_sizestride)r3   r4   rh   ri   rP   r7   
isinstancecollectionsabcIterabler=   r   Conv2d
projection)rD   r0   rh   ri   rP   r7   r=   rE   r-   r.   r4      s   
 zYolosPatchEmbeddings.__init__rG   r1   c                 C   s<   |j \}}}}|| jkrtd| |ddd}|S )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.rb   r   )rK   rP   
ValueErrorr   rm   rg   )rD   rG   rO   rP   rQ   rR   rS   r-   r-   r.   rW      s   
zYolosPatchEmbeddings.forward)	r#   r$   r%   r&   r4   r'   rY   rW   rZ   r-   r-   rE   r.   r;      s    r;           modulequerykeyvalueattention_maskscalingrA   c           
      K   s|   t ||dd| }tjj|dt jd|j}tjj	||| j
d}|d ur,|| }t ||}	|	dd }	|	|fS )NrH   )rJ   dtype)ptrainingr   rb   )r'   matmulrg   r   rk   softmaxfloat32tor   rA   r   r}   )
r   r   r   r   r   r   rA   kwargsattn_weightsattn_outputr-   r-   r.   eager_attention_forward   s   r   c                	       sP   e Zd Zdef fddZ	d
dejdeej deejejf fdd	Z	  Z
S )YolosSelfAttentionr0   c                    s   t    |j|j dkrt|dstd|j d|j d|| _|j| _t|j|j | _| j| j | _	|j
| _| jd | _d| _tj|j| j	|jd| _tj|j| j	|jd| _tj|j| j	|jd| _d S )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .g      F)bias)r3   r4   r7   num_attention_headshasattrr   r0   intattention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probr   	is_causalr   Linearqkv_biasr   r   r   r_   rE   r-   r.   r4      s"   

zYolosSelfAttention.__init__Nr!   	head_maskr1   c              
   C   s   |j d }|d| j| jf}| |j| dd}| |j| dd}| |j| dd}t}| j	j
dkr?t| j	j
 }|| ||||| j| j| jsNdn| jd\}	}
|	 d d | jf }|	|}	|	|
fS )	Nr   rH   r   rb   eagerr   )r   r   rA   r   )rK   r   r   r   rj   rg   r   r   r   r0   _attn_implementationr   r   r   r   r   rL   r   reshape)rD   r!   r   rO   	new_shape	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shaper-   r-   r.   rW     s*   


zYolosSelfAttention.forwardr]   )r#   r$   r%   r   r4   r'   rY   r   r,   rW   rZ   r-   r-   rE   r.   r      s    r   c                       sB   e Zd ZdZdef fddZdejdejdejfdd	Z  Z	S )
YolosSelfOutputz
    The residual connection is defined in YolosLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    r0   c                    s.   t    t|j|j| _t|j| _d S r]   )	r3   r4   r   r   r7   denser?   r@   rA   r_   rE   r-   r.   r4   1     
zYolosSelfOutput.__init__r!   input_tensorr1   c                 C      |  |}| |}|S r]   r   rA   rD   r!   r   r-   r-   r.   rW   6     

zYolosSelfOutput.forwardrX   r-   r-   rE   r.   r   +  s    $r   c                       sV   e Zd Zdef fddZdee fddZddej	d	e
ej	 d
ej	fddZ  ZS )YolosAttentionr0   c                    s*   t    t|| _t|| _t | _d S r]   )r3   r4   r   	attentionr   outputsetpruned_headsr_   rE   r-   r.   r4   >  s   


zYolosAttention.__init__headsc                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   rI   )lenr   r   r   r   r   r   r   r   r   r   r   r   union)rD   r   indexr-   r-   r.   prune_headsD  s   zYolosAttention.prune_headsNr!   r   r1   c                 C   s    |  ||\}}| ||}|S r]   )r   r   )rD   r!   r   self_attn_outputrU   r   r-   r-   r.   rW   V  s   zYolosAttention.forwardr]   )r#   r$   r%   r   r4   r   r   r   r'   rY   r   rW   rZ   r-   r-   rE   r.   r   =  s    *r   c                       8   e Zd Zdef fddZdejdejfddZ  ZS )YolosIntermediater0   c                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r]   )r3   r4   r   r   r7   intermediate_sizer   r   
hidden_actstrr   intermediate_act_fnr_   rE   r-   r.   r4   ^  s
   
zYolosIntermediate.__init__r!   r1   c                 C   r   r]   )r   r   )rD   r!   r-   r-   r.   rW   f  r   zYolosIntermediate.forward	r#   r$   r%   r   r4   r'   rY   rW   rZ   r-   r-   rE   r.   r   ]  s    r   c                       s>   e Zd Zdef fddZdejdejdejfddZ  ZS )	YolosOutputr0   c                    s.   t    t|j|j| _t|j| _	d S r]   )
r3   r4   r   r   r   r7   r   r?   r@   rA   r_   rE   r-   r.   r4   n  r   zYolosOutput.__init__r!   r   r1   c                 C   s    |  |}| |}|| }|S r]   r   r   r-   r-   r.   rW   s  s   

zYolosOutput.forwardr   r-   r-   rE   r.   r   m  s    $r   c                       sH   e Zd ZdZdef fddZddejdeej dejfd	d
Z	  Z
S )
YolosLayerz?This corresponds to the Block class in the timm implementation.r0   c                    sb   t    |j| _d| _t|| _t|| _t|| _	t
j|j|jd| _t
j|j|jd| _d S )Nr   eps)r3   r4   chunk_size_feed_forwardseq_len_dimr   r   r   intermediater   r   r   	LayerNormr7   layer_norm_epslayernorm_beforelayernorm_afterr_   rE   r-   r.   r4   ~  s   



zYolosLayer.__init__Nr!   r   r1   c                 C   sB   |  |}| ||}|| }| |}| |}| ||}|S r]   )r   r   r   r   r   )rD   r!   r   hidden_states_normattention_outputlayer_outputr-   r-   r.   rW     s   


zYolosLayer.forwardr]   )r#   r$   r%   r&   r   r4   r'   rY   r   rW   rZ   r-   r-   rE   r.   r   {  s    *
r   c                       sP   e Zd Zdeddf fddZ	ddejdeded	eej de	f
d
dZ
  ZS )YolosEncoderr0   r1   Nc                    s   t     | _t fddt jD | _d| _d j	d  j	d   j
d    j } jrAtt jd d| jnd | _ jrNt | _d S d | _d S )Nc                    s   g | ]}t  qS r-   )r   ).0rU   r0   r-   r.   
<listcomp>  s    z)YolosEncoder.__init__.<locals>.<listcomp>Fr   r   rb   )r3   r4   r0   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingrh   ri   r9   use_mid_position_embeddingsr5   r'   r6   r7   mid_position_embeddingsr|   rC   )rD   r0   
seq_lengthrE   r   r.   r4     s$   
 &	zYolosEncoder.__init__r!   rQ   rR   r   c           	      C   sz   | j jr| | j||f}t| jD ]%\}}|d ur|| nd }|||}| j jr7|| j jd k r7|||  }qt|dS )Nr   )r    )r0   r   rC   r   	enumerater   r   r
   )	rD   r!   rQ   rR   r   $interpolated_mid_position_embeddingsilayer_modulelayer_head_maskr-   r-   r.   rW     s   

zYolosEncoder.forwardr]   )r#   r$   r%   r   r4   r'   rY   r   r   r
   rW   rZ   r-   r-   rE   r.   r     s    r   c                   @   sb   e Zd ZU eed< dZdZdZg ZdZ	dZ
dZdZeedZdeejejejf ddfd	d
ZdS )YolosPreTrainedModelr0   vitrG   T)r!   r"   r   r1   Nc                 C   st   t |tjtjfr#|jjjd| jjd |j	dur!|j	j
  dS dS t |tjr8|j	j
  |jjd dS dS )zInitialize the weightsr   )meanstdNg      ?)r   r   r   r   weightdatanormal_r0   initializer_ranger   zero_r   fill_)rD   r   r-   r-   r.   _init_weights  s   
z"YolosPreTrainedModel._init_weights)r#   r$   r%   r   r)   base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsr   r   r   r   r   r   r-   r-   r-   r.   r     s   
 &r   c                       s   e Zd Zddedef fddZdefddZd	ee	e
e	 f dd
fddZedde	
	
ddeej deej dee defddZ  ZS )
YolosModelTr0   add_pooling_layerc                    sX   t  | || _t|| _t|| _tj|j	|j
d| _|r#t|nd| _|   dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        r   N)r3   r4   r0   r/   rS   r   encoderr   r   r7   r   	layernormYolosPoolerpooler	post_init)rD   r0   r  rE   r-   r.   r4     s   

zYolosModel.__init__r1   c                 C   s   | j jS r]   )rS   r<   )rD   r-   r-   r.   get_input_embeddings  s   zYolosModel.get_input_embeddingsheads_to_pruneNc                 C   s*   |  D ]\}}| jj| j| qdS )a	  
        Prunes heads of the model.

        Args:
            heads_to_prune (`dict`):
                See base class `PreTrainedModel`. The input dictionary must have the following format: {layer_num:
                list of heads to prune in this layer}
        N)itemsr  r   r   r   )rD   r
  r   r   r-   r-   r.   _prune_heads  s   	zYolosModel._prune_headsF)tie_last_hidden_statesrG   r   r   c           
      K   s   |d u rt d| || jj}| |}|jdd  \}}| j||||d}|j}| |}| j	d ur9| 	|nd }	t
||	dS )Nz You have to specify pixel_valuesr   )rQ   rR   r   )r    pooler_output)r   get_head_maskr0   r   rS   rK   r  r    r  r  r   )
rD   rG   r   r   embedding_outputrQ   rR   encoder_outputssequence_outputpooled_outputr-   r-   r.   rW     s   

zYolosModel.forward)TNN)r#   r$   r%   r   boolr4   r;   r	  r*   r   r+   r  r   r   r   r'   rY   r   r   r   rW   rZ   r-   r-   rE   r.   r    s"    r  c                       r   )r  r0   c                    s*   t    t|j|j| _t | _d S r]   )r3   r4   r   r   r7   r   Tanh
activationr_   rE   r-   r.   r4   &  s   
zYolosPooler.__init__r!   r1   c                 C   s(   |d d df }|  |}| |}|S )Nr   )r   r  )rD   r!   first_token_tensorr  r-   r-   r.   rW   +  s   

zYolosPooler.forwardr   r-   r-   rE   r.   r  %  s    r  c                       s(   e Zd ZdZ fddZdd Z  ZS )YolosMLPPredictionHeada  
    Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
    height and width of a bounding box w.r.t. an image.

    Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py

    c                    sJ   t    || _|g|d  }tdd t|g| ||g D | _d S )Nr   c                 s   s     | ]\}}t ||V  qd S r]   )r   r   )r   nkr-   r-   r.   	<genexpr>B  s    z2YolosMLPPredictionHead.__init__.<locals>.<genexpr>)r3   r4   
num_layersr   r   ziplayers)rD   	input_dim
hidden_dim
output_dimr  hrE   r-   r.   r4   >  s   
,zYolosMLPPredictionHead.__init__c                 C   s>   t | jD ]\}}|| jd k rtj||n||}q|S r2   )r   r  r  r   rk   relu)rD   xr   r   r-   r-   r.   rW   D  s   (zYolosMLPPredictionHead.forward)r#   r$   r%   r&   r4   rW   rZ   r-   r-   rE   r.   r  5  s    r  zy
    YOLOS Model (consisting of a ViT encoder) with object detection heads on top, for tasks such as COCO detection.
    c                       sf   e Zd Zdef fddZejjdd Ze	e
	ddejdeee  d	ee d
efddZ  ZS )YolosForObjectDetectionr0   c                    sX   t  | t|dd| _t|j|j|jd dd| _t|j|jddd| _| 	  d S )NF)r  r   r   )r   r!  r"  r     )
r3   r4   r  r   r  r7   
num_labelsclass_labels_classifierbbox_predictorr  r_   rE   r-   r.   r4   P  s   z YolosForObjectDetection.__init__c                 C   s$   dd t |d d |d d D S )Nc                 S   s   g | ]	\}}||d qS ))r   r   r-   )r   abr-   r-   r.   r   h  s    z9YolosForObjectDetection._set_aux_loss.<locals>.<listcomp>rH   )r  )rD   outputs_classoutputs_coordr-   r-   r.   _set_aux_lossc  s   $z%YolosForObjectDetection._set_aux_lossNrG   labelsr   r1   c              
   K   s   | j |fi |}|j}|dd| jj dddf }| |}| | }d\}}	}
|durXd\}}| jjrH|j}| |}| | }| 	||| j
|| j||\}}	}
t||	|||
|j|j|jdS )a	  
        labels (`list[Dict]` of len `(batch_size,)`, *optional*):
            Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
            following 2 keys: `'class_labels'` and `'boxes'` (the class labels and bounding boxes of an image in the
            batch respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding
            boxes in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image,
            4)`.

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoModelForObjectDetection
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("hustvl/yolos-tiny")
        >>> model = AutoModelForObjectDetection.from_pretrained("hustvl/yolos-tiny")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
        >>> target_sizes = torch.tensor([image.size[::-1]])
        >>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[
        ...     0
        ... ]

        >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
        ...     box = [round(i, 2) for i in box.tolist()]
        ...     print(
        ...         f"Detected {model.config.id2label[label.item()]} with confidence "
        ...         f"{round(score.item(), 3)} at location {box}"
        ...     )
        Detected remote with confidence 0.991 at location [46.48, 72.78, 178.98, 119.3]
        Detected remote with confidence 0.908 at location [336.48, 79.27, 368.23, 192.36]
        Detected cat with confidence 0.934 at location [337.18, 18.06, 638.14, 373.09]
        Detected cat with confidence 0.979 at location [10.93, 53.74, 313.41, 470.67]
        Detected remote with confidence 0.974 at location [41.63, 72.23, 178.09, 119.99]
        ```N)NNNr  )r   r   r   r   r   r    r!   r"   )r   r    r0   r9   r)  r*  sigmoidauxiliary_lossr!   loss_functiondevicer   r"   )rD   rG   r0  r   outputsr  r   r   r   r   r   r-  r.  r   r-   r-   r.   rW   j  s2   5 



zYolosForObjectDetection.forwardr]   )r#   r$   r%   r   r4   r'   jitunusedr/  r   r   r(   r   r+   r*   r   r   r   rW   rZ   r-   r-   rE   r.   r&  J  s     

r&  )r&  r  r   )r   )=r&   collections.abcr   dataclassesr   typingr   r   r   r'   r   activationsr   modeling_layersr	   modeling_outputsr
   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   r   utilsr   r   r   r   utils.genericr   r   configuration_yolosr   
get_loggerr#   loggerr   Moduler/   rB   r|   r;   rY   floatr   r   r   r   r   r   r   r   r   r  r  r  r&  __all__r-   r-   r-   r.   <module>   sv   
!*!)
5 .@q