"""PyTorch YOLOS model."""

import collections.abc
from dataclasses import dataclass
from typing import Callable, Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import ModelOutput, auto_docstring, logging
from .configuration_yolos import YolosConfig


logger = logging.get_logger(__name__)


@dataclass
@auto_docstring(
    custom_intro="""
    Output type of [`YolosForObjectDetection`].
    """
)
class YolosObjectDetectionOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
        Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
        bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
        scale-invariant IoU loss.
    loss_dict (`Dict`, *optional*):
        A dictionary containing the individual losses. Useful for logging.
    logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
        Classification logits (including no-object) for all queries.
    pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
        Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
        values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
        possible padding). You can use [`~YolosImageProcessor.post_process`] to retrieve the unnormalized bounding
        boxes.
    auxiliary_outputs (`list[Dict]`, *optional*):
        Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
        and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
        `pred_boxes`) for each decoder layer.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the decoder of the model.
    Nloss	loss_dictlogits
pred_boxesauxiliary_outputslast_hidden_statehidden_states
attentions)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   dictr   r   r   listr   r   tupler    r)   r)   e/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/yolos/modeling_yolos.pyr   %   s   
 r   c                       s@   e Zd ZdZdeddf fddZdejdejfdd	Z  Z	S )
    """
    Construct the CLS token, detection tokens, position and patch embeddings.
    """

    def __init__(self, config: YolosConfig) -> None:
        super().__init__()

        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        self.detection_tokens = nn.Parameter(torch.zeros(1, config.num_detection_tokens, config.hidden_size))
        self.patch_embeddings = YolosPatchEmbeddings(config)
        num_patches = self.patch_embeddings.num_patches
        self.position_embeddings = nn.Parameter(
            torch.zeros(1, num_patches + config.num_detection_tokens + 1, config.hidden_size)
        )

        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.interpolation = InterpolateInitialPositionEmbeddings(config)
        self.config = config

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        embeddings = self.patch_embeddings(pixel_values)

        batch_size, seq_len, _ = embeddings.size()

        # add the [CLS] and detection tokens to the embedded patch tokens
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        detection_tokens = self.detection_tokens.expand(batch_size, -1, -1)
        embeddings = torch.cat((cls_tokens, embeddings, detection_tokens), dim=1)

        # add positional encoding to each token
        # this might require interpolation of the existing position embeddings
        position_embeddings = self.interpolation(self.position_embeddings, (height, width))

        embeddings = embeddings + position_embeddings

        embeddings = self.dropout(embeddings)

        return embeddings

class InterpolateInitialPositionEmbeddings(nn.Module):
    def __init__(self, config) -> None:
        super().__init__()
        self.config = config

    def forward(self, pos_embed, img_size=(800, 1344)) -> torch.Tensor:
        cls_pos_embed = pos_embed[:, 0, :]
        cls_pos_embed = cls_pos_embed[:, None]
        det_pos_embed = pos_embed[:, -self.config.num_detection_tokens :, :]
        patch_pos_embed = pos_embed[:, 1 : -self.config.num_detection_tokens, :]
        patch_pos_embed = patch_pos_embed.transpose(1, 2)
        batch_size, hidden_size, seq_len = patch_pos_embed.shape

        patch_height, patch_width = (
            self.config.image_size[0] // self.config.patch_size,
            self.config.image_size[1] // self.config.patch_size,
        )
        patch_pos_embed = patch_pos_embed.view(batch_size, hidden_size, patch_height, patch_width)

        height, width = img_size
        new_patch_height, new_patch_width = height // self.config.patch_size, width // self.config.patch_size
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed, size=(new_patch_height, new_patch_width), mode="bicubic", align_corners=False
        )
        patch_pos_embed = patch_pos_embed.flatten(2).transpose(1, 2)
        scale_pos_embed = torch.cat((cls_pos_embed, patch_pos_embed, det_pos_embed), dim=1)
        return scale_pos_embed

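# Illustrative shape walkthrough for the interpolation above. This is a sketch,
# not part of the modeling code; the concrete numbers assume a hypothetical
# config with image_size=(512, 864), patch_size=16, num_detection_tokens=100,
# and an 800x1344 input image:
#
#   pos_embed:        (1, 1 + 32*54 + 100, hidden_size)  # [CLS] + patch grid + [DET] tokens
#   patch_pos_embed:  (1, hidden_size, 32, 54)           # patch part reshaped to a 2D grid
#   after bicubic:    (1, hidden_size, 50, 84)           # 800 // 16 = 50, 1344 // 16 = 84
#   scale_pos_embed:  (1, 1 + 50*84 + 100, hidden_size)  # re-concatenated in token order
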
class InterpolateMidPositionEmbeddings(nn.Module):
    def __init__(self, config) -> None:
        super().__init__()
        self.config = config

    def forward(self, pos_embed, img_size=(800, 1344)) -> torch.Tensor:
        cls_pos_embed = pos_embed[:, :, 0, :]
        cls_pos_embed = cls_pos_embed[:, None]
        det_pos_embed = pos_embed[:, :, -self.config.num_detection_tokens :, :]
        patch_pos_embed = pos_embed[:, :, 1 : -self.config.num_detection_tokens, :]
        patch_pos_embed = patch_pos_embed.transpose(2, 3)
        depth, batch_size, hidden_size, seq_len = patch_pos_embed.shape

        patch_height, patch_width = (
            self.config.image_size[0] // self.config.patch_size,
            self.config.image_size[1] // self.config.patch_size,
        )
        patch_pos_embed = patch_pos_embed.view(depth * batch_size, hidden_size, patch_height, patch_width)
        height, width = img_size
        new_patch_height, new_patch_width = height // self.config.patch_size, width // self.config.patch_size
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed, size=(new_patch_height, new_patch_width), mode="bicubic", align_corners=False
        )
        patch_pos_embed = (
            patch_pos_embed.flatten(2)
            .transpose(1, 2)
            .contiguous()
            .view(depth, batch_size, new_patch_height * new_patch_width, hidden_size)
        )
        scale_pos_embed = torch.cat((cls_pos_embed, patch_pos_embed, det_pos_embed), dim=2)
        return scale_pos_embed

class YolosPatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )

        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
        return embeddings

def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    # Take the dot product between "query" and "key" to get the raw attention scores.
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling

    # Normalize the attention scores to probabilities (in float32 for stability).
    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)

    # This is actually dropping out entire tokens to attend to, which might
    # seem a bit unusual, but is taken from the original Transformer paper.
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    # Mask heads if we want to
    if attention_mask is not None:
        attn_weights = attn_weights * attention_mask

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights

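# Minimal sanity-check sketch for `eager_attention_forward` (illustrative only,
# kept commented out so importing this module has no side effects; the `module`
# argument only needs a `training` attribute, so a bare `nn.Module` suffices):
#
#   dummy = nn.Module()
#   q = k = v = torch.randn(2, 12, 197, 64)  # (batch, heads, seq_len, head_dim)
#   out, weights = eager_attention_forward(dummy, q, k, v, None, scaling=64**-0.5)
#   out.shape      # (2, 197, 12, 64): heads move after seq_len via the final transpose
#   weights.shape  # (2, 12, 197, 197)
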
class YolosSelfAttention(nn.Module):
    def __init__(self, config: YolosConfig) -> None:
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.config = config
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.dropout_prob = config.attention_probs_dropout_prob
        self.scaling = self.attention_head_size**-0.5
        self.is_causal = False

        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]:
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        query_layer = self.transpose_for_scores(self.query(hidden_states))

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. "
                    "Falling back to eager attention. This warning can be removed using the argument "
                    '`attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        context_layer, attention_probs = attention_interface(
            self,
            query_layer,
            key_layer,
            value_layer,
            head_mask,
            is_causal=self.is_causal,
            scaling=self.scaling,
            dropout=0.0 if not self.training else self.dropout_prob,
        )

        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.reshape(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs

class YolosSelfOutput(nn.Module):
    """
    The residual connection is defined in YolosLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: YolosConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states

ej	de
ej	 dedeeej	ej	f eej	 f fddZ  ZS )YolosAttentionr,   r-   Nc                    s*   t    t|| _t|| _t | _d S rY   )r/   r0   r   	attentionr   outputsetpruned_headsr[   rA   r)   r*   r0   I  s   


zYolosAttention.__init__headsc                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   rE   )lenr   r   r   r   r   r   r   r   r   r   r   r   union)r@   r   indexr)   r)   r*   prune_headsO  s   zYolosAttention.prune_headsFr   r   r   c                 C   s4   |  |||}| |d |}|f|dd   }|S )Nr   r   )r   r   )r@   r   r   r   self_outputsattention_outputr   r)   r)   r*   rS   a  s   zYolosAttention.forwardr   )r   r    r!   r   r0   r   r   r   r#   rU   r   r   r   r(   rS   rV   r)   r)   rA   r*   r   H  s    r   c                       s<   e Zd Zdeddf fddZdejdejfddZ  ZS )	YolosIntermediater,   r-   Nc                    sD   t    t|j|j| _t|jt	rt
class YolosIntermediate(nn.Module):
    def __init__(self, config: YolosConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)

        return hidden_states

class YolosOutput(nn.Module):
    def __init__(self, config: YolosConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        hidden_states = hidden_states + input_tensor

        return hidden_states


class YolosLayer(GradientCheckpointingLayer):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config: YolosConfig) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = YolosAttention(config)
        self.intermediate = YolosIntermediate(config)
        self.output = YolosOutput(config)
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]:
        self_attention_outputs = self.attention(
            self.layernorm_before(hidden_states),  # in Yolos, layernorm is applied before self-attention
            head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        # first residual connection
        hidden_states = attention_output + hidden_states

        # in Yolos, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.intermediate(layer_output)

        # second residual connection is done here
        layer_output = self.output(layer_output, hidden_states)

        outputs = (layer_output,) + outputs

        return outputs

ededede	e
ef fddZ  ZS )YolosEncoderr,   r-   Nc                    s   t     | _t fddt jD | _d| _d j	d  j	d   j
d    j } jrAtt jd d| jnd | _ jrNt | _d S d | _d S )Nc                    s   g | ]}t  qS r)   )r   ).0rQ   r,   r)   r*   
<listcomp>  s    z)YolosEncoder.__init__.<locals>.<listcomp>Fr   r   r^   )r/   r0   r,   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingrd   re   r5   use_mid_position_embeddingsr1   r#   r2   r3   mid_position_embeddingsrx   r?   )r@   r,   
seq_lengthrA   r   r*   r0     s$   
 &	zYolosEncoder.__init__FTr   r   r   output_hidden_statesreturn_dictc                 C   s   |rdnd }|r
dnd }	| j jr| | j||f}
t| jD ]:\}}|r)||f }|d ur1|| nd }||||}|d }| j jrO|| j jd k rO||
|  }|rX|	|d f }	q|r`||f }|sntdd |||	fD S t|||	dS )Nr)   r   r   c                 s   s    | ]	}|d ur|V  qd S rY   r)   )r   vr)   r)   r*   	<genexpr>  s    z'YolosEncoder.forward.<locals>.<genexpr>)r   r   r   )	r,   r   r?   r   	enumerater   r   r(   r
   )r@   r   rM   rN   r   r   r   r   all_hidden_statesall_self_attentions$interpolated_mid_position_embeddingsilayer_modulelayer_head_masklayer_outputsr)   r)   r*   rS     s2   


zYolosEncoder.forward)NFFT)r   r    r!   r   r0   r#   rU   r   r   r   r(   r
   rS   rV   r)   r)   rA   r*   r     s&    
	r   c                   @   sR   e Zd ZeZdZdZdZg ZdZ	dZ
dZdZdeejejejf ddfddZdS )	YolosPreTrainedModelvitrC   Tr   r-   Nc                 C   st   t |tjtjfr#|jjjd| jjd |j	dur!|j	j
  dS dS t |tjr8|j	j
  |jjd dS dS )zInitialize the weightsr   )meanstdNg      ?)r}   r   r   r   weightdatanormal_r,   initializer_ranger   zero_r   fill_)r@   r   r)   r)   r*   _init_weights  s   
z"YolosPreTrainedModel._init_weights)r   r    r!   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_2_supports_flex_attn_supports_attention_backendr   r   r   r   r   r  r)   r)   r)   r*   r     s    &r   c                       s   e Zd Zddedef fddZdefddZd	ee	e
@auto_docstring
class YolosModel(YolosPreTrainedModel):
    def __init__(self, config: YolosConfig, add_pooling_layer: bool = True):
        r"""
        add_pooling_layer (`bool`, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config

        self.embeddings = YolosEmbeddings(config)
        self.encoder = YolosEncoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pooler = YolosPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> YolosPatchEmbeddings:
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None:
        """
        Prunes heads of the model.

        Args:
            heads_to_prune (`dict`):
                See base class `PreTrainedModel`. The input dictionary must have the following format: {layer_num:
                list of heads to prune in this layer}
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Prepare head mask if needed: 1.0 in head_mask indicates we keep the head
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(pixel_values)

        encoder_outputs = self.encoder(
            embedding_output,
            height=pixel_values.shape[-2],
            width=pixel_values.shape[-1],
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
            return head_outputs + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class YolosPooler(nn.Module):
    def __init__(self, config: YolosConfig):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        # We "pool" the model by simply taking the hidden state
        # corresponding to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output

class YolosMLPPredictionHead(nn.Module):
    """
    Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
    height and width of a bounding box w.r.t. an image.

    Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        h = [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))

    def forward(self, x):
        for i, layer in enumerate(self.layers):
            x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
        return x

@auto_docstring(
    custom_intro="""
    YOLOS Model (consisting of a ViT encoder) with object detection heads on top, for tasks such as COCO detection.
    """
)
class YolosForObjectDetection(YolosPreTrainedModel):
    def __init__(self, config: YolosConfig):
        super().__init__(config)

        # YOLOS (ViT) encoder model
        self.vit = YolosModel(config, add_pooling_layer=False)

        # Object detection heads
        # We add one for the "no object" class
        self.class_labels_classifier = YolosMLPPredictionHead(
            input_dim=config.hidden_size, hidden_dim=config.hidden_size, output_dim=config.num_labels + 1, num_layers=3
        )
        self.bbox_predictor = YolosMLPPredictionHead(
            input_dim=config.hidden_size, hidden_dim=config.hidden_size, output_dim=4, num_layers=3
        )

        # Initialize weights and apply final processing
        self.post_init()

    @torch.jit.unused
    def _set_aux_loss(self, outputs_class, outputs_coord):
        # this is a workaround to make torchscript happy, as torchscript
        # doesn't support dictionaries with non-homogeneous values, such
        # as a dict having both a Tensor and a list.
        return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]

    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        labels: Optional[list[dict]] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, YolosObjectDetectionOutput]:
        r"""
        labels (`list[Dict]` of len `(batch_size,)`, *optional*):
            Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
            following 2 keys: `'class_labels'` and `'boxes'` (the class labels and bounding boxes of an image in the
            batch respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding
            boxes in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image,
            4)`.

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoModelForObjectDetection
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("hustvl/yolos-tiny")
        >>> model = AutoModelForObjectDetection.from_pretrained("hustvl/yolos-tiny")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
        >>> target_sizes = torch.tensor([image.size[::-1]])
        >>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[
        ...     0
        ... ]

        >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
        ...     box = [round(i, 2) for i in box.tolist()]
        ...     print(
        ...         f"Detected {model.config.id2label[label.item()]} with confidence "
        ...         f"{round(score.item(), 3)} at location {box}"
        ...     )
        Detected remote with confidence 0.991 at location [46.48, 72.78, 178.98, 119.3]
        Detected remote with confidence 0.908 at location [336.48, 79.27, 368.23, 192.36]
        Detected cat with confidence 0.934 at location [337.18, 18.06, 638.14, 373.09]
        Detected cat with confidence 0.979 at location [10.93, 53.74, 313.41, 470.67]
        Detected remote with confidence 0.974 at location [41.63, 72.23, 178.09, 119.99]
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # First, send images through the YOLOS base model to obtain hidden states
        outputs = self.vit(
            pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        # Take the final hidden states of the detection tokens
        sequence_output = sequence_output[:, -self.config.num_detection_tokens :, :]

        # Class logits + predicted bounding boxes
        logits = self.class_labels_classifier(sequence_output)
        pred_boxes = self.bbox_predictor(sequence_output).sigmoid()

        loss, loss_dict, auxiliary_outputs = None, None, None
        if labels is not None:
            outputs_class, outputs_coord = None, None
            if self.config.auxiliary_loss:
                intermediate = outputs.intermediate_hidden_states if return_dict else outputs[4]
                outputs_class = self.class_labels_classifier(intermediate)
                outputs_coord = self.bbox_predictor(intermediate).sigmoid()
            loss, loss_dict, auxiliary_outputs = self.loss_function(
                logits, labels, self.device, pred_boxes, self.config, outputs_class, outputs_coord
            )

        if not return_dict:
            if auxiliary_outputs is not None:
                output = (logits, pred_boxes) + auxiliary_outputs + outputs
            else:
                output = (logits, pred_boxes) + outputs
            return ((loss, loss_dict) + output) if loss is not None else output

        return YolosObjectDetectionOutput(
            loss=loss,
            loss_dict=loss_dict,
            logits=logits,
            pred_boxes=pred_boxes,
            auxiliary_outputs=auxiliary_outputs,
            last_hidden_state=outputs.last_hidden_state,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = ["YolosForObjectDetection", "YolosModel", "YolosPreTrainedModel"]