o
    ei.                     @   s   d dl Z d dlmZ ddlmZ ddlmZ ddlmZm	Z	m
Z
mZ e r+d dlmZ 		dd	e jd
e jdededef
ddZG dd deZG dd de	Z				dddZdS )    N   )center_to_corners_format)is_scipy_available   )HungarianMatcher	ImageLoss_set_aux_lossgeneralized_box_ioulinear_sum_assignment      ?inputstargets	num_boxesalphagammac           
      C   sv   |   }tjj| |dd}|| d| d|   }|d| |  }|dkr5|| d| d|   }	|	| }| | S )aR  
    Loss used in RetinaNet for dense detection: https://huggingface.co/papers/1708.02002.

    Args:
        inputs (`torch.FloatTensor` of arbitrary shape):
            The predictions for each example.
        targets (`torch.FloatTensor` with the same shape as `inputs`)
            A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class
            and 1 for the positive class).
        num_boxes (`int`):
            The total number of boxes in the batch.
        alpha (`float`, *optional*, defaults to 0.25):
            Optional weighting factor in the range (0,1) to balance positive vs. negative examples.
        gamma (`int`, *optional*, defaults to 2):
            Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples.

    Returns:
        Loss tensor
    none)	reductionr   r   )sigmoidnn
functional binary_cross_entropy_with_logitssum)
r   r   r   r   r   probce_lossp_tlossalpha_t r   c/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/loss/loss_grounding_dino.pysigmoid_focal_loss   s   r    c                   @   s   e Zd Ze dd ZdS )GroundingDinoHungarianMatcherc                 C   sd  |d j dd \}}|d dd }|d dd}|d }tdd	 t||D }||jd
dd }tdd	 |D }d}	d}
d|	 ||
  d| d    }|	d| |
  |d    }|| |  }tj	||dd}t
t|t| }| j| | j|  | j|  }|||d
 }dd	 |D }dd	 t||d
D }dd	 |D S )a  
        Args:
            outputs (`dict`):
                A dictionary that contains at least these entries:
                * "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                * "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates.
                * "label_maps": Tuple of tensors of dim [num_classes, hidden_dim].
            targets (`list[dict]`):
                A list of targets (len(targets) = batch_size), where each target is a dict containing:
                * "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of
                  ground-truth
                 objects in the target) containing the class labels
                * "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates.

        Returns:
            `list[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where:
            - index_i is the indices of the selected predictions (in order)
            - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        logitsNr   r   r   
pred_boxes
label_mapsc                 S   s   g | ]
\}}||d   qS class_labelsr   ).0	label_maptargetr   r   r   
<listcomp>b       z9GroundingDinoHungarianMatcher.forward.<locals>.<listcomp>T)dimkeepdimc                 S   s   g | ]}|d  qS boxesr   r'   vr   r   r   r*   g   s    r          @g:0yE>)pc                 S      g | ]}t |d  qS r/   lenr1   r   r   r   r*   {       c                 S   s   g | ]
\}}t || qS r   r
   )r'   icr   r   r   r*   |   r+   c                 S   s0   g | ]\}}t j|t jd t j|t jd fqS ))dtype)torch	as_tensorint64)r'   r9   jr   r   r   r*   }   s   0 )shapeflattenr   r<   catzipr   logtcdistr	   r   	bbox_cost
class_cost	giou_costviewcpu	enumeratesplit)selfoutputsr   
batch_sizenum_queriesout_probout_bboxr$   target_bboxr   r   neg_cost_classpos_cost_classrH   rG   rI   cost_matrixsizesindicesr   r   r   forwardD   s&   "z%GroundingDinoHungarianMatcher.forwardN)__name__
__module____qualname__r<   no_gradrZ   r   r   r   r   r!   C   s    r!   c                   @   s8   e Zd ZdZdd Ze dd Zdd Zdd	 Z	d
S )GroundingDinoImageLossa  
    This class computes the losses for `GroundingDinoForObjectDetection`. The process happens in two steps: 1) we
    compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of
    matched ground-truth / prediction (supervise class and box).

    Args:
        matcher (`GroundingDinoHungarianMatcher`):
            Module able to compute a matching between targets and proposals.
        focal_alpha (`float`):
            Alpha parameter in focal loss.
        losses (`list[str]`):
            List of all the losses to be applied. See `get_loss` for a list of all available losses.
    c                 C   s"   t j|  || _|| _|| _d S N)r   Module__init__matcherfocal_alphalosses)rN   rc   rd   re   r   r   r   rb      s   
zGroundingDinoImageLoss.__init__c                 C   sb   |d }|j }tjdd |D |d}| djdkd}tj	|
 |
 }	d|	i}
|
S )	z
        Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes.

        This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients.
        r"   c                 S   r5   r%   r6   r1   r   r   r   r*      r8   z;GroundingDinoImageLoss.loss_cardinality.<locals>.<listcomp>)devicer,   g      ?r   cardinality_error)rf   r<   r=   r   maxvaluesr   r   r   l1_lossfloat)rN   rO   r   rY   r   r"   rf   target_lengths	card_predcard_errre   r   r   r   loss_cardinality   s   z'GroundingDinoImageLoss.loss_cardinalityc           	         sr    d }t  fddtt||D }t j d dd}| |}t j||jt jd}|| t j||< |S )z>
        Create one_hot based on the matching indices
        r"   c                    sH   g | ] \}\}\}}|d kr|d | t  d |  n|d | qS )r   r&   r$   r6   )r'   r9   r)   _JrO   r   r   r*      s    .zFGroundingDinoImageLoss._get_target_classes_one_hot.<locals>.<listcomp>r$   r   )r-   )rf   r;   )	r<   rB   rL   rC   _get_source_permutation_idx
zeros_likerf   longto)	rN   rO   r   rY   r"   r&   r$   idxtarget_classes_onehotr   rr   r   _get_target_classes_one_hot   s   

z2GroundingDinoImageLoss._get_target_classes_one_hotc           
      C   s~   d|vrt dd|vrt d| |||}|d }|d }t||}t||}| }t|||| jdd}d|i}	|	S )z
        Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor
        of dim [nb_target_boxes]
        r"   z#No logits were found in the outputs	text_maskz&No text_mask were found in the outputsr   )r   r   r   r   r   loss_ce)KeyErrorry   r<   masked_selectrk   r    rd   )
rN   rO   r   rY   r   rx   source_logitsrz   r{   re   r   r   r   loss_labels   s&   z"GroundingDinoImageLoss.loss_labelsN)
r[   r\   r]   __doc__rb   r<   r^   ro   ry   r   r   r   r   r   r_      s    
r_   c                    sl  t |j|j|jd}g d}t||j|d}|| i }| |d< ||d< ||d< ||d< d }|jrHt||}|D ]
}||d< ||d< q9||d< ||||j	rj|	|
||d	}|||}d
d |
 D }| d|j|jd|j	rdd 
 D }| |jri }t|jd D ] | fdd
 D  q| tfddD }||fS )N)rH   rG   rI   )labelsr0   cardinality)rc   rd   re   r"   r#   r$   rz   auxiliary_outputs)r"   r#   r$   rz   c                 S      i | ]	\}}|d  |qS _encr   r'   kr2   r   r   r   
<dictcomp>      z7GroundingDinoForObjectDetectionLoss.<locals>.<dictcomp>r3   )r{   	loss_bbox	loss_giouc                 S   r   r   r   r   r   r   r   r     r   r   c                    s    i | ]\}}|d    |qS )rp   r   r   )r9   r   r   r     s     c                 3   s(    | ]}|v r | |  V  qd S r`   r   )r'   r   )	loss_dictweight_dictr   r   	<genexpr>  s   & z6GroundingDinoForObjectDetectionLoss.<locals>.<genexpr>)r!   rH   rG   rI   r_   rd   rv   auxiliary_lossr   	two_stageitemsupdatebbox_loss_coefficientgiou_loss_coefficientrangedecoder_layersr   )r"   r   rf   r#   configr$   rz   outputs_classoutputs_coordencoder_logitsencoder_pred_boxesrc   re   	criterionoutputs_lossr   
aux_outputencoder_outputs_lossencoder_loss_dictenc_weight_dictaux_weight_dictr   r   )r9   r   r   r   #GroundingDinoForObjectDetectionLoss   s\   








r   )r   r   )NNNN)r<   torch.nnr   image_transformsr   utilsr   loss_for_object_detectionr   r   r   r	   scipy.optimizer   Tensorintrk   r    r!   r_   r   r   r   r   r   <module>   s6   	
'=a