o
    i^                     @   s8  d dl Zd dlZd dlmZ d dlmZmZ d dlZ	d dl
Z
d dlm  mZ d dl
mZmZ ddlmZ ddlmZmZmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZm Z  ddl!m"Z" ddl#m$Z$ e rrd dl%m&Z& e  rd dl'm(Z( d dl)m*Z* eeddG dd deZ+	dWde
jde
jde
jfddZ,dededefddZ-de
jde
jde
jfd d!Z.G d"d# d#ej/Z0deded$e1defd%d&Z2de
jde
jd$e1de
jfd'd(Z3G d)d* d*ej/Z4G d+d, d,ej/Z5G d-d. d.ej/Z6	/dXd0ej/d1e
jd2e
jd3e
jd4ee
j d5e7d6e7fd7d8Z8G d9d: d:ej/Z9G d;d< d<ej/Z:dYd=e
jd>e7d?e;de
jfd@dAZ<G dBdC dCej/Z=G dDdE dEej/Z>G dFdG dGej/Z?G dHdI dIeZ@G dJdK dKejAZBG dLdM dMej/ZCG dNdO dOej/ZDG dPdQ dQej/ZEeG dRdS dSeZFedTdG dUdV dVeFZGdSdVgZHdS )Z    N)	dataclass)CallableOptional)Tensornn   )ACT2FN)ModelOutputis_scipy_availablerequires_backends)GradientCheckpointingLayer)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringis_accelerate_available)check_model_inputs   )
EomtConfig)linear_sum_assignment)PartialState)reducea  
    Class for outputs of [`EomtForUniversalSegmentationOutput`].

    This output can be directly passed to [`~EomtImageProcessor.post_process_semantic_segmentation`] or
    [`~EomtImageProcessor.post_process_instance_segmentation`] or
    [`~EomtImageProcessor.post_process_panoptic_segmentation`] to compute final segmentation maps. Please, see
    [`~EomtImageProcessor] for details regarding usage.
    )custom_introc                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeej ed< dZeeej  ed< dZeeej  ed< dZeeej  ed	< dS )
"EomtForUniversalSegmentationOutputa*  
    loss (`torch.Tensor`, *optional*):
        The computed loss, returned when labels are present.
    class_queries_logits (`torch.FloatTensor`):
        A tensor of shape `(batch_size, num_queries, num_labels + 1)` representing the proposed classes for each
        query. Note the `+ 1` is needed because we incorporate the null class.
    masks_queries_logits (`torch.FloatTensor`):
        A tensor of shape `(batch_size, num_queries, height, width)` representing the proposed masks for each
        query.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Last hidden states (final feature map) of the last layer.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, sequence_length, hidden_size)`. Hidden-states all layers of the model.
    attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `tuple(torch.FloatTensor)` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Self and Cross Attentions weights from transformer decoder.
    patch_offsets (`list[torch.Tensor]`, *optional*):
        list of tuples indicating the image index and start and end positions of patches for semantic segmentation.
    Nlossclass_queries_logitsmasks_queries_logitslast_hidden_statehidden_states
attentionspatch_offsets)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   r   r   tupler    r!   listr    r+   r+   c/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/eomt/modeling_eomt.pyr   2   s   
 r   Finput_featurespoint_coordinatesreturnc                 K   sL   |  dkrd}|d}tjjj| d| d fi |}|r$|d}|S )a(  
    A wrapper around `torch.nn.functional.grid_sample` to support 3D point_coordinates tensors.

    Args:
        input_features (`torch.Tensor` of shape (batch_size, channels, height, width)):
            A tensor that contains features map on a height * width grid
        point_coordinates (`torch.Tensor` of shape (batch_size, num_points, 2) or (batch_size, grid_height, grid_width,:
        2)):
            A tensor that contains [0, 1] * [0, 1] normalized point coordinates
        add_dim (`bool`):
            boolean value to keep track of added dimension

    Returns:
        point_features (`torch.Tensor` of shape (batch_size, channels, num_points) or (batch_size, channels,
        height_grid, width_grid):
            A tensor that contains features for points in `point_coordinates`.
    r   T   g       @      ?)dim	unsqueezer&   r   
functionalgrid_samplesqueeze)r-   r.   add_dimkwargspoint_featuresr+   r+   r,   sample_point]   s   
 
r:   inputslabelsc                 C   sd   |   d} dt| |j }| ddddf |ddddf  }d|d |d   }|S )a  
    A pair wise version of the dice loss, see `dice_loss` for usage.

    Args:
        inputs (`torch.Tensor`):
            A tensor representing a mask
        labels (`torch.Tensor`):
            A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
            (0 for the negative class and 1 for the positive class).

    Returns:
        `torch.Tensor`: The computed loss between each pairs.
    r   r0   N)sigmoidflattenr&   matmulTsum)r;   r<   	numeratordenominatorr   r+   r+   r,   pair_wise_dice_loss}   s
   ,rE   c           	      C   sj   | j d }tjdd}|| t| }|| t| }t|| |j}t|| d| j}|| }|S )a  
    A pair wise version of the cross entropy loss, see `sigmoid_cross_entropy_loss` for usage.

    Args:
        inputs (`torch.Tensor`):
            A tensor representing a mask.
        labels (`torch.Tensor`):
            A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
            (0 for the negative class and 1 for the positive class).

    Returns:
        loss (`torch.Tensor`): The computed loss between each pairs.
    r   none	reduction)shaper   BCEWithLogitsLossr&   	ones_like
zeros_liker@   rA   )	r;   r<   height_and_width	criterioncross_entropy_loss_poscross_entropy_loss_negloss_posloss_negr   r+   r+   r,   $pair_wise_sigmoid_cross_entropy_loss   s   
rS   c                       sl   e Zd ZdZ	ddedededef fdd	Ze d
ej	dej	dej	dej	de
ee	  f
ddZ  ZS )EomtHungarianMatcheraq  This class computes an assignment between the labels and the predictions of the network.

    For efficiency reasons, the labels don't include the no_object. Because of this, in general, there are more
    predictions than labels. In this case, we do a 1-to-1 matching of the best predictions, while the others are
    un-matched (and thus treated as non-objects).
    r1    1  
cost_class	cost_mask	cost_dice
num_pointsc                    sF   t    |dkr|dkr|dkrtd|| _|| _|| _|| _dS )aH  Creates the matcher

        Params:
            cost_class (`float`, *optional*, defaults to 1.0):
                Relative weight of the classification error in the matching cost.
            cost_mask (`float`, *optional*,  defaults to 1.0):
                This is the relative weight of the focal loss of the binary mask in the matching cost.
            cost_dice (`float`, *optional*, defaults to 1.0):
                This is the relative weight of the dice loss of the binary mask in the matching cost.
            num_points (`int`, *optional*, defaults to 12544):
                No. of points to sample on which the mask loss will be calculated. The same set of K points are
                uniformly sampled for all prediction and ground truth masks to construct the cost matrix for bipartite
                matching.
        r   zAll costs can't be 0N)super__init__
ValueErrorrY   rV   rW   rX   )selfrV   rW   rX   rY   	__class__r+   r,   r[      s   

zEomtHungarianMatcher.__init__r   r   mask_labelsclass_labelsr/   c                 C   sf  g }|j d }t|D ]}|| d}|| }	|dd|| f  }
|| |	}|dddf }|	dddf }	tjd| jd|	jd}||j d dd}t	||dd
d}||	j d dd}t	|	|dd
d}	t|	|}t|	|}| j| | j|
  | j|  }t|td	}t|td
}t|d}t| }|| qdd |D }|S )ao  
        Params:
            masks_queries_logits (`torch.Tensor`):
                A tensor of dim `batch_size, num_queries, num_labels` with the classification logits.
            class_queries_logits (`torch.Tensor`):
                A tensor of dim `batch_size, num_queries, height, width` with the predicted masks.
            class_labels (`torch.Tensor`):
                A tensor of dim `num_target_boxes` (where num_target_boxes is the number of ground-truth objects in the
                target) containing the class labels.
            mask_labels (`torch.Tensor`):
                A tensor of dim `num_target_boxes, height, width` containing the target masks.

        Returns:
            matched_indices (`list[tuple[Tensor]]`): A list of size batch_size, containing tuples of (index_i, index_j)
            where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected labels (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes).
        r   r=   Nr   r0   deviceFalign_cornersg    _Bg    _c                 S   s0   g | ]\}}t j|t jd t j|t jd fqS )dtype)r&   	as_tensorint64).0ijr+   r+   r,   
<listcomp>  s    $z0EomtHungarianMatcher.forward.<locals>.<listcomp>)rI   rangesoftmaxtor&   randrY   rc   repeatr:   r6   rS   rE   rW   rV   rX   minimumtensormaximum
nan_to_numr   cpuappend)r]   r   r   r`   ra   indices
batch_sizerk   
pred_probs	pred_maskrV   target_maskr.   target_coordinatespred_coordinatesrW   rX   cost_matrixassigned_indicesmatched_indicesr+   r+   r,   forward   s4   


zEomtHungarianMatcher.forward)r1   r1   r1   rU   )r"   r#   r$   r%   floatintr[   r&   no_gradr   r*   r)   r   __classcell__r+   r+   r^   r,   rT      s0    
rT   	num_masksc                 C   sX   |   d}d|| d }|d|d }d|d |d   }| | }|S )a4  
    Compute the DICE loss, similar to generalized IOU for masks as follows:

    $$ \mathcal{L}_{\text{dice}(x, y) = 1 - \frac{2 * x \cap y }{x \cup y + 1}} $$

    In practice, since `labels` is a binary mask, (only 0s and 1s), dice can be computed as follow

    $$ \mathcal{L}_{\text{dice}(x, y) = 1 - \frac{2 * x * y }{x + y + 1}} $$

    Args:
        inputs (`torch.Tensor`):
            A tensor representing a mask.
        labels (`torch.Tensor`):
            A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
            (0 for the negative class and 1 for the positive class).
        num_masks (`int`):
            The number of masks present in the current batch, used for normalization.

    Returns:
        `torch.Tensor`: The computed loss.
    r   r0   r=   )r>   r?   rB   )r;   r<   r   probsrC   rD   r   r+   r+   r,   	dice_loss  s   r   c                 C   s,   t jdd}|| |}|d | }|S )a|  
    Args:
        inputs (`torch.Tensor`):
            A float tensor of arbitrary shape.
        labels (`torch.Tensor`):
            A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
            (0 for the negative class and 1 for the positive class).

    Returns:
        loss (`torch.Tensor`): The computed loss.
    rF   rG   r   )r   rJ   meanrB   )r;   r<   r   rN   cross_entropy_lossr   r+   r+   r,   sigmoid_cross_entropy_loss7  s   
r   c                       s~  e Zd Zdedeeef f fddZdeee	  dee	 fddZ
d	ee deeef fd
dZdedee deej deeef fddZdejdeej deej de	deeejf f
ddZdd Zdd ZdejdejfddZdejde	de	dedejf
d d!Z	"d)dejdejdeej deej d#eeeejf  deeejf fd$d%Zdejd&ejdejfd'd(Z  ZS )*EomtLossconfigweight_dictc                    s   t    t| dg |j| _|| _|j| _t| jd }| j|d< | 	d| |j
| _|j| _|j| _t|j|j|j| jd| _dS )aH  
        The Eomt Loss. The loss is computed very similar to DETR. The process happens in two steps: 1) we
        compute hungarian assignment between ground truth masks and the outputs of the model 2) we supervise each pair
        of matched ground-truth / prediction (supervise class and mask)

        Args:
            config (`EomtConfig`):
                The configuration for Eomt model also containing loss calculation specific parameters.
            weight_dict (`dict[str, float]`):
                A dictionary of weights to be applied to the different losses.
        scipyr   r=   empty_weight)rV   rX   rW   rY   N)rZ   r[   r   
num_labelsr   no_object_weighteos_coefr&   onesregister_buffertrain_num_pointsrY   oversample_ratioimportance_sample_ratiorT   class_weightdice_weightmask_weightmatcher)r]   r   r   r   r^   r+   r,   r[   L  s"   

zEomtLoss.__init__sizesr/   c                 C   sB   |d }|dd  D ]}t |D ]\}}t|| |||< qq
|S )Nr   r   )	enumeratemax)r]   r   maxessublistindexitemr+   r+   r,   _max_by_axiso  s   zEomtLoss._max_by_axistensorsc                 C   s   |  dd |D }t|g| }|\}}}}|d j}|d j}	tj|||	d}
tj|||ftj|	d}t||
|D ].\}}}|d |j	d d |j	d d |j	d f 
| d|d |j	d d |j	d f< q;|
|fS )Nc                 S   s   g | ]}t |jqS r+   )r*   rI   )rj   rt   r+   r+   r,   rm   y  s    z8EomtLoss._pad_images_to_max_in_batch.<locals>.<listcomp>r   rg   rc   r   r0   F)r   lenrg   rc   r&   zerosr   boolziprI   copy_)r]   r   max_sizebatch_shaperz   _heightwidthrg   rc   padded_tensorspadding_masksrt   padded_tensorpadding_maskr+   r+   r,   _pad_images_to_max_in_batchw  s   

2"z$EomtLoss._pad_images_to_max_in_batchr   ra   ry   c                 C   s   |}|j \}}}tj| jd}| |}	tdd t||D }
tj||f| j	tj
|jd}|
||	< |dd}|||}d|i}|S )a  Compute the losses related to the labels using cross entropy.

        Args:
            class_queries_logits (`torch.Tensor`):
                A tensor of shape `batch_size, num_queries, num_labels`
            class_labels (`list[torch.Tensor]`):
                List of class labels of shape `(labels)`.
            indices (`tuple[np.array])`:
                The indices computed by the Hungarian matcher.

        Returns:
            `dict[str, Tensor]`: A dict of `torch.Tensor` containing the following key:
            - **loss_cross_entropy** -- The loss computed using cross entropy on the predicted and ground truth labels.
        )weightc                 S   s   g | ]
\}\}}|| qS r+   r+   )rj   targetr   rl   r+   r+   r,   rm     s    z(EomtLoss.loss_labels.<locals>.<listcomp>)
fill_valuerg   rc   r   r0   loss_cross_entropy)rI   r   CrossEntropyLossr   $_get_predictions_permutation_indicesr&   catr   fullr   ri   rc   	transpose)r]   r   ra   ry   pred_logitsrz   num_queriesr   rN   idxtarget_classes_otarget_classespred_logits_transposedloss_celossesr+   r+   r,   loss_labels  s   

zEomtLoss.loss_labelsr   r`   r   c                    s     |} |}|| } |\}}	|| }|dddf }|dddf }t "  | fdd j j j}
t	||
dd
d}W d   n1 sRw   Y  t	||
dd
d}t|||t|||d}~~|S )a  Compute the losses related to the masks using sigmoid_cross_entropy_loss and dice loss.

        Args:
            masks_queries_logits (`torch.Tensor`):
                A tensor of shape `(batch_size, num_queries, height, width)`.
            mask_labels (`torch.Tensor`):
                List of mask labels of shape `(labels, height, width)`.
            indices (`tuple[np.array])`:
                The indices computed by the Hungarian matcher.
            num_masks (`int)`:
                The number of masks, used for normalization.

        Returns:
            losses (`dict[str, Tensor]`): A dict of `torch.Tensor` containing two keys:
            - **loss_mask** -- The loss computed using sigmoid cross entropy loss on the predicted and ground truth.
              masks.
            - **loss_dice** -- The loss computed using dice loss on the predicted on the predicted and ground truth,
              masks.
        Nc                    s
     | S N)calculate_uncertainty)logitsr]   r+   r,   <lambda>  s   
 z%EomtLoss.loss_masks.<locals>.<lambda>Frd   r   )	loss_mask	loss_dice)r    _get_targets_permutation_indicesr   r&   r   sample_points_using_uncertaintyrY   r   r   r:   r6   r   r   )r]   r   r`   ry   r   src_idxtgt_idx
pred_maskstarget_masksr   r.   point_labelspoint_logitsr   r+   r   r,   
loss_masks  s0   





zEomtLoss.loss_masksc                 C   4   t dd t|D }t dd |D }||fS )Nc                 S   s    g | ]\}\}}t ||qS r+   r&   	full_like)rj   rk   srcr   r+   r+   r,   rm          zAEomtLoss._get_predictions_permutation_indices.<locals>.<listcomp>c                 S   s   g | ]\}}|qS r+   r+   )rj   r   r   r+   r+   r,   rm         r&   r   r   )r]   ry   batch_indicespredictions_indicesr+   r+   r,   r        z-EomtLoss._get_predictions_permutation_indicesc                 C   r   )Nc                 S   s    g | ]\}\}}t ||qS r+   r   )rj   rk   r   tgtr+   r+   r,   rm     r   z=EomtLoss._get_targets_permutation_indices.<locals>.<listcomp>c                 S   s   g | ]\}}|qS r+   r+   )rj   r   r   r+   r+   r,   rm     r   r   )r]   ry   r   target_indicesr+   r+   r,   r     r   z)EomtLoss._get_targets_permutation_indicesr   c                 C   s   t | }|S )a  
        In Eomt paper, uncertainty is estimated as L1 distance between 0.0 and the logit prediction in 'logits'
        for the foreground class in `classes`.

        Args:
            logits (`torch.Tensor`):
            A tensor of shape (R, 1, ...) for class-specific or class-agnostic, where R is the total number of predicted masks in all images and C is:
            the number of foreground classes. The values are logits.

        Returns:
            scores (`torch.Tensor`): A tensor of shape (R, 1, ...) that contains uncertainty scores with the most
            uncertain locations having the highest uncertainty score.
        )r&   abs)r]   r   uncertainty_scoresr+   r+   r,   r     s   zEomtLoss.calculate_uncertaintyrY   r   r   c                 C   s   |j d }t|| }tj||d|jd}t||dd}	||	}
t|| }|| }tj|
dddddf |ddd }|tj|tj|jd	 }||dddf 7 }|	d
d|	d
ddf 	||d}|dkr|tj
|tj||d|jdgdd}|S )a  
        This function is meant for sampling points in [0, 1] * [0, 1] coordinate space based on their uncertainty. The
        uncertainty is calculated for each point using the passed `uncertainty function` that takes points logit
        prediction as input.

        Args:
            logits (`float`):
                Logit predictions for P points.
            uncertainty_function:
                A function that takes logit predictions for P points and returns their uncertainties.
            num_points (`int`):
                The number of points P to sample.
            oversample_ratio (`int`):
                Oversampling parameter.
            importance_sample_ratio (`float`):
                Ratio of points that are sampled via importance sampling.

        Returns:
            point_coordinates (`torch.Tensor`):
                Coordinates for P sampled points.
        r   r0   rb   Frd   Nr   )kr2   r   r=   r2   )rI   r   r&   rq   rc   r:   topkarangelongviewr   )r]   r   uncertainty_functionrY   r   r   	num_boxesnum_points_sampledr.   r   point_uncertaintiesnum_uncertain_pointsnum_random_pointsr   shiftr+   r+   r,   r     s"   
&(z(EomtLoss.sample_points_using_uncertaintyNauxiliary_predictionsc                    s   |  ||||}| j||d jd}i | ||||| |||}|durPt|D ]$\ }	|	d }|	d }| ||||}
 fdd|
 D }
||
 q+|S )a  
        This performs the loss computation.

        Args:
            masks_queries_logits (`torch.Tensor`):
                A tensor of shape `(batch_size, num_queries, height, width)`.
            class_queries_logits (`torch.Tensor`):
                A tensor of shape `(batch_size, num_queries, num_labels)`.
            mask_labels (`torch.Tensor`):
                List of mask labels of shape `(labels, height, width)`.
            class_labels (`list[torch.Tensor]`):
                List of class labels of shape `(labels)`.
            auxiliary_predictions (`dict[str, torch.Tensor]`, *optional*):
                if `use_auxiliary_loss` was set to `true` in [`EomtConfig`], then it contains the logits from
                the inner layers of the EomtMaskedAttentionDecoder.

        Returns:
            losses (`dict[str, Tensor]`): A dict of `torch.Tensor` containing three keys:
            - **loss_cross_entropy** -- The loss computed using cross entropy on the predicted and ground truth labels.
            - **loss_mask** -- The loss computed using sigmoid cross_entropy loss on the predicted and ground truth
              masks.
            - **loss_dice** -- The loss computed using dice loss on the predicted on the predicted and ground truth
              masks.
            if `use_auxiliary_loss` was set to `true` in [`EomtConfig`], the dictionary contains additional
            losses for each auxiliary predictions.
        r   rb   Nr   r   c                    s    i | ]\}}| d   |qS )r   r+   )rj   keyvaluer   r+   r,   
<dictcomp>n  r   z$EomtLoss.forward.<locals>.<dictcomp>)	r   get_num_masksrc   r   r   r   r   itemsupdate)r]   r   r   r`   ra   r   ry   r   r   aux_outputs	loss_dictr+   r   r,   r   <  s   $zEomtLoss.forwardrc   c                 C   s^   t dd |D }tj|tj|d}d}t r$tji kr$t|}t j}tj	|| dd}|S )zk
        Computes the average number of target masks across the batch, for normalization purposes.
        c                 s   s    | ]}t |V  qd S r   )r   )rj   classesr+   r+   r,   	<genexpr>w  s    z)EomtLoss.get_num_masks.<locals>.<genexpr>r   r   )min)
rB   r&   rh   r   r   r   _shared_stater   num_processesclamp)r]   ra   rc   r   
world_sizer+   r+   r,   r   s  s   
zEomtLoss.get_num_masksr   )r"   r#   r$   r   dictstrr   r[   r*   r   r   r   r)   r   nparrayr   r&   r   r   r   r   r   r   r   rc   r   r   r+   r+   r^   r,   r   K  sj    #

"
>
=
$7r   c                       s6   e Zd ZdZ fddZdejdejfddZ  ZS )EomtPatchEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    s   t    |j|j}}|j|j}}t|tjj	r|n||f}t|tjj	r)|n||f}|d |d  |d |d   }|| _|| _|| _|| _
tj||||d| _d S )Nr   r   kernel_sizestride)rZ   r[   
image_size
patch_sizenum_channelshidden_size
isinstancecollectionsabcIterablenum_patchesr   Conv2d
projection)r]   r   r  r  r  r  r  r^   r+   r,   r[     s   
 zEomtPatchEmbeddings.__init__pixel_valuesr/   c                 C   sH   |j d }|| jkrtd| j d| d| |ddd}|S )Nr   zoMake sure that the channel dimension of the pixel values match with the one set in the configuration. Expected z	 but got .r0   )rI   r  r\   r  r?   r   )r]   r  r  
embeddingsr+   r+   r,   r     s   

zEomtPatchEmbeddings.forward)	r"   r#   r$   r%   r[   r&   r   r   r   r+   r+   r^   r,   r
    s    r
  c                       s@   e Zd ZdZdeddf fddZdejdejfdd	Z  Z	S )
EomtEmbeddingszM
    Construct the CLS token, mask token, position and patch embeddings.
    r   r/   Nc                    s   t    || _|j| _ttdd|j| _	tt
d|j|j| _t|| _| jj}t|j| _d|j | _t||j| _| jdt|ddd d S )Nr   position_ids)r   r=   F)
persistent)rZ   r[   r   r  r   	Parameterr&   randnr  	cls_tokenr   num_register_tokensregister_tokensr
  patch_embeddingsr  Dropouthidden_dropout_probdropoutnum_prefix_tokens	Embeddingposition_embeddingsr   r   expand)r]   r   r  r^   r+   r,   r[     s   

 zEomtEmbeddings.__init__r  c                 C   s~   |j \}}}}| jjjj}| |j|d}| j|dd}| j|dd}|| 	| j
 }tj|||gdd}| |}|S )Nrf   r=   r   r   )rI   r$  r  r   rg   rp   r!  r+  r#  r*  r  r&   r   r'  )r]   r  rz   r   target_dtyper  
cls_tokensr#  r+   r+   r,   r     s   
zEomtEmbeddings.forward)
r"   r#   r$   r%   r   r[   r&   r   r   r   r+   r+   r^   r,   r    s    r          modulequeryr   r   attention_maskscalingr'  c           
      K   s|   t ||dd| }|d ur|| }tjj|dt jd|j}tjj	||| j
d}t ||}	|	dd }	|	|fS )Nr=   )r2   rg   )ptrainingr   r0   )r&   r@   r   r   r4   ro   float32rp   rg   r'  r5  
contiguous)
r/  r0  r   r   r1  r2  r'  r8   attn_weightsattn_outputr+   r+   r,   eager_attention_forward  s   
r:  c                
       sR   e Zd ZdZ fddZ	d
dejdeej deejeej f fdd	Z	  Z
S )EomtAttentionz=Multi-headed attention from 'Attention Is All You Need' paperc                    s   t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	|j
| _d| _t| j| j| _t| j| j| _t| j| j| _t| j| j| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      F)rZ   r[   r   r  	embed_dimnum_attention_heads	num_headshead_dimr\   scaleattention_dropoutr'  	is_causalr   Lineark_projv_projq_projout_projr]   r   r^   r+   r,   r[     s$   

zEomtAttention.__init__Nr   r1  r/   c              
   K   s   |j \}}}| |}| |}| |}	|||| j| jdd}|||| j| jdd}|	||| j| jdd}	t}
| j	j
dkrMt| j	j
 }
|
| |||	|| j| j| js\dn| jd\}}|||| }| |}||fS )z#Input shape: Batch x Time x Channelr   r0   eagerr.  )rB  r2  r'  )rI   rF  rD  rE  r   r>  r?  r   r:  r   _attn_implementationr   rB  r@  r5  r'  reshaper7  rG  )r]   r   r1  r8   rz   
seq_lengthr<  querieskeysvaluesattention_interfacer9  r8  r+   r+   r,   r     s.   




zEomtAttention.forwardr   )r"   r#   r$   r%   r[   r&   r   r   r)   r   r   r+   r+   r^   r,   r;    s    r;  c                       4   e Zd Zd fddZdejdejfddZ  ZS )	EomtLayerScaler/   Nc                    s(   t    t|jt|j | _d S r   )	rZ   r[   r   r  layerscale_valuer&   r   r  lambda1rH  r^   r+   r,   r[     s   
zEomtLayerScale.__init__hidden_statec                 C   s
   || j  S r   )rT  r]   rU  r+   r+   r,   r   #  s   
zEomtLayerScale.forwardr/   Nr"   r#   r$   r[   r&   r   r   r   r+   r+   r^   r,   rR    s    rR  input	drop_probr5  c                 C   sd   |dks|s| S d| }| j d fd| jd   }|tj|| j| jd }|  | || }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    r.  r   r   )r   r   )rI   ndimr&   rq   rg   rc   floor_div)rY  rZ  r5  	keep_probrI   random_tensoroutputr+   r+   r,   	drop_path'  s   
ra  c                       sT   e Zd ZdZddee ddf fddZdejdejfdd	Z	de
fd
dZ  ZS )EomtDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).NrZ  r/   c                    s   t    || _d S r   )rZ   r[   rZ  )r]   rZ  r^   r+   r,   r[   >  s   

zEomtDropPath.__init__r   c                 C   s   t || j| jS r   )ra  rZ  r5  r]   r   r+   r+   r,   r   B  s   zEomtDropPath.forwardc                 C   s   d| j  S )Nzp=)rZ  r   r+   r+   r,   
extra_reprE     zEomtDropPath.extra_reprr   )r"   r#   r$   r%   r   r   r[   r&   r   r   r  rd  r   r+   r+   r^   r,   rb  ;  s
    rb  c                       rQ  )	EomtMLPr/   Nc                    sn   t    |j }}t|j|j }tj||dd| _t|j	t
r(t|j	 | _n|j	| _tj||dd| _d S )NTbias)rZ   r[   r  r   	mlp_ratior   rC  fc1r  
hidden_actr  r   
activationfc2r]   r   in_featuresout_featureshidden_featuresr^   r+   r,   r[   J  s   

zEomtMLP.__init__rU  c                 C   s"   |  |}| |}| |}|S r   )rj  rl  rm  rV  r+   r+   r,   r   U  s   


zEomtMLP.forwardrW  rX  r+   r+   r^   r,   rf  I  s    rf  c                       rQ  )	EomtSwiGLUFFNr/   Nc                    sl   t    |j }}t|j|j }t|d d d d d }tj|d| dd| _tj||dd| _d S )Nr0   r         Trg  )	rZ   r[   r  r   ri  r   rC  
weights_inweights_outrn  r^   r+   r,   r[   ]  s   

zEomtSwiGLUFFN.__init__rU  c                 C   s6   |  |}|jddd\}}tj|| }| |S )Nr0   r=   r   )ru  chunkr   r4   silurv  )r]   rU  x1x2hiddenr+   r+   r,   r   f  s   

zEomtSwiGLUFFN.forwardrW  rX  r+   r+   r^   r,   rr  \  s    	rr  c                       sN   e Zd ZdZdeddf fddZ	ddejdeej dejfd	d
Z	  Z
S )	EomtLayerzCThis corresponds to the Block class in the original implementation.r   r/   Nc                    s   t    tj|j|jd| _t|| _t	|| _
|jdkr#t|jnt | _tj|j|jd| _|jr;t|| _nt|| _t	|| _d S )Nepsr.  )rZ   r[   r   	LayerNormr  layer_norm_epsnorm1r;  	attentionrR  layer_scale1drop_path_raterb  Identityra  norm2use_swiglu_ffnrr  mlprf  layer_scale2rH  r^   r+   r,   r[   p  s   



zEomtLayer.__init__r   	head_maskc                 C   sb   |  |}| ||\}}| |}| || }| |}| |}| |}| || }|S r   )r  r  r  ra  r  r  r  )r]   r   r  hidden_states_normself_attention_outputr   layer_outputr+   r+   r,   r     s   




zEomtLayer.forwardr   )r"   r#   r$   r%   r   r[   r&   r   r   r   r   r+   r+   r^   r,   r|  m  s    r|  c                       s4   e Zd Zd	 fdd	ZdejdejfddZ  ZS )
EomtLayerNorm2dư>Tc                    s   t  j|||d d S )N)r~  elementwise_affine)rZ   r[   )r]   r  r~  affiner^   r+   r,   r[     s   zEomtLayerNorm2d.__init__rU  r/   c                 C   s>   | dddd}t|| j| j| j| j}| dddd}|S )Nr   r0   r   r   )permuteF
layer_normnormalized_shaper   rh  r~  rV  r+   r+   r,   r     s   zEomtLayerNorm2d.forward)r  TrX  r+   r+   r^   r,   r    s    r  c                       8   e Zd Zdef fddZdejdejfddZ  ZS )EomtScaleLayerr   c                    sV   t    |j}tj||ddd| _t|j | _tj	||dd|dd| _
t|| _d S )Nr0   r  r   r   F)r  paddinggroupsrh  )rZ   r[   r  r   ConvTranspose2dconv1r   rk  rl  r  conv2r  layernorm2dr]   r   r  r^   r+   r,   r[     s   
	zEomtScaleLayer.__init__r   r/   c                 C   s,   |  |}| |}| |}| |}|S r   )r  rl  r  r  rc  r+   r+   r,   r     s
   



zEomtScaleLayer.forward	r"   r#   r$   r   r[   r&   r   r   r   r+   r+   r^   r,   r    s    r  c                       r  )EomtScaleBlockr   c                    s6   t     j| _t fddt| jD | _d S )Nc                       g | ]}t  qS r+   )r  rj   r   r   r+   r,   rm     r   z+EomtScaleBlock.__init__.<locals>.<listcomp>)rZ   r[   num_upscale_blocks
num_blocksr   
ModuleListrn   blockrH  r^   r  r,   r[     s   
$zEomtScaleBlock.__init__r   r/   c                 C   s   | j D ]}||}q|S r   )r  )r]   r   r  r+   r+   r,   r     s   

zEomtScaleBlock.forwardr  r+   r+   r^   r,   r    s    r  c                       r  )EomtMaskHeadr   c                    sJ   t    |j}t||| _t||| _t||| _t|j	 | _
d S r   )rZ   r[   r  r   rC  rj  rm  fc3r   rk  rl  r  r^   r+   r,   r[     s   
zEomtMaskHead.__init__r   r/   c                 C   s.   |  | |}|  | |}| |}|S r   )rl  rj  rm  r  rc  r+   r+   r,   r     s   
zEomtMaskHead.forwardr  r+   r+   r^   r,   r    s    	r  c                   @   sN   e Zd ZU dZeed< dZdZdZdgZ	dZ
eedZd	ejd
dfddZdS )EomtPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    r   eomtr  Fr|  T)r   r    r/  r/   Nc                 C   sl  | j j}t|tjtjtjfrDtjj|j	t
dd |jd urBtj|j	\}}|dkr4dt
| nd}tj|j| | d S d S t|tjrY|j	jd |jj  d S t|tjrz|j	jjddd |jd urx|j	j|j   d S d S t|trt|dr|jj| j j d S d S t|trtjj|jjtjd|d|jj |j_|j!j  d S d S )	N   )ar   r   r1   r.  )r   stdrT  )"r   initializer_ranger  r   rC  r  r  initkaiming_uniform_r   mathsqrtrh  _calculate_fan_in_and_fan_outuniform_r  datafill_zero_r)  normal_padding_idxrR  hasattrrT  rS  r  trunc_normal_r!  rp   r&   r6  rg   r#  )r]   r/  r  fan_inr   boundr+   r+   r,   _init_weights  s8   





z!EomtPreTrainedModel._init_weights)r"   r#   r$   r%   r   r(   base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpar|  r;  _can_record_outputsr   Moduler  r+   r+   r+   r,   r    s   
 r  zV
    The EoMT Model with head on top for instance/semantic/panoptic segmentation.
    c                       s   e Zd ZdZdef fddZdedededed	eeef d
eeef fddZ	deeef d
efddZ
ee			ddedeee  deee  deee  dee d
efddZdd ZdejfddZedd Z  ZS )EomtForUniversalSegmentationr  r   c                    s   t     | _ j| _t | _tj j j	d| _
t j j| _t fddt jD | _t | _t | _t j jd | _ j j  j j f| _ j j jd| _t | jd| _ | !dt"# j$ | %  d S )Nr}  c                    r  r+   )r|  r  r  r+   r,   rm     r   z9EomtForUniversalSegmentation.__init__.<locals>.<listcomp>r   )r   r   r   )r   r   attn_mask_probs)&rZ   r[   r   num_hidden_layersr  r  r   r  r  r  	layernormr)  r   r0  r  rn   layersr  upscale_blockr  	mask_headrC  r   class_predictorr  r  	grid_sizer   r   r   r   r   rN   r   r&   r   r  	post_initrH  r^   r  r,   r[     s$   
 

z%EomtForUniversalSegmentation.__init__r   r   r`   ra   r   r/   c                 C   sN   | j |||||d}| j D ]\}}| D ]\}	}
||	v r#|
|9 }
qq|S )Nr   r   r`   ra   r   )rN   r   r   )r]   r   r   r`   ra   r   r   r   r   loss_keyr   r+   r+   r,   get_loss_dict'  s   	z*EomtForUniversalSegmentation.get_loss_dictr   c                 C   s   t | S r   )rB   rO  )r]   r   r+   r+   r,   get_loss?  re  z%EomtForUniversalSegmentation.get_lossNr!   r8   c                 K   sr  d\}}d}|du rt d| |}	t| jD ]\}
}|
| j| jj krG| jjdddddf 	|	j
d dd|	j}tj||	fdd}	|
| j| jj kr| jsa| j|
| j | jj  dkr| |	}| |\}}||f7 }||f7 }tj|	j
d |	j
d |	j
d |	jtjd}tj|| jd	d
}||d|dd}| jj}|| jj }|dk|ddd||df< | j|| j|
| j | jj  |||jd}|ddddf 	d| jjdd}| | d}||	|}	q| |	}| |\}}||f7 }||f7 }d}|dur0|dur0d}t ||D ]\}}| j!||||dd}|| "|7 }qt#|||||dS )ag  
        mask_labels (`list[torch.Tensor]`, *optional*):
            list of mask labels of shape `(num_labels, height, width)` to be fed to a model
        class_labels (`list[torch.LongTensor]`, *optional*):
            list of target class labels of shape `(num_labels, height, width)` to be fed to a model. They identify the
            labels of `mask_labels`, e.g. the label of `mask_labels[i][j]` if `class_labels[i][j]`.
        patch_offsets (`list[torch.Tensor]`, *optional*):
            list of tuples indicating the image index and start and end positions of patches for semantic segmentation.
        )r+   r+   Nz You have to specify pixel_valuesr   r=   r   r   )rc   rg   bilinear)sizemode)probnum_query_tokensencoder_start_tokensrc   .g    er.  r  )r   r   r   r   r!   )$r\   r  r   r  r  r   r  r0  r   r+  rI   rp   rc   r&   r   r5  r  r  predictr   r   r  interpolater  r   r  r   r(  _disable_attention_maskr=  r   masked_fillr   r  r  r   )r]   r  r`   ra   r!   r8   masks_queries_logits_per_layerclass_queries_logits_per_layerr1  r   r   layer_moduler0  norm_hidden_statesr   r   interpolated_logitsr  r  sequence_outputr   r   r+   r+   r,   r   B  s   
2


"	


z$EomtForUniversalSegmentation.forwardc                 C   s   | j jS r   )r  r$  r   r+   r+   r,   get_input_embeddings  s   z1EomtForUniversalSegmentation.get_input_embeddingsr   c                 C   s   |d d d | j jd d f }| |}|d d | j j| jj d d d f }|dd}|j|jd dg| jR  }| 	|}| 
|}td||}||fS )Nr   r0   r   r=   zbqc, bchw -> bqhw)r   r   r  r  r(  r   rK  rI   r  r  r  r&   einsum)r]   r   query_tokensclass_logitsprefix_tokensmask_logitsr+   r+   r,   r    s   
&

z$EomtForUniversalSegmentation.predictc                 C   sD   |dk r t j| jd ||d|k}d| d d d ||d f |< | S )Nr   r   rb   )r&   rq   rI   )	attn_maskr  r  r  rc   random_queriesr+   r+   r,   r    s   z4EomtForUniversalSegmentation._disable_attention_mask)NNN)r"   r#   r$   r  r   r[   r   r  r  r  r  r   r   r   r*   r   r   r   r   r  r&   r  staticmethodr  r   r+   r+   r^   r,   r    sN    





gr  )F)r.  )r.  F)Icollections.abcr  r  dataclassesr   typingr   r   numpyr  r&   torch.nn.functionalr   r4   r  r   activationsr   
file_utilsr	   r
   r   modeling_layersr   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   configuration_eomtr   scipy.optimizer   
accelerater   accelerate.utilsr   r   r:   rE   rS   r  rT   r   r   r   r   r
  r  r   r:  r;  rR  r   ra  rb  rf  rr  r|  r  r  r  r  r  r  r  __all__r+   r+   r+   r,   <module>   s   
!
 j   :!,
> 	** C