o
    ei                     @   sL  d dl Zd dlZd dl mZ d dlmZ d dlZd dlZd dl	m
  mZ d dlmZm
Z
 ddlmZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZm Z m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z' e r|d dl(m)Z) e! rd dl*m+Z+ d dl,m-Z- ee ddG dd deZ.	dYdejdejdejfddZ/dededefd d!Z0dejdejdejfd"d#Z1G d$d% d%e
j2Z3deded&e4defd'd(Z5dejdejd&e4dejfd)d*Z6G d+d, d,e
j2Z7G d-d. d.e
j2Z8G d/d0 d0e
j2Z9	1dZd2e
j2d3ejd4ejd5ejd6ejdB d7e:d8e:fd9d:Z;G d;d< d<e
j2Z<G d=d> d>e
j2Z=d[d?ejd@e:dAe>dejfdBdCZ?G dDdE dEe
j2Z@G dFdG dGe
j2ZAG dHdI dIe
j2ZBG dJdK dKeZCG dLdM dMe
jDZEG dNdO dOe
j2ZFG dPdQ dQe
j2ZGG dRdS dSe
j2ZHe G dTdU dUeZIe dVdG dWdX dXeIZJdUdXgZKdS )\    N)Callable)	dataclass)Tensornn   )initialization)ACT2FN)ModelOutputis_scipy_availablerequires_backends)GradientCheckpointingLayer)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringis_accelerate_available)merge_with_config_defaults)capture_outputs   )
EomtConfig)linear_sum_assignment)PartialState)reducea  
    Class for outputs of [`EomtForUniversalSegmentationOutput`].

    This output can be directly passed to [`~EomtImageProcessor.post_process_semantic_segmentation`] or
    [`~EomtImageProcessor.post_process_instance_segmentation`] or
    [`~EomtImageProcessor.post_process_panoptic_segmentation`] to compute final segmentation maps. Please, see
    [`~EomtImageProcessor] for details regarding usage.
    )custom_introc                   @   s   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	ejdB ed< dZ
ejdB ed< dZeej dB ed< dZeej dB ed< dZeej dB ed	< dS )
"EomtForUniversalSegmentationOutputa*  
    loss (`torch.Tensor`, *optional*):
        The computed loss, returned when labels are present.
    class_queries_logits (`torch.FloatTensor`):
        A tensor of shape `(batch_size, num_queries, num_labels + 1)` representing the proposed classes for each
        query. Note the `+ 1` is needed because we incorporate the null class.
    masks_queries_logits (`torch.FloatTensor`):
        A tensor of shape `(batch_size, num_queries, height, width)` representing the proposed masks for each
        query.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Last hidden states (final feature map) of the last layer.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, sequence_length, hidden_size)`. Hidden-states all layers of the model.
    attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `tuple(torch.FloatTensor)` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Self and Cross Attentions weights from transformer decoder.
    patch_offsets (`list[torch.Tensor]`, *optional*):
        list of tuples indicating the image index and start and end positions of patches for semantic segmentation.
    Nlossclass_queries_logitsmasks_queries_logitslast_hidden_statehidden_states
attentionspatch_offsets)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   r   r   r    tupler!   r"   listr    r,   r,   d/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/eomt/modeling_eomt.pyr   3   s   
 r   Finput_featurespoint_coordinatesreturnc                 K   sL   |  dkrd}|d}tjjj| d| d fi |}|r$|d}|S )a(  
    A wrapper around `torch.nn.functional.grid_sample` to support 3D point_coordinates tensors.

    Args:
        input_features (`torch.Tensor` of shape (batch_size, channels, height, width)):
            A tensor that contains features map on a height * width grid
        point_coordinates (`torch.Tensor` of shape (batch_size, num_points, 2) or (batch_size, grid_height, grid_width,:
        2)):
            A tensor that contains [0, 1] * [0, 1] normalized point coordinates
        add_dim (`bool`):
            boolean value to keep track of added dimension

    Returns:
        point_features (`torch.Tensor` of shape (batch_size, channels, num_points) or (batch_size, channels,
        height_grid, width_grid):
            A tensor that contains features for points in `point_coordinates`.
    r   T   g       @      ?)dim	unsqueezer'   r   
functionalgrid_samplesqueeze)r.   r/   add_dimkwargspoint_featuresr,   r,   r-   sample_point^   s   
 
r;   inputslabelsc                 C   sd   |   d} dt| |j }| ddddf |ddddf  }d|d |d   }|S )a  
    A pair wise version of the dice loss, see `dice_loss` for usage.

    Args:
        inputs (`torch.Tensor`):
            A tensor representing a mask
        labels (`torch.Tensor`):
            A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
            (0 for the negative class and 1 for the positive class).

    Returns:
        `torch.Tensor`: The computed loss between each pairs.
    r   r1   N)sigmoidflattenr'   matmulTsum)r<   r=   	numeratordenominatorr   r,   r,   r-   pair_wise_dice_loss~   s
   ,rF   c           	      C   sj   | j d }tjdd}|| t| }|| t| }t|| |j}t|| d| j}|| }|S )a  
    A pair wise version of the cross entropy loss, see `sigmoid_cross_entropy_loss` for usage.

    Args:
        inputs (`torch.Tensor`):
            A tensor representing a mask.
        labels (`torch.Tensor`):
            A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
            (0 for the negative class and 1 for the positive class).

    Returns:
        loss (`torch.Tensor`): The computed loss between each pairs.
    r   none	reduction)shaper   BCEWithLogitsLossr'   	ones_like
zeros_likerA   rB   )	r<   r=   height_and_width	criterioncross_entropy_loss_poscross_entropy_loss_negloss_posloss_negr   r,   r,   r-   $pair_wise_sigmoid_cross_entropy_loss   s   
rT   c                       sl   e Zd ZdZ	ddedededef fdd	Ze d
ej	dej	dej	dej	de
ee	  f
ddZ  ZS )EomtHungarianMatcheraq  This class computes an assignment between the labels and the predictions of the network.

    For efficiency reasons, the labels don't include the no_object. Because of this, in general, there are more
    predictions than labels. In this case, we do a 1-to-1 matching of the best predictions, while the others are
    un-matched (and thus treated as non-objects).
    r2    1  
cost_class	cost_mask	cost_dice
num_pointsc                    sF   t    |dkr|dkr|dkrtd|| _|| _|| _|| _dS )aH  Creates the matcher

        Params:
            cost_class (`float`, *optional*, defaults to 1.0):
                Relative weight of the classification error in the matching cost.
            cost_mask (`float`, *optional*,  defaults to 1.0):
                This is the relative weight of the focal loss of the binary mask in the matching cost.
            cost_dice (`float`, *optional*, defaults to 1.0):
                This is the relative weight of the dice loss of the binary mask in the matching cost.
            num_points (`int`, *optional*, defaults to 12544):
                No. of points to sample on which the mask loss will be calculated. The same set of K points are
                uniformly sampled for all prediction and ground truth masks to construct the cost matrix for bipartite
                matching.
        r   zAll costs can't be 0N)super__init__
ValueErrorrZ   rW   rX   rY   )selfrW   rX   rY   rZ   	__class__r,   r-   r\      s   

zEomtHungarianMatcher.__init__r   r   mask_labelsclass_labelsr0   c                 C   sf  g }|j d }t|D ]}|| d}|| }	|dd|| f  }
|| |	}|dddf }|	dddf }	tjd| jd|	jd}||j d dd}t	||dd
d}||	j d dd}t	|	|dd
d}	t|	|}t|	|}| j| | j|
  | j|  }t|td	}t|td
}t|d}t| }|| qdd |D }|S )ao  
        Params:
            masks_queries_logits (`torch.Tensor`):
                A tensor of dim `batch_size, num_queries, num_labels` with the classification logits.
            class_queries_logits (`torch.Tensor`):
                A tensor of dim `batch_size, num_queries, height, width` with the predicted masks.
            class_labels (`torch.Tensor`):
                A tensor of dim `num_target_boxes` (where num_target_boxes is the number of ground-truth objects in the
                target) containing the class labels.
            mask_labels (`torch.Tensor`):
                A tensor of dim `num_target_boxes, height, width` containing the target masks.

        Returns:
            matched_indices (`list[tuple[Tensor]]`): A list of size batch_size, containing tuples of (index_i, index_j)
            where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected labels (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes).
        r   r>   Nr   r1   deviceFalign_cornersg    _Bg    _c                 S   s0   g | ]\}}t j|t jd t j|t jd fqS )dtype)r'   	as_tensorint64).0ijr,   r,   r-   
<listcomp>  s    $z0EomtHungarianMatcher.forward.<locals>.<listcomp>)rJ   rangesoftmaxtor'   randrZ   rd   repeatr;   r7   rT   rF   rX   rW   rY   minimumtensormaximum
nan_to_numr   cpuappend)r^   r   r   ra   rb   indices
batch_sizerl   
pred_probs	pred_maskrW   target_maskr/   target_coordinatespred_coordinatesrX   rY   cost_matrixassigned_indicesmatched_indicesr,   r,   r-   forward   s4   


zEomtHungarianMatcher.forward)r2   r2   r2   rV   )r#   r$   r%   r&   floatintr\   r'   no_gradr   r+   r*   r   __classcell__r,   r,   r_   r-   rU      s0    
rU   	num_masksc                 C   sX   |   d}d|| d }|d|d }d|d |d   }| | }|S )a4  
    Compute the DICE loss, similar to generalized IOU for masks as follows:

    $$ \mathcal{L}_{\text{dice}(x, y) = 1 - \frac{2 * x \cap y }{x \cup y + 1}} $$

    In practice, since `labels` is a binary mask, (only 0s and 1s), dice can be computed as follow

    $$ \mathcal{L}_{\text{dice}(x, y) = 1 - \frac{2 * x * y }{x + y + 1}} $$

    Args:
        inputs (`torch.Tensor`):
            A tensor representing a mask.
        labels (`torch.Tensor`):
            A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
            (0 for the negative class and 1 for the positive class).
        num_masks (`int`):
            The number of masks present in the current batch, used for normalization.

    Returns:
        `torch.Tensor`: The computed loss.
    r   r1   r>   )r?   r@   rC   )r<   r=   r   probsrD   rE   r   r,   r,   r-   	dice_loss  s   r   c                 C   s,   t jdd}|| |}|d | }|S )a|  
    Args:
        inputs (`torch.Tensor`):
            A float tensor of arbitrary shape.
        labels (`torch.Tensor`):
            A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
            (0 for the negative class and 1 for the positive class).

    Returns:
        loss (`torch.Tensor`): The computed loss.
    rG   rH   r   )r   rK   meanrC   )r<   r=   r   rO   cross_entropy_lossr   r,   r,   r-   sigmoid_cross_entropy_loss8  s   
r   c                       s~  e Zd Zdedeeef f fddZdeee	  dee	 fddZ
d	ee deeef fd
dZdedee deej deeef fddZdejdeej deej de	deeejf f
ddZdd Zdd ZdejdejfddZdejde	de	dedejf
d d!Z	"d)dejdejdeej deej d#eeejf d"B deeejf fd$d%Zdejd&ejdejfd'd(Z  ZS )*EomtLossconfigweight_dictc                    s   t    t| dg |j| _|| _|j| _t| jd }| j|d< | 	d| |j
| _|j| _|j| _t|j|j|j| jd| _dS )aH  
        The Eomt Loss. The loss is computed very similar to DETR. The process happens in two steps: 1) we
        compute hungarian assignment between ground truth masks and the outputs of the model 2) we supervise each pair
        of matched ground-truth / prediction (supervise class and mask)

        Args:
            config (`EomtConfig`):
                The configuration for Eomt model also containing loss calculation specific parameters.
            weight_dict (`dict[str, float]`):
                A dictionary of weights to be applied to the different losses.
        scipyr   r>   empty_weight)rW   rY   rX   rZ   N)r[   r\   r   
num_labelsr   no_object_weighteos_coefr'   onesregister_buffertrain_num_pointsrZ   oversample_ratioimportance_sample_ratiorU   class_weightdice_weightmask_weightmatcher)r^   r   r   r   r_   r,   r-   r\   M  s"   

zEomtLoss.__init__sizesr0   c                 C   sB   |d }|dd  D ]}t |D ]\}}t|| |||< qq
|S )Nr   r   )	enumeratemax)r^   r   maxessublistindexitemr,   r,   r-   _max_by_axisp  s   zEomtLoss._max_by_axistensorsc                 C   s   |  dd |D }t|g| }|\}}}}|d j}|d j}	tj|||	d}
tj|||ftj|	d}t||
|D ].\}}}|d |j	d d |j	d d |j	d f 
| d|d |j	d d |j	d f< q;|
|fS )Nc                 S   s   g | ]}t |jqS r,   )r+   rJ   )rk   ru   r,   r,   r-   rn   z  s    z8EomtLoss._pad_images_to_max_in_batch.<locals>.<listcomp>r   rh   rd   r   r1   F)r   lenrh   rd   r'   zerosr   boolziprJ   copy_)r^   r   max_sizebatch_shaper{   _heightwidthrh   rd   padded_tensorspadding_masksru   padded_tensorpadding_maskr,   r,   r-   _pad_images_to_max_in_batchx  s   

2"z$EomtLoss._pad_images_to_max_in_batchr   rb   rz   c                 C   s   |}|j \}}}tj| jd}| |}	tdd t||D }
tj||f| j	tj
|jd}|
||	< |dd}|||}d|i}|S )a  Compute the losses related to the labels using cross entropy.

        Args:
            class_queries_logits (`torch.Tensor`):
                A tensor of shape `batch_size, num_queries, num_labels`
            class_labels (`list[torch.Tensor]`):
                List of class labels of shape `(labels)`.
            indices (`tuple[np.array])`:
                The indices computed by the Hungarian matcher.

        Returns:
            `dict[str, Tensor]`: A dict of `torch.Tensor` containing the following key:
            - **loss_cross_entropy** -- The loss computed using cross entropy on the predicted and ground truth labels.
        )weightc                 S   s   g | ]
\}\}}|| qS r,   r,   )rk   targetr   rm   r,   r,   r-   rn     s    z(EomtLoss.loss_labels.<locals>.<listcomp>)
fill_valuerh   rd   r   r1   loss_cross_entropy)rJ   r   CrossEntropyLossr   $_get_predictions_permutation_indicesr'   catr   fullr   rj   rd   	transpose)r^   r   rb   rz   pred_logitsr{   num_queriesr   rO   idxtarget_classes_otarget_classespred_logits_transposedloss_celossesr,   r,   r-   loss_labels  s   

zEomtLoss.loss_labelsr   ra   r   c                    s     |} |}|| } |\}}	|| }|dddf }|dddf }t "  | fdd j j j}
t	||
dd
d}W d   n1 sRw   Y  t	||
dd
d}t|||t|||d}~~|S )a  Compute the losses related to the masks using sigmoid_cross_entropy_loss and dice loss.

        Args:
            masks_queries_logits (`torch.Tensor`):
                A tensor of shape `(batch_size, num_queries, height, width)`.
            mask_labels (`torch.Tensor`):
                List of mask labels of shape `(labels, height, width)`.
            indices (`tuple[np.array])`:
                The indices computed by the Hungarian matcher.
            num_masks (`int)`:
                The number of masks, used for normalization.

        Returns:
            losses (`dict[str, Tensor]`): A dict of `torch.Tensor` containing two keys:
            - **loss_mask** -- The loss computed using sigmoid cross entropy loss on the predicted and ground truth.
              masks.
            - **loss_dice** -- The loss computed using dice loss on the predicted on the predicted and ground truth,
              masks.
        Nc                    s
     | S N)calculate_uncertainty)logitsr^   r,   r-   <lambda>  s   
 z%EomtLoss.loss_masks.<locals>.<lambda>Fre   r   )	loss_mask	loss_dice)r    _get_targets_permutation_indicesr   r'   r   sample_points_using_uncertaintyrZ   r   r   r;   r7   r   r   )r^   r   ra   rz   r   src_idxtgt_idx
pred_maskstarget_masksr   r/   point_labelspoint_logitsr   r,   r   r-   
loss_masks  s0   





zEomtLoss.loss_masksc                 C   4   t dd t|D }t dd |D }||fS )Nc                 S   s    g | ]\}\}}t ||qS r,   r'   	full_like)rk   rl   srcr   r,   r,   r-   rn          zAEomtLoss._get_predictions_permutation_indices.<locals>.<listcomp>c                 S   s   g | ]\}}|qS r,   r,   )rk   r   r   r,   r,   r-   rn         r'   r   r   )r^   rz   batch_indicespredictions_indicesr,   r,   r-   r        z-EomtLoss._get_predictions_permutation_indicesc                 C   r   )Nc                 S   s    g | ]\}\}}t ||qS r,   r   )rk   rl   r   tgtr,   r,   r-   rn     r   z=EomtLoss._get_targets_permutation_indices.<locals>.<listcomp>c                 S   s   g | ]\}}|qS r,   r,   )rk   r   r   r,   r,   r-   rn     r   r   )r^   rz   r   target_indicesr,   r,   r-   r     r   z)EomtLoss._get_targets_permutation_indicesr   c                 C   s   t | }|S )a  
        In Eomt paper, uncertainty is estimated as L1 distance between 0.0 and the logit prediction in 'logits'
        for the foreground class in `classes`.

        Args:
            logits (`torch.Tensor`):
            A tensor of shape (R, 1, ...) for class-specific or class-agnostic, where R is the total number of predicted masks in all images and C is:
            the number of foreground classes. The values are logits.

        Returns:
            scores (`torch.Tensor`): A tensor of shape (R, 1, ...) that contains uncertainty scores with the most
            uncertain locations having the highest uncertainty score.
        )r'   abs)r^   r   uncertainty_scoresr,   r,   r-   r     s   zEomtLoss.calculate_uncertaintyrZ   r   r   c                 C   s   |j d }t|| }tj||d|jd}t||dd}	||	}
t|| }|| }tj|
dddddf |ddd }|tj|tj|jd	 }||dddf 7 }|	d
d|	d
ddf 	||d}|dkr|tj
|tj||d|jdgdd}|S )a  
        This function is meant for sampling points in [0, 1] * [0, 1] coordinate space based on their uncertainty. The
        uncertainty is calculated for each point using the passed `uncertainty function` that takes points logit
        prediction as input.

        Args:
            logits (`float`):
                Logit predictions for P points.
            uncertainty_function:
                A function that takes logit predictions for P points and returns their uncertainties.
            num_points (`int`):
                The number of points P to sample.
            oversample_ratio (`int`):
                Oversampling parameter.
            importance_sample_ratio (`float`):
                Ratio of points that are sampled via importance sampling.

        Returns:
            point_coordinates (`torch.Tensor`):
                Coordinates for P sampled points.
        r   r1   rc   Fre   Nr   )kr3   r   r>   r3   )rJ   r   r'   rr   rd   r;   topkarangelongviewr   )r^   r   uncertainty_functionrZ   r   r   	num_boxesnum_points_sampledr/   r   point_uncertaintiesnum_uncertain_pointsnum_random_pointsr   shiftr,   r,   r-   r     s"   
&(z(EomtLoss.sample_points_using_uncertaintyNauxiliary_predictionsc                    s   |  ||||}| j||d jd}i | ||||| |||}|durPt|D ]$\ }	|	d }|	d }| ||||}
 fdd|
 D }
||
 q+|S )a  
        This performs the loss computation.

        Args:
            masks_queries_logits (`torch.Tensor`):
                A tensor of shape `(batch_size, num_queries, height, width)`.
            class_queries_logits (`torch.Tensor`):
                A tensor of shape `(batch_size, num_queries, num_labels)`.
            mask_labels (`torch.Tensor`):
                List of mask labels of shape `(labels, height, width)`.
            class_labels (`list[torch.Tensor]`):
                List of class labels of shape `(labels)`.
            auxiliary_predictions (`dict[str, torch.Tensor]`, *optional*):
                if `use_auxiliary_loss` was set to `true` in [`EomtConfig`], then it contains the logits from
                the inner layers of the EomtMaskedAttentionDecoder.

        Returns:
            losses (`dict[str, Tensor]`): A dict of `torch.Tensor` containing three keys:
            - **loss_cross_entropy** -- The loss computed using cross entropy on the predicted and ground truth labels.
            - **loss_mask** -- The loss computed using sigmoid cross_entropy loss on the predicted and ground truth
              masks.
            - **loss_dice** -- The loss computed using dice loss on the predicted on the predicted and ground truth
              masks.
            if `use_auxiliary_loss` was set to `true` in [`EomtConfig`], the dictionary contains additional
            losses for each auxiliary predictions.
        r   rc   Nr   r   c                    s    i | ]\}}| d   |qS )r   r,   )rk   keyvaluer   r,   r-   
<dictcomp>o  r   z$EomtLoss.forward.<locals>.<dictcomp>)	r   get_num_masksrd   r   r   r   r   itemsupdate)r^   r   r   ra   rb   r   rz   r   r   aux_outputs	loss_dictr,   r   r-   r   =  s   $zEomtLoss.forwardrd   c                 C   s^   t dd |D }tj|tj|d}d}t r$tji kr$t|}t j}tj	|| dd}|S )zk
        Computes the average number of target masks across the batch, for normalization purposes.
        c                 s   s    | ]}t |V  qd S r   )r   )rk   classesr,   r,   r-   	<genexpr>x  s    z)EomtLoss.get_num_masks.<locals>.<genexpr>r   r   )min)
rC   r'   ri   r   r   r   _shared_stater   num_processesclamp)r^   rb   rd   r   
world_sizer,   r,   r-   r   t  s   
zEomtLoss.get_num_masksr   )r#   r$   r%   r   dictstrr   r\   r+   r   r   r   r*   r   nparrayr   r'   r   r   r   r   r   r   rd   r   r   r,   r,   r_   r-   r   L  sj    #

"
>
=
$7r   c                       s6   e Zd ZdZ fddZdejdejfddZ  ZS )EomtPatchEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    s   t    |j|j}}|j|j}}t|tjj	r|n||f}t|tjj	r)|n||f}|d |d  |d |d   }|| _|| _|| _|| _
tj||||d| _d S )Nr   r   kernel_sizestride)r[   r\   
image_size
patch_sizenum_channelshidden_size
isinstancecollectionsabcIterablenum_patchesr   Conv2d
projection)r^   r   r  r  r  r  r  r_   r,   r-   r\     s   
 zEomtPatchEmbeddings.__init__pixel_valuesr0   c                 C   sH   |j d }|| jkrtd| j d| d| |ddd}|S )Nr   zoMake sure that the channel dimension of the pixel values match with the one set in the configuration. Expected z	 but got .r1   )rJ   r  r]   r  r@   r   )r^   r  r  
embeddingsr,   r,   r-   r     s   

zEomtPatchEmbeddings.forward)	r#   r$   r%   r&   r\   r'   r   r   r   r,   r,   r_   r-   r    s    r  c                       s@   e Zd ZdZdeddf fddZdejdejfdd	Z  Z	S )
EomtEmbeddingszM
    Construct the CLS token, mask token, position and patch embeddings.
    r   r0   Nc                    s   t    || _|j| _ttdd|j| _	tt
d|j|j| _t|| _| jj}t|j| _d|j | _t||j| _| jdt|ddd d S )Nr   position_idsr   r>   F)
persistent)r[   r\   r   r  r   	Parameterr'   randnr  	cls_tokenr   num_register_tokensregister_tokensr  patch_embeddingsr  Dropouthidden_dropout_probdropoutnum_prefix_tokens	Embeddingposition_embeddingsr   r   expand)r^   r   r  r_   r,   r-   r\     s   

 zEomtEmbeddings.__init__r  c                 C   s~   |j \}}}}| jjjj}| |j|d}| j|dd}| j|dd}|| 	| j
 }tj|||gdd}| |}|S )Nrg   r>   r   r   )rJ   r&  r  r   rh   rq   r#  r-  r%  r,  r  r'   r   r)  )r^   r  r{   r   target_dtyper  
cls_tokensr%  r,   r,   r-   r     s   
zEomtEmbeddings.forward
r#   r$   r%   r&   r   r\   r'   r   r   r   r,   r,   r_   r-   r    s    r          modulequeryr   r   attention_maskscalingr)  c           
      K   s|   t ||dd| }|d ur|| }tjj|dt jd|j}tjj	||| j
d}t ||}	|	dd }	|	|fS )Nr>   )r3   rh   )ptrainingr   r1   )r'   rA   r   r   r5   rp   float32rq   rh   r)  r8  
contiguous)
r2  r3  r   r   r4  r5  r)  r9   attn_weightsattn_outputr,   r,   r-   eager_attention_forward  s   
r=  c                
       sR   e Zd ZdZ fddZ	d
dejdejdB deejejdB f fdd	Z  Z	S )EomtAttentionz=Multi-headed attention from 'Attention Is All You Need' paperc                    s   t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	|j
| _d| _t| j| j| _t| j| j| _t| j| j| _t| j| j| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      F)r[   r\   r   r  	embed_dimnum_attention_heads	num_headshead_dimr]   scaleattention_dropoutr)  	is_causalr   Lineark_projv_projq_projout_projr^   r   r_   r,   r-   r\     s$   

zEomtAttention.__init__Nr    r4  r0   c              
   K   s   |j \}}}| |}| |}| |}	|||| j| jdd}|||| j| jdd}|	||| j| jdd}	t	| j
jt}
|
| |||	|| j| j| jsVdn| jd\}}|||| }| |}||fS )z#Input shape: Batch x Time x Channelr   r1   r1  )rE  r5  r)  )rJ   rI  rG  rH  r   rA  rB  r   r   get_interfacer   _attn_implementationr=  rE  rC  r8  r)  reshaper:  rJ  )r^   r    r4  r9   r{   
seq_lengthr?  querieskeysvaluesattention_interfacer<  r;  r,   r,   r-   r     s.   




zEomtAttention.forwardr   )
r#   r$   r%   r&   r\   r'   r   r*   r   r   r,   r,   r_   r-   r>    s    r>  c                       4   e Zd Zd fddZdejdejfddZ  ZS )	EomtLayerScaler0   Nc                    s(   t    t|jt|j | _d S r   )	r[   r\   r   r!  layerscale_valuer'   r   r  lambda1rK  r_   r,   r-   r\      s   
zEomtLayerScale.__init__hidden_statec                 C   s
   || j  S r   )rW  r^   rX  r,   r,   r-   r   $  s   
zEomtLayerScale.forwardr0   Nr#   r$   r%   r\   r'   r   r   r   r,   r,   r_   r-   rU    s    rU  input	drop_probr8  c                 C   sd   |dks|s| S d| }| j d fd| jd   }|tj|| j| jd }|  | || }|S )zc
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    r1  r   r   )r   r   )rJ   ndimr'   rr   rh   rd   floor_div)r\  r]  r8  	keep_probrJ   random_tensoroutputr,   r,   r-   	drop_path(  s   rd  c                       sT   e Zd ZdZddedB ddf fddZdejdejfdd	Zde	fd
dZ
  ZS )EomtDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr]  r0   c                    s   t    || _d S r   )r[   r\   r]  )r^   r]  r_   r,   r-   r\   :  s   

zEomtDropPath.__init__r    c                 C   s   t || j| jS r   )rd  r]  r8  r^   r    r,   r,   r-   r   >  s   zEomtDropPath.forwardc                 C   s   d| j  S )Nzp=)r]  r   r,   r,   r-   
extra_reprA     zEomtDropPath.extra_reprr   )r#   r$   r%   r&   r   r\   r'   r   r   r  rg  r   r,   r,   r_   r-   re  7  s
    re  c                       rT  )	EomtMLPr0   Nc                    sn   t    |j }}t|j|j }tj||dd| _t|j	t
r(t|j	 | _n|j	| _tj||dd| _d S )NTbias)r[   r\   r  r   	mlp_ratior   rF  fc1r  
hidden_actr  r   
activationfc2r^   r   in_featuresout_featureshidden_featuresr_   r,   r-   r\   F  s   

zEomtMLP.__init__rX  c                 C   s"   |  |}| |}| |}|S r   )rm  ro  rp  rY  r,   r,   r-   r   Q  s   


zEomtMLP.forwardrZ  r[  r,   r,   r_   r-   ri  E  s    ri  c                       rT  )	EomtSwiGLUFFNr0   Nc                    sl   t    |j }}t|j|j }t|d d d d d }tj|d| dd| _tj||dd| _d S )Nr1   r         Trj  )	r[   r\   r  r   rl  r   rF  
weights_inweights_outrq  r_   r,   r-   r\   Y  s   

zEomtSwiGLUFFN.__init__rX  c                 C   s6   |  |}|jddd\}}tj|| }| |S )Nr1   r>   r   )rx  chunkr   r5   silury  )r^   rX  x1x2hiddenr,   r,   r-   r   b  s   

zEomtSwiGLUFFN.forwardrZ  r[  r,   r,   r_   r-   ru  X  s    	ru  c                       sN   e Zd ZdZdeddf fddZ	ddejdejdB dejfd	d
Z  Z	S )	EomtLayerzCThis corresponds to the Block class in the original implementation.r   r0   Nc                    s   t    tj|j|jd| _t|| _t	|| _
|jdkr#t|jnt | _tj|j|jd| _|jr;t|| _nt|| _t	|| _d S )Nepsr1  )r[   r\   r   	LayerNormr  layer_norm_epsnorm1r>  	attentionrU  layer_scale1drop_path_ratere  Identityrd  norm2use_swiglu_ffnru  mlpri  layer_scale2rK  r_   r,   r-   r\   l  s   



zEomtLayer.__init__r    r4  c                 C   sb   |  |}| ||\}}| |}| || }| |}| |}| |}| || }|S r   )r  r  r  rd  r  r  r  )r^   r    r4  hidden_states_normself_attention_outputr   layer_outputr,   r,   r-   r   |  s   




zEomtLayer.forwardr   r0  r,   r,   r_   r-   r  i  s    r  c                       s4   e Zd Zd	 fdd	ZdejdejfddZ  ZS )
EomtLayerNorm2dư>Tc                    s   t  j|||d d S )N)r  elementwise_affine)r[   r\   )r^   r  r  affiner_   r,   r-   r\     s   zEomtLayerNorm2d.__init__rX  r0   c                 C   s>   | dddd}t|| j| j| j| j}| dddd}|S )Nr   r1   r   r   )permuteF
layer_normnormalized_shaper   rk  r  rY  r,   r,   r-   r     s   zEomtLayerNorm2d.forward)r  Tr[  r,   r,   r_   r-   r    s    r  c                       8   e Zd Zdef fddZdejdejfddZ  ZS )EomtScaleLayerr   c                    sV   t    |j}tj||ddd| _t|j | _tj	||dd|dd| _
t|| _d S )Nr1   r  r   r   F)r  paddinggroupsrk  )r[   r\   r  r   ConvTranspose2dconv1r   rn  ro  r  conv2r  layernorm2dr^   r   r  r_   r,   r-   r\     s   
	zEomtScaleLayer.__init__r    r0   c                 C   s,   |  |}| |}| |}| |}|S r   )r  ro  r  r  rf  r,   r,   r-   r     s
   



zEomtScaleLayer.forward	r#   r$   r%   r   r\   r'   r   r   r   r,   r,   r_   r-   r    s    r  c                       r  )EomtScaleBlockr   c                    s6   t     j| _t fddt| jD | _d S )Nc                       g | ]}t  qS r,   )r  rk   r   r   r,   r-   rn     r   z+EomtScaleBlock.__init__.<locals>.<listcomp>)r[   r\   num_upscale_blocks
num_blocksr   
ModuleListro   blockrK  r_   r  r-   r\     s   
$zEomtScaleBlock.__init__r    r0   c                 C   s   | j D ]}||}q|S r   )r  )r^   r    r  r,   r,   r-   r     s   

zEomtScaleBlock.forwardr  r,   r,   r_   r-   r    s    r  c                       r  )EomtMaskHeadr   c                    sJ   t    |j}t||| _t||| _t||| _t|j	 | _
d S r   )r[   r\   r  r   rF  rm  rp  fc3r   rn  ro  r  r_   r,   r-   r\     s   
zEomtMaskHead.__init__r    r0   c                 C   s.   |  | |}|  | |}| |}|S r   )ro  rm  rp  r  rf  r,   r,   r-   r     s   
zEomtMaskHead.forwardr  r,   r,   r_   r-   r    s    	r  c                   @   sZ   e Zd ZU dZeed< dZdZdZdZ	dgZ
dZeed	Ze d
ejddfddZdS )EomtPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    r   eomtr  )imageFr  T)r    r!   r2  r0   Nc                 C   s  | j j}t|tjtjtjfrCtj|j	t
dd |jd urAtjj|j	\}}|dkr4dt
| nd}t|j| | d S d S t|tjrWt|j	 t|j d S t|tjrtj|j	ddd |jd ur}t|j	ddst|j	|j  d S d S d S t|trt|d	rt|j| j j d S d S t|trtj|jd|d t|j t |j!t"|j!j#d
 $d d S t|t%rt&|j'd }|j(|d
< t |j)| d S t|t*rt|j+ d S d S )N   )ar   r   r1  )r   std_is_hf_initializedFrW  r>   r  ),r   initializer_ranger  r   rF  r  r  initkaiming_uniform_r   mathsqrtrk  r'   _calculate_fan_in_and_fan_outuniform_r  ones_zeros_r+  normal_padding_idxgetattrrU  hasattr	constant_rW  rV  r  trunc_normal_r#  r%  r   r  r   rJ   r-  r   r   r   r   r   EomtForUniversalSegmentationattn_mask_probs)r^   r2  r  fan_inr   boundr   r,   r,   r-   _init_weights  s>   



&


z!EomtPreTrainedModel._init_weights)r#   r$   r%   r&   r   r)   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpar  r>  _can_record_outputsr'   r   r   Moduler  r,   r,   r,   r-   r    s   
 r  zV
    The EoMT Model with head on top for instance/semantic/panoptic segmentation.
    c                       s   e Zd ZdZdef fddZdedededed	eeef d
eeef fddZ	deeef d
efddZ
eee			ddedee dB dee dB dee dB dee d
efddZdd ZdejfddZedd Z  ZS )r  r  r   c                    s   t     | _ j| _t | _tj j j	d| _
t j j| _t fddt jD | _t | _t | _t j jd | _ j j  j j f| _ j j jd| _t | jd| _ | !dt"# j$ | %  d S )Nr  c                    r  r,   )r  r  r  r,   r-   rn     r   z9EomtForUniversalSegmentation.__init__.<locals>.<listcomp>r   )r   r   r   )r   r   r  )&r[   r\   r   num_hidden_layersr  r  r   r  r  r  	layernormr+  r   r3  r  ro   layersr  upscale_blockr  	mask_headrF  r   class_predictorr  r  	grid_sizer   r   r   r   r   rO   r   r'   r   r  	post_initrK  r_   r  r-   r\     s$   
 

z%EomtForUniversalSegmentation.__init__r   r   ra   rb   r   r0   c                 C   sN   | j |||||d}| j D ]\}}| D ]\}	}
||	v r#|
|9 }
qq|S )Nr   r   ra   rb   r   )rO   r   r   )r^   r   r   ra   rb   r   r   r   r   loss_keyr   r,   r,   r-   get_loss_dict+  s   	z*EomtForUniversalSegmentation.get_loss_dictr   c                 C   s   t | S r   )rC   rR  )r^   r   r,   r,   r-   get_lossC  rh  z%EomtForUniversalSegmentation.get_lossNr"   r9   c                 K   sr  d\}}d}|du rt d| |}	t| jD ]\}
}|
| j| jj krG| jjdddddf 	|	j
d dd|	j}tj||	fdd}	|
| j| jj kr| jsa| j|
| j | jj  dkr| |	}| |\}}||f7 }||f7 }tj|	j
d |	j
d |	j
d |	jtjd}tj|| jd	d
}||d|dd}| jj}|| jj }|dk|ddd||df< | j|| j|
| j | jj  |||jd}|ddddf 	d| jjdd}| | d}||	|}	q| |	}| |\}}||f7 }||f7 }d}|dur0|dur0d}t ||D ]\}}| j!||||dd}|| "|7 }qt#|||||dS )ag  
        mask_labels (`list[torch.Tensor]`, *optional*):
            list of mask labels of shape `(num_labels, height, width)` to be fed to a model
        class_labels (`list[torch.LongTensor]`, *optional*):
            list of target class labels of shape `(num_labels, height, width)` to be fed to a model. They identify the
            labels of `mask_labels`, e.g. the label of `mask_labels[i][j]` if `class_labels[i][j]`.
        patch_offsets (`list[torch.Tensor]`, *optional*):
            list of tuples indicating the image index and start and end positions of patches for semantic segmentation.
        )r,   r,   Nz You have to specify pixel_valuesr   r>   r   r   )rd   rh   bilinear)sizemode)probnum_query_tokensencoder_start_tokensrd   .g    er1  r  )r   r   r   r   r"   )$r]   r  r   r  r  r   r  r3  r   r-  rJ   rq   rd   r'   r   r8  r  r  predictr   r   r  interpolater  r   r  r   r*  _disable_attention_maskr@  r   masked_fillr   r  r  r   )r^   r  ra   rb   r"   r9   masks_queries_logits_per_layerclass_queries_logits_per_layerr4  r    r   layer_moduler3  norm_hidden_statesr   r   interpolated_logitsr  r  sequence_outputr   r   r,   r,   r-   r   F  s   
2


"	


z$EomtForUniversalSegmentation.forwardc                 C   s   | j jS r   )r  r&  r   r,   r,   r-   get_input_embeddings  s   z1EomtForUniversalSegmentation.get_input_embeddingsr   c                 C   s   |d d d | j jd d f }| |}|d d | j j| jj d d d f }|dd}|j|jd dg| jR  }| 	|}| 
|}td||}||fS )Nr   r1   r   r>   zbqc, bchw -> bqhw)r   r   r  r  r*  r   rN  rJ   r  r  r  r'   einsum)r^   r   query_tokensclass_logitsprefix_tokensmask_logitsr,   r,   r-   r    s   
&

z$EomtForUniversalSegmentation.predictc                 C   sD   |dk r t j| jd ||d|k}d| d d d ||d f |< | S )Nr   r   rc   )r'   rr   rJ   )	attn_maskr  r  r  rd   random_queriesr,   r,   r-   r    s   z4EomtForUniversalSegmentation._disable_attention_mask)NNN)r#   r$   r%   r  r   r\   r   r  r  r  r  r   r   r   r+   r   r   r   r   r  r'   r  staticmethodr  r   r,   r,   r_   r-   r    sP    





gr  )F)r1  )r1  F)Lcollections.abcr  r  r   dataclassesr   numpyr	  r'   torch.nn.functionalr   r5   r  r    r   r  activationsr   
file_utilsr	   r
   r   modeling_layersr   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   utils.output_capturingr   configuration_eomtr   scipy.optimizer   
accelerater   accelerate.utilsr   r   r;   rF   rT   r  rU   r   r   r   r   r  r  r   r=  r>  rU  r   rd  re  ri  ru  r  r  r  r  r  r  r  r  __all__r,   r,   r,   r-   <module>   s   
!
 j   :!,
> 	*2 D