o
    ib                     @   s  d Z ddlZddlmZ ddlmZ ddlZddlm  m	Z
 ddlmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZ ddlmZmZmZmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z% e&e'Z(G dd de%Z)eeddG dd deZ*G dd de!Z+G dd deZ,G dd deZ-G dd de#Z.G d d! d!eZ/G d"d# d#eZ0G d$d% d%ej1Z2G d&d' d'ej3Z4G d(d) d)ej3Z5G d*d+ d+ej3Z6eG d,d- d-eZ7ed.dG d/d0 d0e Z8g d1Z9dS )2zPyTorch EoMT model.    N)	dataclass)Optional)Tensornn   )ACT2FN)ModelOutput)PreTrainedModel)Unpack)TransformersKwargsauto_docstringlogging)check_model_inputs   )Dinov2EmbeddingsDinov2LayerDinov2LayerScaleDinov2PatchEmbeddings)#Mask2FormerForUniversalSegmentationMask2FormerLoss)SiglipAttention)	ViTConfigc                       sx   e Zd ZdZdZ									
																		d dededededededef fddZ  ZS )!
EomtConfiga  
    This is the configuration class to store the configuration of a [`EomtForUniversalSegmentation`]. It is used to instantiate an EoMT model
    according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the EoMT
    [tue-mps/coco_panoptic_eomt_large_640](https://huggingface.co/tue-mps/coco_panoptic_eomt_large_640)
    architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimensionality of the hidden representations.
        num_hidden_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads in each attention layer.
        mlp_ratio (`int`, *optional*, defaults to 4):
            Ratio of the MLP hidden dimensionality to the hidden size.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
            The dropout probability for all fully connected layers in the embeddings and encoder.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        image_size (`int`, *optional*, defaults to 640):
            The size (resolution) of each input image.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        layerscale_value (`float`, *optional*, defaults to 1.0):
            Initial value for the LayerScale parameter.
        drop_path_rate (`float`, *optional*, defaults to 0.0):
            The stochastic depth rate (drop path) used during training.
        num_upscale_blocks (`int`, *optional*, defaults to 2):
            Number of upsampling blocks used in the decoder or segmentation head.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability applied after attention projection.
        use_swiglu_ffn (`bool`, *optional*, defaults to `False`):
            Whether to use the SwiGLU feedforward neural network.
        num_blocks (`int`, *optional*, defaults to 4):
            Number of feature blocks or stages in the architecture.
        no_object_weight (`float`, *optional*, defaults to 0.1):
            Loss weight for the 'no object' class in panoptic/instance segmentation.
        class_weight (`float`, *optional*, defaults to 2.0):
            Loss weight for classification targets.
        mask_weight (`float`, *optional*, defaults to 5.0):
            Loss weight for mask prediction.
        dice_weight (`float`, *optional*, defaults to 5.0):
            Loss weight for the dice loss component.
        train_num_points (`int`, *optional*, defaults to 12544):
            Number of points to sample for mask loss computation during training.
        oversample_ratio (`float`, *optional*, defaults to 3.0):
            Oversampling ratio used in point sampling for mask training.
        importance_sample_ratio (`float`, *optional*, defaults to 0.75):
            Ratio of points to sample based on importance during training.
        num_queries (`int`, *optional*, defaults to 200):
            Number of object queries in the Transformer.
        num_register_tokens (`int`, *optional*, defaults to 4):
            Number of learnable register tokens added to the transformer input.

    Example:

    ```python
    >>> from transformers import EomtConfig, EomtForUniversalSegmentation

    >>> # Initialize configuration
    >>> config = EomtConfig()

    >>> # Initialize model
    >>> model = EomtForUniversalSegmentation(config)

    >>> # Access config
    >>> config = model.config
    ```eomt            gelu        {Gz?ư>  r         ?r   F皙?       @      @ 1        @      ?   no_object_weightclass_weightmask_weightdice_weighttrain_num_pointsoversample_ratioimportance_sample_ratioc                    s   t  jd||||||||	|
|d
| | `| `| `| `| `| `|| _|| _	|| _
|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _d S )N)
hidden_sizenum_hidden_layersnum_attention_headshidden_dropout_prob
hidden_actinitializer_rangelayer_norm_eps
image_size
patch_sizenum_channels )super__init__intermediate_sizeqkv_bias
pooler_actpooler_output_sizeencoder_strideattention_probs_dropout_prob	mlp_ratioattention_dropoutlayerscale_valuedrop_path_ratenum_upscale_blocksuse_swiglu_ffn
num_blocksr+   r,   r-   r.   r/   r0   r1   num_queriesnum_register_tokens)selfr2   r3   r4   rE   r6   r5   r7   r8   r9   r:   r;   rG   rH   rI   rF   rJ   rK   r+   r,   r-   r.   r/   r0   r1   rL   rM   kwargs	__class__r<   Y/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/eomt/modular_eomt.pyr>      sH   
zEomtConfig.__init__)r   r   r   r   r   r   r    r!   r"   r   r   r#   r   r   r   Fr   r$   r%   r&   r&   r'   r(   r)   r*   r   )	__name__
__module____qualname____doc__
model_typefloatintr>   __classcell__r<   r<   rP   rR   r   3   sX    Or   a  
    Class for outputs of [`EomtForUniversalSegmentationOutput`].

    This output can be directly passed to [`~EomtImageProcessor.post_process_semantic_segmentation`] or
    [`~EomtImageProcessor.post_process_instance_segmentation`] or
    [`~EomtImageProcessor.post_process_panoptic_segmentation`] to compute final segmentation maps. Please, see
    [`~EomtImageProcessor] for details regarding usage.
    )custom_introc                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeej ed< dZeeej  ed< dZeeej  ed< dZeeej  ed	< dS )
"EomtForUniversalSegmentationOutputa*  
    loss (`torch.Tensor`, *optional*):
        The computed loss, returned when labels are present.
    class_queries_logits (`torch.FloatTensor`):
        A tensor of shape `(batch_size, num_queries, num_labels + 1)` representing the proposed classes for each
        query. Note the `+ 1` is needed because we incorporate the null class.
    masks_queries_logits (`torch.FloatTensor`):
        A tensor of shape `(batch_size, num_queries, height, width)` representing the proposed masks for each
        query.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Last hidden states (final feature map) of the last layer.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, sequence_length, hidden_size)`. Hidden-states all layers of the model.
    attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `tuple(torch.FloatTensor)` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Self and Cross Attentions weights from transformer decoder.
    patch_offsets (`list[torch.Tensor]`, *optional*):
        list of tuples indicating the image index and start and end positions of patches for semantic segmentation.
    Nlossclass_queries_logitsmasks_queries_logitslast_hidden_statehidden_states
attentionspatch_offsets)rS   rT   rU   rV   r]   r   torchFloatTensor__annotations__r^   r_   r`   ra   tuplerb   rc   listr   r<   r<   r<   rR   r\      s   
 r\   c                   @      e Zd ZdS )EomtLossNrS   rT   rU   r<   r<   r<   rR   rj          rj   c                   @   ri   )EomtPatchEmbeddingsNrk   r<   r<   r<   rR   rm      rl   rm   c                   @   s<   e Zd ZdeddfddZdd Zdejdejfd	d
ZdS )EomtEmbeddingsconfigreturnNc                 C   s   t j|  || _|j| _t tdd|j| _	t t
d|j|j| _t|| _| jj}t |j| _d|j | _t ||j| _| jdt|ddd d S )N   position_ids)rq   F)
persistent)r   Moduler>   ro   r:   	Parameterrd   randnr2   	cls_tokenzerosrM   register_tokensrm   patch_embeddingsnum_patchesDropoutr5   dropoutnum_prefix_tokens	Embeddingposition_embeddingsregister_bufferarangeexpand)rN   ro   r|   r<   r<   rR   r>      s   
 zEomtEmbeddings.__init__c                 C      t d)NzNot needed for Eomt ModelAttributeErrorrN   r<   r<   rR   interpolate_pos_encoding     z'EomtEmbeddings.interpolate_pos_encodingpixel_valuesc                 C   s~   |j \}}}}| jjjj}| |j|d}| j|dd}| j|dd}|| 	| j
 }tj|||gdd}| |}|S )N)dtypers   rq   dim)shaper{   
projectionweightr   torx   r   rz   r   rr   rd   catr~   )rN   r   
batch_size_target_dtype
embeddings
cls_tokensrz   r<   r<   rR   forward  s   
zEomtEmbeddings.forward)	rS   rT   rU   r   r>   r   rd   r   r   r<   r<   r<   rR   rn      s    rn   c                   @   ri   )EomtAttentionNrk   r<   r<   r<   rR   r      rl   r   c                   @   ri   )EomtLayerScaleNrk   r<   r<   r<   rR   r   $  rl   r   c                   @   s0   e Zd Z	ddejdeej dejfddZdS )	EomtLayerNra   	head_maskrp   c                 C   sb   |  |}| ||\}}| |}| || }| |}| |}| |}| || }|S N)norm1	attentionlayer_scale1	drop_pathnorm2mlplayer_scale2)rN   ra   r   hidden_states_normself_attention_outputr   layer_outputr<   r<   rR   r   )  s   




zEomtLayer.forwardr   )rS   rT   rU   rd   r   r   r   r<   r<   r<   rR   r   (  s    r   c                       s4   e Zd Zd	 fdd	ZdejdejfddZ  ZS )
EomtLayerNorm2dr!   Tc                    s   t  j|||d d S )N)epselementwise_affine)r=   r>   )rN   r;   r   affinerP   r<   rR   r>   A  s   zEomtLayerNorm2d.__init__hidden_staterp   c                 C   s>   | dddd}t|| j| j| j| j}| dddd}|S )Nr   r   r   rq   )permuteF
layer_normnormalized_shaper   biasr   )rN   r   r<   r<   rR   r   D  s   zEomtLayerNorm2d.forward)r!   T)rS   rT   rU   r>   rd   r   r   rZ   r<   r<   rP   rR   r   @  s    r   c                       8   e Zd Zdef fddZdejdejfddZ  ZS )EomtScaleLayerro   c                    sV   t    |j}tj||ddd| _t|j | _tj	||dd|dd| _
t|| _d S )Nr   )kernel_sizestrider   rq   F)r   paddinggroupsr   )r=   r>   r2   r   ConvTranspose2dconv1r   r6   
activationConv2dconv2r   layernorm2drN   ro   r2   rP   r<   rR   r>   L  s   
	zEomtScaleLayer.__init__ra   rp   c                 C   s,   |  |}| |}| |}| |}|S r   )r   r   r   r   rN   ra   r<   r<   rR   r   \  s
   



zEomtScaleLayer.forward	rS   rT   rU   r   r>   rd   r   r   rZ   r<   r<   rP   rR   r   K  s    r   c                       r   )EomtScaleBlockro   c                    s6   t     j| _t fddt| jD | _d S )Nc                       g | ]}t  qS r<   )r   .0r   ro   r<   rR   
<listcomp>h      z+EomtScaleBlock.__init__.<locals>.<listcomp>)r=   r>   rI   rK   r   
ModuleListrangeblockrN   ro   rP   r   rR   r>   e  s   
$zEomtScaleBlock.__init__ra   rp   c                 C   s   | j D ]}||}q|S r   )r   )rN   ra   r   r<   r<   rR   r   j  s   

zEomtScaleBlock.forwardr   r<   r<   rP   rR   r   d  s    r   c                       r   )EomtMaskHeadro   c                    sJ   t    |j}t||| _t||| _t||| _t|j	 | _
d S r   )r=   r>   r2   r   Linearfc1fc2fc3r   r6   r   r   rP   r<   rR   r>   q  s   
zEomtMaskHead.__init__ra   rp   c                 C   s.   |  | |}|  | |}| |}|S r   )r   r   r   r   r   r<   r<   rR   r   z  s   
zEomtMaskHead.forwardr   r<   r<   rP   rR   r   p  s    	r   c                   @   sN   e Zd ZU dZeed< dZdZdZdgZ	dZ
eedZd	ejd
dfddZdS )EomtPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    ro   r   r   Fr   T)ra   rb   modulerp   Nc                 C   sl  | j j}t|tjtjtjfrDtjj|j	t
dd |jd urBtj|j	\}}|dkr4dt
| nd}tj|j| | d S d S t|tjrY|j	jd |jj  d S t|tjrz|j	jjddd |jd urx|j	j|j   d S d S t|trt|dr|jj| j j d S d S t|trtjj|jjtjd|d|jj |j_|j!j  d S d S )	N   )ar   rq   r#   r   )meanstdlambda1)"ro   r7   
isinstancer   r   r   r   initkaiming_uniform_r   mathsqrtr   _calculate_fan_in_and_fan_outuniform_	LayerNormdatafill_zero_r   normal_padding_idxr   hasattrr   rG   rn   trunc_normal_rx   r   rd   float32r   rz   )rN   r   r   fan_inr   boundr<   r<   rR   _init_weights  s8   





z!EomtPreTrainedModel._init_weights)rS   rT   rU   rV   r   rf   base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpar   r   _can_record_outputsr   ru   r   r<   r<   r<   rR   r     s   
 r   zV
    The EoMT Model with head on top for instance/semantic/panoptic segmentation.
    c                   @   s   e Zd ZdefddZdd Zdd Zdejfd	d
Z	e
dd Zee			ddedeee  deee  deee  dee defddZdS )EomtForUniversalSegmentationro   c                    s   t |    | _ j| _t | _tj j j	d| _
t j j| _t fddt jD | _t | _t | _t j jd | _ j j  j j f| _ j j jd| _t | jd| _ | !dt"# j$ | %  d S )N)r   c                    r   r<   )r   r   r   r<   rR   r     r   z9EomtForUniversalSegmentation.__init__.<locals>.<listcomp>rq   )loss_cross_entropy	loss_mask	loss_dice)ro   weight_dictattn_mask_probs)&r	   r>   ro   r3   rn   r   r   r   r2   r8   	layernormr   rL   queryr   r   layersr   upscale_blockr   	mask_headr   
num_labelsclass_predictorr9   r:   	grid_sizer,   r-   r.   r   rj   	criterionr   rd   onesrK   	post_initr   r<   r   rR   r>     s$   
 

z%EomtForUniversalSegmentation.__init__c                 C   s   | j jS r   )r   r{   r   r<   r<   rR   get_input_embeddings  r   z1EomtForUniversalSegmentation.get_input_embeddingsc                 C   r   )NzNote needed for Eomt Model.r   r   r<   r<   rR   get_auxiliary_logits  r   z1EomtForUniversalSegmentation.get_auxiliary_logitslogitsc                 C   s   |d d d | j jd d f }| |}|d d | j j| jj d d d f }|dd}|j|jd dg| jR  }| 	|}| 
|}td||}||fS )Nrq   r   r   rs   zbqc, bchw -> bqhw)ro   rL   r   r   r   	transposereshaper   r   r   r   rd   einsum)rN   r  query_tokensclass_logitsprefix_tokensmask_logitsr<   r<   rR   predict  s   
&

z$EomtForUniversalSegmentation.predictc                 C   sD   |dk r t j| jd ||d|k}d| d d d ||d f |< | S )Nrq   r   )device)rd   randr   )	attn_maskprobnum_query_tokensencoder_start_tokensr  random_queriesr<   r<   rR   _disable_attention_mask  s   z4EomtForUniversalSegmentation._disable_attention_maskNr   mask_labelsclass_labelsrc   rO   rp   c                 K   sr  d\}}d}|du rt d| |}	t| jD ]\}
}|
| j| jj krG| jjdddddf 	|	j
d dd|	j}tj||	fdd}	|
| j| jj kr| jsa| j|
| j | jj  dkr| |	}| |\}}||f7 }||f7 }tj|	j
d |	j
d |	j
d |	jtjd}tj|| jd	d
}||d|dd}| jj}|| jj }|dk|ddd||df< | j|| j|
| j | jj  |||jd}|ddddf 	d| jjdd}| | d}||	|}	q| |	}| |\}}||f7 }||f7 }d}|dur0|dur0d}t ||D ]\}}| j!||||dd}|| "|7 }qt#|||||dS )ag  
        mask_labels (`list[torch.Tensor]`, *optional*):
            list of mask labels of shape `(num_labels, height, width)` to be fed to a model
        class_labels (`list[torch.LongTensor]`, *optional*):
            list of target class labels of shape `(num_labels, height, width)` to be fed to a model. They identify the
            labels of `mask_labels`, e.g. the label of `mask_labels[i][j]` if `class_labels[i][j]`.
        patch_offsets (`list[torch.Tensor]`, *optional*):
            list of tuples indicating the image index and start and end positions of patches for semantic segmentation.
        )r<   r<   Nz You have to specify pixel_valuesr   rs   rq   r   )r  r   bilinear)sizemode)r  r  r  r  .g    er   )r_   r^   r  r  auxiliary_predictions)r]   r_   r^   r`   rc   )$
ValueErrorr   	enumerater   r3   ro   rK   r   r   r   r   r   r  rd   r   trainingr   r   r  r  boolr   interpolater   viewr  rL   r   r  r4   rX   masked_fillzipget_loss_dictget_lossr\   )rN   r   r  r  rc   rO   masks_queries_logits_per_layerclass_queries_logits_per_layerattention_maskra   idxlayer_moduler   norm_hidden_statesr_   r^   interpolated_logitsr  r  sequence_outputr]   	loss_dictr<   r<   rR   r     s   
2


"	


z$EomtForUniversalSegmentation.forward)NNN)rS   rT   rU   r   r>   r  r  rd   r   r  staticmethodr  r   r   r   rh   r
   r   r\   r   r<   r<   r<   rR   r     s2    




r   )r   r   r   ):rV   r   dataclassesr   typingr   rd   torch.nn.functionalr   
functionalr   r   activationsr   
file_utilsr   modeling_utilsr	   processing_utilsr
   utilsr   r   r   utils.genericr   dinov2.modeling_dinov2r   r   r   r    mask2former.modeling_mask2formerr   r   siglip.modeling_siglipr   vit.configuration_vitr   
get_loggerrS   loggerr   r\   rj   rm   rn   r   r   r   r   r   ru   r   r   r   r   r   __all__r<   r<   r<   rR   <module>   sV   
 
$* )