o
    ij                  	   @   s@  d Z ddlZddlmZ ddlmZmZ ddlZddlm	Z	 ddl
mZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZ ddlmZ eeZeeddG dd deZeeddG dd deZG dd de	jZ G dd de	jZ!G dd de	jZ"G dd de	jZ#dCdej$d e%d!e&d"ej$fd#d$Z'G d%d& d&e	jZ(G d'd( d(eZ)G d)d* d*e	jZ*G d+d, d,e	j+Z,G d-d. d.e	jZ-G d/d0 d0e	jZ.eG d1d2 d2eZ/eG d3d4 d4e/Z0d5ej$d6e1d"ej$fd7d8Z2d5ej$d9e1d:e1d"ej$fd;d<Z3G d=d> d>e	jZ4ed?dG d@dA dAe/Z5g dBZ6dS )DzPyTorch SegGpt model.    N)	dataclass)OptionalUnion)nn)
functional   )ACT2FN)GradientCheckpointingLayer)PreTrainedModel)ModelOutputauto_docstringlogging	torch_int   )SegGptConfigz1
    Output type of [`SegGptEncoderOutput`].
    )custom_introc                   @   s^   e Zd ZU dZejed< dZee	ej  ed< dZ
ee	ej  ed< dZee	ej  ed< dS )SegGptEncoderOutputay  
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, patch_height, patch_width, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    hidden_states (`tuple[torch.FloatTensor]`, `optional`, returned when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
        of shape `(batch_size, patch_height, patch_width, hidden_size)`.
    attentions (`tuple[torch.FloatTensor]`, `optional`, returned when `config.output_attentions=True`):
        Tuple of *torch.FloatTensor* (one for each layer) of shape
        `(batch_size, num_heads, seq_len, seq_len)`.
    intermediate_hidden_states (`tuple[torch.FloatTensor]`, *optional*, returned when `config.intermediate_hidden_state_indices` is set):
        Tuple of `torch.FloatTensor` of shape `(batch_size, patch_height, patch_width, hidden_size)`.
        Each element in the Tuple corresponds to the output of the layer specified in `config.intermediate_hidden_state_indices`.
        Additionally, each feature passes through a LayerNorm.
    last_hidden_stateNhidden_states
attentionsintermediate_hidden_states)__name__
__module____qualname____doc__torchFloatTensor__annotations__r   r   tupler   r    r   r   g/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/seggpt/modeling_seggpt.pyr   #   s   
 
r   z;
    Output type of [`SegGptImageSegmentationOutput`].
    c                   @   sb   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeej  ed< dZeeej  ed< dS )SegGptImageSegmentationOutputa  
    loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided):
        The loss value.
    pred_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        The predicted masks.
    hidden_states (`tuple[torch.FloatTensor]`, `optional`, returned when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
        of shape `(batch_size, patch_height, patch_width, hidden_size)`.
    attentions (`tuple[torch.FloatTensor]`, `optional`, returned when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape
        `(batch_size, num_heads, seq_len, seq_len)`.
    Nloss
pred_masksr   r   )r   r   r   r   r"   r   r   r   r   r#   r   r   r   r   r   r   r    r!   ?   s   
 r!   c                       s(   e Zd ZdZ fddZdd Z  ZS )SegGptPatchEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    s   t    |j|j}}|j|j}}t|tjj	r|n||f}t|tjj	r)|n||f}|d |d  |d |d   }|| _|| _|| _|| _
tj||||d| _d S )Nr   r   )kernel_sizestride)super__init__
image_size
patch_sizenum_channelshidden_size
isinstancecollectionsabcIterablenum_patchesr   Conv2d
projection)selfconfigr)   r*   r+   r,   r1   	__class__r   r    r(   a   s   
 zSegGptPatchEmbeddings.__init__c              
   C   s   |j \}}}}|| jkrtd|| jd ks|| jd kr5td| d| d| jd  d| jd  d	| |ddd	d}|S )
NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   zInput image size (*z) doesn't match model ().   r   )shaper+   
ValueErrorr)   r3   permute)r4   pixel_values
batch_sizer+   heightwidth
embeddingsr   r   r    forwardo   s   
(zSegGptPatchEmbeddings.forward)r   r   r   r   r(   rC   __classcell__r   r   r6   r    r$   Z   s    r$   c                       sv   e Zd ZdZdeddf fddZdededejfd	d
Z			ddejdejde
ej de
e dejf
ddZ  ZS )SegGptEmbeddingszX
    Construct the embeddings from patch, position embeddings for input and prompt.
    r5   returnNc                    s   t    ttddd|j| _ttddd|j| _ttddd|j| _	ttddd|j| _
ttddd|j| _t|| _|j|j d d }ttd||j| _t|j| _d S )Nr   r:   )r'   r(   r   	Parameterr   zerosr,   
mask_tokensegment_token_inputsegment_token_prompttype_token_semantictype_token_instancer$   patch_embeddingspretrain_image_sizer*   randnposition_embeddingsDropouthidden_dropout_probdropout)r4   r5   num_positionsr6   r   r    r(      s   

zSegGptEmbeddings.__init__r@   rA   c                 C   s   | j d d dd f }|jd }t|d }tj s#||ks#||krBtj|d||d	dddd||fddd	}|	ddddS |d||dS )
Nr         ?r   r   r:   bicubicF)sizemodealign_corners)
rQ   r;   r   r   jit
is_tracingFinterpolatereshaper=   )r4   r@   rA   patch_pos_embedr1   pretrain_patch_sizer   r   r    interpolate_pos_encoding   s   
z)SegGptEmbeddings.interpolate_pos_encodingr>   prompt_pixel_valuesbool_masked_posembedding_typec                 C   s   |  |}|  |}|j\}}}	}
| j|||	d}|d|d||	d}|d|  ||  }|d ur8|nd}| ||	}|| j }|| j	 }|| }|| }|dkrZ| j
}n|dkrb| j}ntd| || }|| }tj||fdd}|S )NrW   r   instancesemanticzBEmbedding type should be either 'semantic' or 'instance', but got r   dim)rN   r;   rI   expand	unsqueezetype_asr`   rc   rJ   rK   rL   rM   r<   r   cat)r4   r>   rd   re   rf   input_embeddingsprompt_embeddingsr?   patch_heightpatch_width_rI   w	pos_embedtype_embeddingrB   r   r   r    rC      s*   



zSegGptEmbeddings.forward)NN)r   r   r   r   r   r(   intr   Tensorrc   r   
BoolTensorstrrC   rD   r   r   r6   r    rE   }   s"    rE   c                       s   e Zd ZdZ fddZdededejdejfdd	Zd
ejdejdejdejde	eef de	eef dejfddZ
ddejdejfddZ  ZS )SegGptAttentionz=Multi-head Attention block with relative position embeddings.c                    s  t    |j|j}}t|tjjr|n||f}t|tjjr"|n||f}|d |j |d |j f}|j|j	 }|j	| _	|d | _
tj|j|jd |jd| _t|j|j| _|j| _| jr|d u ritdttd|d  d || _ttd|d  d || _d S d S )Nr   r   g      r   biaszBInput size must be provided if using relative positional encoding.r:   )r'   r(   r)   r*   r-   r.   r/   r0   r,   num_attention_headsscaler   Linearqkv_biasqkvproj use_relative_position_embeddingsr<   rG   r   rH   	rel_pos_h	rel_pos_w)r4   r5   r)   r*   
input_sizehead_dimr6   r   r    r(      s"   

 $zSegGptAttention.__init__q_sizek_sizerel_posrF   c           	      C   s   t dt|| d }tj|d|jd dddd|dd}|d|dd}t|dddf t|| d }t|dddf t|| d }|| |d t|| d  }||	  S )	a  
        Get relative positional embeddings according to the relative positions of
            query and key sizes.

        Args:
            q_size (int):
                size of the query.
            k_size (int):
                size of key k.
            rel_pos (`torch.Tensor`):
                relative position embeddings (L, channel).

        Returns:
            Extracted positional embeddings according to relative positions.
        r:   r   r   rW   linear)rY   rZ   N      ?)
rw   maxr^   r_   r`   r;   r=   r   arangelong)	r4   r   r   r   max_rel_distrel_pos_resizedq_coordsk_coordsrelative_coordsr   r   r    get_rel_pos   s   $$zSegGptAttention.get_rel_posattnqueryr   r   c                 C   s   |\}}|\}	}
|  ||	|}|  ||
|}|j\}}}|||||}td||}td||}|||||	|
}||dddddddddf  |dddddddddf  }|||| |	|
 }|S )a  
        Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
        https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py

        Args:
            attn (`torch.Tensor`):
                attention map.
            query (`torch.Tensor`):
                query q in the attention layer with shape (batch_size, query_height * query_width, channel).
            rel_pos_h (`torch.Tensor`):
                relative position embeddings (Lh, channel) for height axis.
            rel_pos_w (`torch.Tensor`):
                relative position embeddings (Lw, channel) for width axis.
            q_size (tuple):
                spatial sequence size of query q with (query_height, query_width).
            k_size (tuple):
                spatial sequence size of key k with (key_height, key_width).

        Returns:
            attn (`torch.Tensor`):
                attention map with added relative positional embeddings.
        zbhwc,hkc->bhwkzbhwc,wkc->bhwkN)r   r;   r`   r   einsum)r4   r   r   r   r   r   r   query_heightquery_width
key_height	key_widthrelative_position_heightrelative_position_widthr?   rs   rj   reshaped_queryrel_hrel_wr   r   r    add_decomposed_rel_pos  s   Hz&SegGptAttention.add_decomposed_rel_posFr   c              	   C   s:  |j \}}}}| |||| d| jdddddd}|d|| j || dd\}}	}
|| j |	dd }| jrN| 	||| j
| j||f||f}tjjj|tjdd|j}|rw||| j|| d}||| j || d}nd }||
 || j||d}|ddddd|||d}| |}||fS )	Nr   rW   r:   r   r      )dtyperj   )r;   r   r`   r~   r=   unbindr   	transposer   r   r   r   r   r   r   softmaxfloat32tor   viewr   )r4   r   output_attentionsr?   r@   rA   rs   r   r   keyvalueattn_weightsattn_weights_reshapedattn_outputr   r   r    rC   :  s(   &
zSegGptAttention.forward)F)r   r   r   r   r(   rw   r   rx   r   r   r   rC   rD   r   r   r6   r    r{      s(     


 -r{   c                       s2   e Zd Z fddZdejdejfddZ  ZS )	SegGptMlpc                    s>   t    t|j|j| _t|j|j| _t|j	 | _
d S N)r'   r(   r   r   r,   mlp_dimlin1lin2r   
hidden_actactr4   r5   r6   r   r    r(   b  s   
zSegGptMlp.__init__r   rF   c                 C   "   |  |}| |}| |}|S r   )r   r   r   r4   r   r   r   r    rC   h  s   


zSegGptMlp.forward)r   r   r   r(   r   rx   rC   rD   r   r   r6   r    r   a  s    r           Finput	drop_probtrainingrF   c                 C   sd   |dks|s| S d| }| j d fd| jd   }|tj|| j| jd }|  | || }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    r   r   r   )r   r   device)r;   ndimr   randr   r   floor_div)r   r   r   	keep_probr;   random_tensoroutputr   r   r    	drop_pathp  s   
r   c                       sT   e Zd ZdZddee ddf fddZdejdejfdd	Z	de
fd
dZ  ZS )SegGptDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   rF   c                    s   t    || _d S r   )r'   r(   r   )r4   r   r6   r   r    r(     s   

zSegGptDropPath.__init__r   c                 C   s   t || j| jS r   )r   r   r   r   r   r   r    rC     s   zSegGptDropPath.forwardc                 C   s   d| j  S )Nzp=)r   r4   r   r   r    
extra_repr  s   zSegGptDropPath.extra_reprr   )r   r   r   r   r   floatr(   r   rx   rC   rz   r   rD   r   r   r6   r    r     s
    r   c                       sj   e Zd Zdededdf fddZ		ddejd	ed
e	de	de
eejejf eej f f
ddZ  ZS )SegGptLayerr5   drop_path_raterF   Nc                    sd   t    t|| _t|| _|dkrt|nt | _	tj
|j|jd| _tj
|j|jd| _d S )Nr   eps)r'   r(   r{   	attentionr   mlpr   r   Identityr   	LayerNormr,   layer_norm_epslayernorm_beforelayernorm_after)r4   r5   r   r6   r   r    r(     s   


zSegGptLayer.__init__Fr   ensemble_condfeature_ensembler   c                 C   s  | j | ||d}|d }|dd  }|rc|jd d |krc|j|jd d dd\}}	|dkrP|jd d }
|	d|
d}	|	jddd|	}	|	j|j }	n
|	jddd|	}	tj||	gdd}| 	|| }|}| 
|}| |}|| 	| }|f| }|S )	N)r   r   r   r:   ri   rW   T)rj   keepdim)r   r   r;   splitr`   mean	expand_asr   rn   r   r   r   )r4   r   r   r   r   self_attention_outputsattention_outputoutputspromptinputsnum_promptsresidualr   r   r    rC     s,   


zSegGptLayer.forward)FF)r   r   r   r   r   r(   r   rx   rw   boolr   r   rC   rD   r   r   r6   r    r     s    r   c                       s\   e Zd Zdeddf fddZ				ddejd	ed
edededee	e
f fddZ  ZS )SegGptEncoderr5   rF   Nc                    sp   t     | _dd tjd j jddD t fddt	 jD | _
tj j jd| _d| _d S )	Nc                 S   s   g | ]}|  qS r   )item).0xr   r   r    
<listcomp>  s    z*SegGptEncoder.__init__.<locals>.<listcomp>r   cpu)r   c                    s   g | ]	}t  | qS r   )r   )r   ir5   dprr   r    r     s    r   F)r'   r(   r5   r   linspacer   num_hidden_layersr   
ModuleListrangelayersr   r,   r   	layernormgradient_checkpointingr   r6   r   r    r(     s   
 "
zSegGptEncoder.__init__FTr   r   r   output_hidden_statesreturn_dictc                 C   s  |rdnd }|r
dnd }g }t | jD ]U\}	}
|r||f }| jj|	kr&dnd}|
||||}|d }|	| jjkrQ|d |jd d  ||jd d d   d }|	| jjv r_|| | |rh||d f }q|rp||f }|stdd ||||fD S t	||||dS )	Nr   r:   r   r   rV   c                 s   s    | ]	}|d ur|V  qd S r   r   )r   vr   r   r    	<genexpr>  s    z(SegGptEncoder.forward.<locals>.<genexpr>)r   r   r   r   )
	enumerater   r5   merge_indexr;   !intermediate_hidden_state_indicesappendr   r   r   )r4   r   r   r   r   r   all_hidden_statesall_self_attentionsr   r   layer_moduler   layer_outputsr   r   r    rC     s<   
*

zSegGptEncoder.forward)FFFT)r   r   r   r   r(   r   rx   r   r   r   r   rC   rD   r   r   r6   r    r     s&    
r   c                       sB   e Zd ZdZddd fdd
Zdejdejf fd	d
Z  ZS )SegGptLayerNormaA  LayerNorm that supports two data formats: channels_last (default) or channels_first.
    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
    width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width).
    gư>channels_last)r   data_formatc                   s8   t  j|fd|i| |dvrtd| || _d S )Nr   )r  channels_firstzUnsupported data format: )r'   r(   NotImplementedErrorr  )r4   normalized_shaper   r  kwargsr6   r   r    r(     s   
zSegGptLayerNorm.__init__featuresrF   c                    sJ   | j dkr|dddd}t |}|dddd}|S t |}|S )z
        Args:
            features: Tensor of shape (batch_size, channels, height, width) OR (batch_size, height, width, channels)
        r  r   r:   r   r   )r  r=   r'   rC   )r4   r  r6   r   r    rC     s   
zSegGptLayerNorm.forward)	r   r   r   r   r(   r   rx   rC   rD   r   r   r6   r    r    s    "r  c                       s,   e Zd Z fddZdejfddZ  ZS )SegGptDecoderHeadc                    s\   t    tj|j|jddd| _t|j|jdd| _t	|j
 | _tj|jdddd| _d S )Nr   r   )r%   paddingr  )r	  r   r  T)r%   r}   )r'   r(   r   r2   decoder_hidden_sizeconvr  r   r   r   r   act_fctheadr   r6   r   r    r(     s   

zSegGptDecoderHead.__init__r   c                 C   s,   |  |}| |}| |}| |}|S r   )r  r   r  r  r   r   r   r    rC   (  s
   



zSegGptDecoderHead.forward)r   r   r   r(   r   r   rC   rD   r   r   r6   r    r    s    r  c                       sB   e Zd Z fddZdejdejfddZdejfddZ  ZS )	SegGptDecoderc                    sX   t    tj|jt|j |jd |j dd| _	t
|| _|j| _|j| _|| _d S )Nr:   Tr|   )r'   r(   r   r   r,   lenr   r*   r  decoder_embedr  decoder_predr5   r   r6   r   r    r(   2  s   


zSegGptDecoder.__init__r   rF   c                 C   s`   |j \}}}}||||| j| j| j}|dddddd}|j|d|| j || j fd}|S )	Nr      r   r   r:   r   rW   r;   )r;   r`   r*   r  r=   )r4   r   r?   rq   rr   rs   r   r   r    _reshape_hidden_states>  s   z$SegGptDecoder._reshape_hidden_statesc                 C   r   r   )r  r  r  r   r   r   r    rC   J  s   


zSegGptDecoder.forward)	r   r   r   r(   r   r   r  rC   rD   r   r   r6   r    r  1  s    r  c                   @   s>   e Zd ZU eed< dZdZdZddgZde	j
dd	fd
dZd	S )SegGptPreTrainedModelr5   modelr>   TrE   r   modulerF   Nc                 C   s  | j j}t|tjtjfr2tjj|jj	
tjd|d
|jj|j_	|jdur0|jj	  dS dS t|tjtfrI|jj	  |jj	d dS t|tr|tjj|jj	
tjd|d
|jj|j_	tjj|jj	
tjd|d
|jj|j_	dS t|trtjj|jj	
tjd|d
|jj|j_	tjjj|j|d tjjj|j|d tjjj|j|d tjjj|j|d tjjj|j|d dS dS )zInitialize the weightsr   )r   stdNr   )r  )r5   initializer_ranger-   r   r   r2   inittrunc_normal_weightdatar   r   r   r   r}   zero_r   r  fill_r{   r   r   rE   rQ   normal_rI   rJ   rK   rL   rM   )r4   r  r  r   r   r    _init_weightsZ  sP   






z#SegGptPreTrainedModel._init_weights)r   r   r   r   r   base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesr   Moduler%  r   r   r   r    r  R  s   
 r  c                       s   e Zd Zdef fddZdefddZdeee	e f ddfd	d
Z
e							ddejdejdejdeej dee dee deej dee dee dee deeef fddZ  ZS )SegGptModelr5   c                    2   t  | || _t|| _t|| _|   d S r   )r'   r(   r5   rE   rB   r   encoder	post_initr   r6   r   r    r(     
   

zSegGptModel.__init__rF   c                 C   s   | j jS r   )rB   rN   r   r   r   r    get_input_embeddings  s   z SegGptModel.get_input_embeddingsheads_to_pruneNc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr-  layerr   prune_heads)r4   r1  r3  headsr   r   r    _prune_heads  s   zSegGptModel._prune_headsr>   rd   prompt_masksre   r   rf   labelsr   r   r   c                 C   sN  |dur|n| j j}|	dur|	n| j j}	|
dur|
n| j j}
|dur$|nd}| jjjjj}|	|}|	|}t
j||fdd}|du rMt
j||fddnt
j||fdd}|du rc|durctd |du r| jjj}t
j|d t
j|jd}t
j||d  t
j|jd}t
||g}|d}| j||||d}| j||||	|
d	}|S )
a
  
        prompt_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Prompt pixel values. Prompt pixel values can be obtained using [`AutoImageProcessor`]. See
            [`SegGptImageProcessor.__call__`] for details.
        prompt_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Prompt mask. Prompt mask can be obtained using [`AutoImageProcessor`]. See [`SegGptImageProcessor.__call__`] for
            details.
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        feature_ensemble (`bool`, *optional*):
            Boolean indicating whether to use feature ensemble or not. If `True`, the model will use feature ensemble
            if we have at least two prompts. If `False`, the model will not use feature ensemble. This argument should
            be considered when doing few-shot inference on an input image i.e. more than one prompt for the same image.
        embedding_type (`str`, *optional*):
            Embedding type. Indicates whether the prompt is a semantic or instance embedding. Can be either
            instance or semantic.
        labels (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`, `optional`):
            Ground truth mask for input images.

        Examples:

        ```python
        >>> from transformers import SegGptImageProcessor, SegGptModel
        >>> from PIL import Image
        >>> import requests

        >>> image_input_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_2.jpg"
        >>> image_prompt_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1.jpg"
        >>> mask_prompt_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1_target.png"

        >>> image_input = Image.open(requests.get(image_input_url, stream=True).raw)
        >>> image_prompt = Image.open(requests.get(image_prompt_url, stream=True).raw)
        >>> mask_prompt = Image.open(requests.get(mask_prompt_url, stream=True).raw).convert("L")

        >>> checkpoint = "BAAI/seggpt-vit-large"
        >>> model = SegGptModel.from_pretrained(checkpoint)
        >>> image_processor = SegGptImageProcessor.from_pretrained(checkpoint)

        >>> inputs = image_processor(images=image_input, prompt_images=image_prompt, prompt_masks=mask_prompt, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> list(outputs.last_hidden_state.shape)
        [1, 56, 28, 1024]
        ```
        NFr:   ri   zLabels were provided, but bool_masked_pos were not. It will be set to default value. If you're training the model, make sure to provide a bool_masked_pos.r   r   )rf   re   )r   r   r   r   )r5   r   r   use_return_dictrB   rN   r3   r   r   r   r   rn   loggerwarning_oncer1   rH   r   r   onesrl   r-  )r4   r>   rd   r7  re   r   rf   r8  r   r   r   expected_dtyper1   bool_masked_pos_zerosbool_masked_pos_onesembedding_outputencoder_outputsr   r   r    rC     sH   ;



zSegGptModel.forwardNNNNNNN)r   r   r   r   r(   r$   r0  dictrw   listr6  r   r   rx   r   ry   r   rz   r   r   r   r   rC   rD   r   r   r6   r    r+    sF    
	

r+  tensorr*   c                 C   sl   | j \}}}}|| }|| }| j||||||fd} | dddddd} | j||| |d d fd} | S )Nr  r   r:   r   r   r  r   )r;   r`   r=   )rE  r*   r?   r+   r@   rA   rq   rr   r   r   r    patchify	  s   rF  rq   rr   c                 C   s   | j d }t| j d d d }|| | j d kr*td| j d  d| d| d	| j|||||dfd
} | dddddd} | j|d|| || fd
} | S )Nr   rW   r   rV   r   zNumber of patches z does not match patch height (z) and width (r9   r  r  r:   r   )r;   rw   r<   r`   r=   )rE  rq   rr   r?   r*   r   r   r    
unpatchify  s   
rG  c                       s>   e Zd Z fddZdejdejdejdejfddZ  ZS )	
SegGptLossc                    s   t    |j| _|j| _d S r   )r'   r(   betar*   r   r6   r   r    r(   %  s   
zSegGptLoss.__init__r7  r#   r8  re   c                 C   s   t j||fdd}|dddddf dd| jd d }t||jd | j |jd | j }tj||d| jd}|| 	 |	  }|S )aN  Computes the L1 loss between the predicted masks and the ground truth masks.

        Args:
            prompt_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
                Pixel values from mask prompt.

            pred_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, 2*height, width)`):
                Predicted masks.

            labels (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
                Ground truth mask for input images.

            bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
                Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Returns:
            `torch.FloatTensor`: The mean L1 loss between the predicted masks and the ground truth masks.
        r:   ri   Nr   r   none)	reductionrI  )
r   rn   repeatr*   rG  r;   r^   smooth_l1_lossrI  sum)r4   r7  r#   r8  re   ground_truthmaskr"   r   r   r    rC   *  s   *$zSegGptLoss.forward)	r   r   r   r(   r   r   ry   rC   rD   r   r   r6   r    rH  $  s    rH  zM
    SegGpt model with a decoder on top for one-shot image segmentation.
    c                       s   e Zd Zdef fddZe							ddejdejdejdeej	 d	ee
 d
ee deej dee
 dee
 dee
 deeef fddZ  ZS )SegGptForImageSegmentationr5   c                    r,  r   )r'   r(   r5   r+  r  r  decoderr.  r   r6   r   r    r(   T  r/  z#SegGptForImageSegmentation.__init__Nr>   rd   r7  re   r   rf   r8  r   r   r   rF   c                 C   sl  |dur|n| j j}|	dur|	n| j j}	|
dur|
n| j j}
|du rN| jjjj}tj	|d tj
|jd}tj||d  tj
|jd}t||g}|d}| j|||||||||	|
d
}|
rb|jn|d }tj|dd}| |}d}|durt| j }|||||}|
s|f}|	r||d f }|r|	rdnd}||| f }|dur|f| }|S t|||j|jd	S )
aY  
        prompt_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Prompt pixel values. Prompt pixel values can be obtained using [`AutoImageProcessor`]. See
            [`SegGptImageProcessor.__call__`] for details.
        prompt_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Prompt mask. Prompt mask can be obtained using [`AutoImageProcessor`]. See [`SegGptImageProcessor.__call__`] for
            details.
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        feature_ensemble (`bool`, *optional*):
            Boolean indicating whether to use feature ensemble or not. If `True`, the model will use feature ensemble
            if we have at least two prompts. If `False`, the model will not use feature ensemble. This argument should
            be considered when doing few-shot inference on an input image i.e. more than one prompt for the same image.
        embedding_type (`str`, *optional*):
            Embedding type. Indicates whether the prompt is a semantic or instance embedding. Can be either
            instance or semantic.
        labels (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`, `optional`):
            Ground truth mask for input images.

        Examples:

        ```python
        >>> from transformers import SegGptImageProcessor, SegGptForImageSegmentation
        >>> from PIL import Image
        >>> import requests

        >>> image_input_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_2.jpg"
        >>> image_prompt_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1.jpg"
        >>> mask_prompt_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1_target.png"

        >>> image_input = Image.open(requests.get(image_input_url, stream=True).raw)
        >>> image_prompt = Image.open(requests.get(image_prompt_url, stream=True).raw)
        >>> mask_prompt = Image.open(requests.get(mask_prompt_url, stream=True).raw).convert("L")

        >>> checkpoint = "BAAI/seggpt-vit-large"
        >>> model = SegGptForImageSegmentation.from_pretrained(checkpoint)
        >>> image_processor = SegGptImageProcessor.from_pretrained(checkpoint)

        >>> inputs = image_processor(images=image_input, prompt_images=image_prompt, prompt_masks=mask_prompt, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> result = image_processor.post_process_semantic_segmentation(outputs, target_sizes=[(image_input.height, image_input.width)])[0]
        >>> print(list(result.shape))
        [170, 297]
        ```
        Nr:   r   r   )
r>   rd   r7  re   r   rf   r8  r   r   r   rW   ri   r   )r"   r#   r   r   )r5   r   r   r9  r  rB   rN   r1   r   rH   r   r   r<  rn   rl   r   rR  rH  r!   r   r   )r4   r>   rd   r7  re   r   rf   r8  r   r   r   r1   r>  r?  r   r   r#   r"   loss_fnr   idxr   r   r    rC   ^  s^   ;



z"SegGptForImageSegmentation.forwardrB  )r   r   r   r   r(   r   r   rx   r   ry   r   rz   r   r   r   r!   rC   rD   r   r   r6   r    rQ  N  sB    
	

rQ  )r+  r  rQ  )r   F)7r   collections.abcr.   dataclassesr   typingr   r   r   r   torch.nnr   r^   activationsr   modeling_layersr	   modeling_utilsr
   utilsr   r   r   r   configuration_seggptr   
get_loggerr   r:  r   r!   r*  r$   rE   r{   r   rx   r   r   r   r   r   r   r   r  r  r  r  r+  rw   rF  rG  rH  rQ  __all__r   r   r   r    <module>   sb   
#U  /=!0 * 