o
    wie                     @   s  d Z ddlZddlZddlmZ ddlmZmZmZm	Z	 ddl
Z
ddlZ
ddl
mZ ddlmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZmZ ddlmZmZmZmZ ddlm Z m!Z!m"Z" e#e$Z%de
j&de
j&fddZ'de
j&de
j&fddZ(eeG dd deZ)eeG dd deZ*eeG dd deZ+G dd dej,Z-G dd dej,Z.	 dGd!ej,d"e
j&d#e
j&d$e
j&d%ee
j& d&e/d'e/fd(d)Z0G d*d+ d+ej,Z1G d,d- d-ej,Z2G d.d/ d/eZ3eG d0d1 d1eZ4G d2d3 d3ej,Z5G d4d5 d5ej,Z6G d6d7 d7e4Z7G d8d9 d9ej,Z8G d:d; d;e4Z9eG d<d= d=e4Z:G d>d? d?ej,Z;G d@dA dAe4Z<edBdCG dDdE dEe4Z=g dFZ>dS )HzPyTorch CLIPSeg model.    N)	dataclass)AnyCallableOptionalUnion)nn   )ACT2FN) _create_4d_causal_attention_mask_prepare_4d_attention_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)ModelOutputauto_docstringlogging	torch_int   )CLIPSegConfigCLIPSegTextConfigCLIPSegVisionConfiglogitsreturnc                 C   s   t j| tjt| | jdS )Ndevice)r   
functionalcross_entropytorcharangelenr   )r    r"   i/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/clipseg/modeling_clipseg.pycontrastive_loss(   s   r$   
similarityc                 C   s    t | }t |  }|| d S )Ng       @)r$   t)r%   caption_loss
image_lossr"   r"   r#   clipseg_loss-   s   r)   c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeej ed< dZeej ed< dZeed< dZeed	< d
ee fddZdS )CLIPSegOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPSegTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPSegVisionModel`].
    Nlosslogits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputr   c                       t  fdd  D S )Nc                 3   .    | ]}|d vr | nt  | V  qdS ))r0   r1   Ngetattrto_tuple.0kselfr"   r#   	<genexpr>S   
    
z)CLIPSegOutput.to_tuple.<locals>.<genexpr>tuplekeysr:   r"   r:   r#   r6   R      zCLIPSegOutput.to_tuple)__name__
__module____qualname____doc__r+   r   r   FloatTensor__annotations__r,   r-   r.   r/   r0   r   r1   r?   r   r6   r"   r"   r"   r#   r*   3   s   
 r*   c                   @   sP   e Zd ZU dZdZeej ed< dZ	ee
ej  ed< dZee
ej  ed< dS )CLIPSegDecoderOutputz|
    logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
        Classification scores for each pixel.
    Nr   hidden_states
attentions)rB   rC   rD   rE   r   r   r   rF   rG   rI   r?   rJ   r"   r"   r"   r#   rH   Y   s
   
 rH   c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeej ed< dZeed< dZeed< d	ee fd
dZdS )CLIPSegImageSegmentationOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Binary cross entropy loss for segmentation.
    logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
        Classification scores for each pixel.
    conditional_embeddings (`torch.FloatTensor` of shape `(batch_size, projection_dim)`):
        Conditional embeddings used for segmentation.
    pooled_output (`torch.FloatTensor` of shape `(batch_size, embed_dim)`):
        Pooled output of the [`CLIPSegVisionModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPSegVisionModel`].
    decoder_output (`CLIPSegDecoderOutput`):
        The output of the [`CLIPSegDecoder`].
    Nr+   r   conditional_embeddingspooled_outputr1   decoder_outputr   c                    r2   )Nc                 3   r3   ))r1   rN   Nr4   r7   r:   r"   r#   r<      r=   z:CLIPSegImageSegmentationOutput.to_tuple.<locals>.<genexpr>r>   r:   r"   r:   r#   r6      rA   z'CLIPSegImageSegmentationOutput.to_tuple)rB   rC   rD   rE   r+   r   r   rF   rG   r   rL   rM   r1   r   rN   rH   r?   r   r6   r"   r"   r"   r#   rK   f   s   
 rK   c                       sX   e Zd Zdef fddZdejdededejfdd	Zddej	dejfddZ
  ZS )CLIPSegVisionEmbeddingsconfigc                    s   t    || _|j| _|j| _|j| _tt	
| j| _tj|j| j| j| jdd| _| j| j d | _| jd | _t| j| j| _| jdt	| jddd d S )NF)in_channelsout_channelskernel_sizestridebias   r   position_idsr   
persistent)super__init__rP   hidden_size	embed_dim
image_size
patch_sizer   	Parameterr   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr    expandr;   rP   	__class__r"   r#   r]      s"   
"z CLIPSegVisionEmbeddings.__init__
embeddingsheightwidthr   c                 C   s  |j d d }| jjd}|j d d }tj s(||kr(||kr(| | jS |ddddf }|ddddf }|j d }	|| j }
|| j }t	|d }|
d|||	}|dddd}tjj||
|fdd	d
}|dddddd|	}tj||fddS )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   NrY   g      ?r   rV   bicubicF)sizemodealign_cornersdim)shaperk   weight	unsqueezer   jit
is_tracingrW   ra   r   reshapepermuter   r   interpolateviewcat)r;   rq   rr   rs   rh   rk   ri   class_pos_embedpatch_pos_embedry   
new_height	new_widthsqrt_num_positionsr"   r"   r#   interpolate_pos_encoding   s*   



z0CLIPSegVisionEmbeddings.interpolate_pos_encodingTpixel_valuesc           
   
   C   s   |j \}}}}|s&|| jks|| jkr&td| d| d| j d| j d	| |}|ddd}| j|dd}tj	||gdd}	|rR|	| 
|	|| }	|	S |	| | j }	|	S )	NzInput image size (*z) doesn't match model ().rV   r   rY   rx   )rz   r`   
ValueErrorrg   flatten	transposerd   rm   r   r   r   rk   rW   )
r;   r   r   
batch_size_rr   rs   patch_embedsclass_embedsrq   r"   r"   r#   forward   s    
zCLIPSegVisionEmbeddings.forward)T)rB   rC   rD   r   r]   r   Tensorintr   rF   r   __classcell__r"   r"   ro   r#   rO      s     )rO   c                	       sX   e Zd Zdef fddZ			ddeej deej deej dej	fd	d
Z
  ZS )CLIPSegTextEmbeddingsrP   c                    sR   t    |j}t|j|| _t|j|| _| j	dt
|jddd d S )NrW   rX   FrZ   )r\   r]   r^   r   rj   
vocab_sizetoken_embeddingmax_position_embeddingsrk   rl   r   r    rm   r;   rP   r_   ro   r"   r#   r]      s   

zCLIPSegTextEmbeddings.__init__N	input_idsrW   inputs_embedsr   c                 C   s   |d ur	|j d n|j d }| jjj d }||kr#td| d| |d u r2| jd d d |f }|d u r;| |}| |}|| }|S )NrY   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )rz   rk   r{   r   rW   r   )r;   r   rW   r   
seq_lengthmax_position_embeddingposition_embeddingsrq   r"   r"   r#   r      s"   

zCLIPSegTextEmbeddings.forward)NNN)rB   rC   rD   r   r]   r   r   
LongTensorrF   r   r   r   r"   r"   ro   r#   r      s    r           modulequerykeyvalueattention_maskscalingdropoutc           
      K   s|   t ||dd| }|d ur|| }tjj|dt jd|j}tjj	||| j
d}t ||}	|	dd }	|	|fS )NrY   r   )ry   dtype)ptrainingr   rV   )r   matmulr   r   r   softmaxfloat32tor   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputr"   r"   r#   eager_attention_forward  s   
r   c                       sv   e Zd ZdZdeeef f fddZ			ddej	de
ej	 d	e
ej	 d
e
e deej	e
ej	 f f
ddZ  ZS )CLIPSegAttentionz=Multi-headed attention from 'Attention Is All You Need' paperrP   c                    s   t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	|j
| _d| _t| j| j| _t| j| j| _t| j| j| _t| j| j| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: r         F)r\   r]   rP   r^   r_   num_attention_heads	num_headshead_dimr   scaleattention_dropoutr   	is_causalr   Lineark_projv_projq_projout_projrn   ro   r"   r#   r]     s$   

zCLIPSegAttention.__init__NFrI   r   causal_attention_maskoutput_attentionsr   c              
   C   sL  |j \}}}| |}| |}	| |}
|||| j| jdd}|	||| j| jdd}	|
||| j| jdd}
| jj	dkrY|durR|durR|| }n|durX|}n|du| _
t}| jj	dkrz| jj	dkrt|rttd nt| jj	 }|| ||	|
|| j
| j| jsdn| jd	\}}|||| }| |}|sd}||fS )
z#Input shape: Batch x Time x Channelr   rV   flash_attention_2Neagersdpaz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.r   )r   r   r   )rz   r   r   r   r   r   r   r   rP   _attn_implementationr   r   loggerwarning_oncer   r   r   r   r   r   r   )r;   rI   r   r   r   r   r   r_   queriesr@   valuesattention_interfacer   r   r"   r"   r#   r   0  sH   	






zCLIPSegAttention.forward)NNF)rB   rC   rD   rE   r   r   r   r]   r   r   r   boolr?   r   r   r"   r"   ro   r#   r     s"    r   c                       s2   e Zd Z fddZdejdejfddZ  ZS )
CLIPSegMLPc                    sD   t    || _t|j | _t|j|j	| _
t|j	|j| _d S N)r\   r]   rP   r	   
hidden_actactivation_fnr   r   r^   intermediate_sizefc1fc2rn   ro   r"   r#   r]   k  s
   
zCLIPSegMLP.__init__rI   r   c                 C   s"   |  |}| |}| |}|S r   )r   r   r   )r;   rI   r"   r"   r#   r   r  s   


zCLIPSegMLP.forward)rB   rC   rD   r]   r   r   r   r   r"   r"   ro   r#   r   j  s    r   c                       sT   e Zd Zdef fddZ	ddejdejdejdee d	e	ej
 f
d
dZ  ZS )CLIPSegEncoderLayerrP   c                    R   t    |j| _t|| _tj| j|jd| _	t
|| _tj| j|jd| _d S N)epsr\   r]   r^   r_   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2rn   ro   r"   r#   r]   {     


zCLIPSegEncoderLayer.__init__FrI   r   r   r   r   c                 C   sd   |}|  |}| j||||d\}}|| }|}| |}| |}|| }|f}|r0||f7 }|S aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )rI   r   r   r   )r   r   r   r   r;   rI   r   r   r   residualr   outputsr"   r"   r#   r     s"   




zCLIPSegEncoderLayer.forwardF)rB   rC   rD   r   r]   r   r   r   r   r?   rF   r   r   r"   r"   ro   r#   r   z  s    r   c                   @   s    e Zd ZeZdZdZdd ZdS )CLIPSegPreTrainedModelclipTc                 C   sV  | j j}t|tr"|jjjjd|d d |jjjjd|d d nt|t	rW| j j}t
jj|jd|jd | d t
jj|jj|j j| d t
jj|jj|j j| d nt|tr| j j}|jd d|j j d  | }|jd | }t
jj|jj|d t
jj|jj|d t
jj|jj|d t
jj|jj|d n_t|tr| j j}|j jd d|j j d  | }d|j j d | }t
jj|jj|d t
jj|jj|d n't|trt
jj|jj|jd | j j d t
jj|jj|jd | j j d t|t
jr|j j!  |jj"d t|t
j#r'|j dur)|j j!  dS dS dS )	zInitialize the weightsr   g{Gz?)meanstdr   )r   rV   g      ?N)$rP   initializer_factor
isinstancer   r   r{   datanormal_rk   rO   r   initrd   r_   rg   initializer_ranger   num_hidden_layersr   r   r   r   r   r^   r   r   CLIPSegModeltext_projectiontext_embed_dimvisual_projectionvision_embed_dimr   rU   zero_fill_r   )r;   r   factorin_proj_stdout_proj_stdfc_stdr"   r"   r#   _init_weights  sL   



 
z$CLIPSegPreTrainedModel._init_weightsN)rB   rC   rD   r   config_classbase_model_prefixsupports_gradient_checkpointingr   r"   r"   r"   r#   r     s
    r   c                       st   e Zd ZdZdef fddZ					ddeej deej dee	 d	ee	 d
ee	 de
eef fddZ  ZS )CLIPSegEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`CLIPSegEncoderLayer`].

    Args:
        config: CLIPSegConfig
    rP   c                    s:   t     | _t fddt jD | _d| _d S )Nc                       g | ]}t  qS r"   )r   r8   r   rP   r"   r#   
<listcomp>      z+CLIPSegEncoder.__init__.<locals>.<listcomp>F)	r\   r]   rP   r   
ModuleListranger   layersgradient_checkpointingrn   ro   r  r#   r]     s   
 
zCLIPSegEncoder.__init__Nr   r   r   output_hidden_statesreturn_dictr   c                 C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|r"dnd}|r(dnd}|}	t| jD ] \}
}|r<||	f }||	|||d}|d }	|rQ||d f }q1|rY||	f }|sgtdd |	||fD S t|	||dS )	a  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr"   )r   r   r   c                 s       | ]	}|d ur|V  qd S r   r"   r8   vr"   r"   r#   r<   .      z)CLIPSegEncoder.forward.<locals>.<genexpr>)last_hidden_staterI   rJ   )rP   r   r  use_return_dict	enumerater  r?   r   )r;   r   r   r   r   r  r  encoder_statesall_attentionsrI   idxencoder_layerlayer_outputsr"   r"   r#   r     s6   &

zCLIPSegEncoder.forwardNNNNN)rB   rC   rD   rE   r   r]   r   r   r   r   r   r?   r   r   r   r"   r"   ro   r#   r    s*    	
r  c                       s   e Zd Zdef fddZe						ddeej deej deej dee	 d	ee	 d
ee	 de
eef fddZ  ZS )CLIPSegTextTransformerrP   c                    sH   t    || _|j}t|| _t|| _tj	||j
d| _|j| _d S r   )r\   r]   rP   r^   r   rq   r  encoderr   r   r   final_layer_normeos_token_idr   ro   r"   r#   r]   5  s   


zCLIPSegTextTransformer.__init__Nr   r   rW   r   r  r  r   c                 C   sj  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u r&td| }|d|d }| j||d}t||j	|j
d}	|d urLt||j	}| j|||	|||d}
|
d }| |}| jdkr|tj|jd |j
d|jtj|j
djdd	f }n|tj|jd |j
d|jtj|j
d| jk jdd	f }|s||f|
d
d   S t|||
j|
jdS )NzYou have to specify input_idsrY   )r   rW   r   )r   r   r   r   r  r  r   rV   )r   r   rx   r   r  pooler_outputrI   rJ   )rP   r   r  r  r   ru   r   rq   r
   r   r   r   r  r  r  r   r    rz   r   r   argmaxr   rI   rJ   )r;   r   r   rW   r   r  r  input_shaperI   r   encoder_outputsr  rM   r"   r"   r#   r   @  s\   

	

	zCLIPSegTextTransformer.forwardNNNNNN)rB   rC   rD   r   r]   r   r   r   r   r   r   r?   r   r   r   r"   r"   ro   r#   r  4  s0    
r  c                       s   e Zd ZeZddgZdef fddZdejfddZ	d	d
 Z
e						ddeej deej deej dee dee dee deeef fddZ  ZS )CLIPSegTextModelr   r   rP   c                    "   t  | t|| _|   d S r   )r\   r]   r  
text_model	post_initrn   ro   r"   r#   r]        
zCLIPSegTextModel.__init__r   c                 C   
   | j jjS r   r(  rq   r   r:   r"   r"   r#   get_input_embeddings     
z%CLIPSegTextModel.get_input_embeddingsc                 C   s   || j j_d S r   r,  )r;   r   r"   r"   r#   set_input_embeddings  s   z%CLIPSegTextModel.set_input_embeddingsNr   r   rW   r   r  r  c                 C   s   | j ||||||dS )a;  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPSegTextModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegTextModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```r   r   rW   r   r  r  )r(  )r;   r   r   rW   r   r  r  r"   r"   r#   r     s   zCLIPSegTextModel.forwardr%  )rB   rC   rD   r   r   _no_split_modulesr]   r   Moduler-  r/  r   r   r   r   r   r   r?   r   r   r   r"   r"   ro   r#   r&    s8    
r&  c                       sp   e Zd Zdef fddZe				ddeej dee	 dee	 d	ee	 d
ee	 de
eef fddZ  ZS )CLIPSegVisionTransformerrP   c                    sR   t    || _|j}t|| _tj||jd| _	t
|| _tj||jd| _d S r   )r\   r]   rP   r^   rO   rq   r   r   r   pre_layrnormr  r  post_layernormr   ro   r"   r#   r]     s   


z!CLIPSegVisionTransformer.__init__NTr   r   r  r  r   r   c           
      C   s   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}| j||d}| |}| j||||d}|d }|d d dd d f }	| |	}	|sS||	f|dd   S t||	|j	|j
dS )N)r   )r   r   r  r  r   r   r   )rP   r   r  r  rq   r4  r  r5  r   rI   rJ   )
r;   r   r   r  r  r   rI   r$  r  rM   r"   r"   r#   r     s.   	

z CLIPSegVisionTransformer.forward)NNNT)rB   rC   rD   r   r]   r   r   r   rF   r   r   r?   r   r   r   r"   r"   ro   r#   r3    s(    

r3  c                       s   e Zd ZeZdZdef fddZdejfddZ	e
						ddeej d
ee dee dee dee deeef fddZ  ZS )CLIPSegVisionModelr   rP   c                    r'  r   )r\   r]   r3  vision_modelr)  rn   ro   r"   r#   r]     r*  zCLIPSegVisionModel.__init__r   c                 C   r+  r   )r7  rq   rg   r:   r"   r"   r#   r-    r.  z'CLIPSegVisionModel.get_input_embeddingsNTr   r  r   r  c                 C   s   | j |||||dS )a  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPSegVisionModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegVisionModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```r   r   r  r   r  )r7  )r;   r   r   r  r   r  r"   r"   r#   r     s   zCLIPSegVisionModel.forwardNNNTN)rB   rC   rD   r   r   main_input_namer]   r   r2  r-  r   r   r   rF   r   r   r?   r   r   r   r"   r"   ro   r#   r6    s0    
r6  c                       s>  e Zd ZeZdef fddZe						ddeej	 deej	 deej	 dee
 d	ee
 d
ee
 dejfddZe					ddeej dee
 d	ee
 de
d
ee
 dejfddZe									ddeej deej deej	 deej dee
 dee
 d	ee
 de
d
ee
 deeef fddZ  ZS )r   rP   c                    s   t  | t|jtstdt|j dt|jts(tdt|j d|j}|j}|j	| _	|j
| _|j
| _t|| _t|| _tj| j| j	dd| _tj| j| j	dd| _tt| jj| _|   d S )NzNconfig.text_config is expected to be of type CLIPSegTextConfig but is of type .zRconfig.vision_config is expected to be of type CLIPSegVisionConfig but is of type F)rU   )r\   r]   r   text_configr   	TypeErrortypevision_configr   projection_dimr^   r   r   r  r(  r3  r7  r   r   r   r   rb   r   tensorrP   logit_scale_init_valuelogit_scaler)  )r;   rP   r<  r?  ro   r"   r#   r]   .  s0   

zCLIPSegModel.__init__Nr   r   rW   r   r  r  r   c           
      C   sh   |dur|n| j j}|dur|n| j j}|dur|n| j j}| j||||||d}|d }| |}	|	S )a  
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPSegTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPSegModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```Nr0  r   )rP   r   r  r  r(  r   )
r;   r   r   rW   r   r  r  text_outputsrM   text_featuresr"   r"   r#   get_text_featuresN  s   	
zCLIPSegModel.get_text_featuresTr   r   c           	      C   sf   |dur|n| j j}|dur|n| j j}|dur|n| j j}| j|||||d}|d }| |}|S )aI  
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPSegVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPSegModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
        ```Nr8  r   )rP   r   r  r  r7  r   )	r;   r   r   r  r   r  vision_outputsrM   image_featuresr"   r"   r#   get_image_features}  s    
zCLIPSegModel.get_image_featuresreturn_lossc
              	   C   s(  |dur|n| j j}|dur|n| j j}|	dur|	n| j j}	| j|||||	d}
| j||||||	d}|
d }| |}|d }| |}||jdddd }||jdddd }| j	
 }t|| | }| }d}|rtt|}|	s||||||
f}|dur|f| S |S t|||||||
d	S )
a  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPSegModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```Nr8  r0  r   rV   rY   T)r   ry   keepdim)r+   r,   r-   r.   r/   r0   r1   )rP   r   r  r  r7  r(  r   r   normrC  expr   r   r&   r)   r*   )r;   r   r   r   rW   rJ  r   r  r   r  rG  rD  r/   r.   rC  r-   r,   r+   outputr"   r"   r#   r     sV   '	


zCLIPSegModel.forwardr%  r9  )	NNNNNNNTN)rB   rC   rD   r   r   r]   r   r   r   r   r   rF   rF  rI  r   r   r?   r*   r   r   r"   r"   ro   r#   r   *  s     .2	

r   c                       sX   e Zd ZdZdef fddZ	ddejdejdejd	ee	 d
e
ej f
ddZ  ZS )CLIPSegDecoderLayerz
    CLIPSeg decoder layer, which is identical to `CLIPSegEncoderLayer`, except that normalization is applied after
    self-attention/MLP, rather than before.
    rP   c                    r   r   r   rn   ro   r"   r#   r]     r   zCLIPSegDecoderLayer.__init__FrI   r   r   r   r   c                 C   sd   |}| j ||||d\}}|| }| |}|}| |}|| }| |}|f}|r0||f7 }|S r   )r   r   r   r   r   r"   r"   r#   r     s"   




zCLIPSegDecoderLayer.forwardr   )rB   rC   rD   rE   r   r]   r   r   r   r   r?   rF   r   r   r"   r"   ro   r#   rO    s    rO  c                       s\   e Zd Zdef fddZ			ddeej dejdee	 d	ee	 d
ee	 f
ddZ
  ZS )CLIPSegDecoderrP   c                    sX  t     j| _t j j| _t j j| _ j	r` j
jd  j
jd f}ttj j jdddt tj j jd |d |d dt tj jd d|d |d d| _ntj jd j
j j
jd| _t j}t fd	d
t|D | _t j
 j_ j_ j_d_tfdd
tt jD | _d S )N   r   r   )rS   paddingrV   r   )rS   rT   )rT   c                    s   g | ]}t  jj jqS r"   )r   r   r?  r^   
reduce_dimr  r  r"   r#   r  j  s    z+CLIPSegDecoder.__init__.<locals>.<listcomp>reluc                    r  r"   )rO  r  )decoder_configr"   r#   r  r  r  )r\   r]   conditional_layerr   r   r@  rS  film_mulfilm_add"use_complex_transposed_convolutionr?  ra   
Sequentialre   ReLUConvTranspose2dtransposed_convolutionr!   extract_layersr	  r
  reducescopydeepcopyr^   decoder_num_attention_headsr   decoder_intermediate_sizer   r   r  )r;   rP   transposed_kernelsdepthro   )rP   rU  r#   r]   J  sB   
(zCLIPSegDecoder.__init__NTrI   rL   r   r  r  c                 C   sp  |rdnd }|r
dnd }|d d d }d }	t t|| j| jD ]O\}
\}}}|	d ur1|||	 }	n||}	|
| jkrR| ||	ddd | | }	|	ddd}	||	d d |d}|d }	|re||	f7 }|rn||d f7 }q|	d d dd d d f ddd}	tt	
|	jd }|jd }|	||	jd ||}	| |	d}|stdd |||fD S t|||d	S )
Nr"   rY   r   r   rV   )r   r   r   c                 s   r  r   r"   r  r"   r"   r#   r<     r  z)CLIPSegDecoder.forward.<locals>.<genexpr>)r   rI   rJ   )r  zipr  r_  rV  rW  r   rX  r   mathsqrtrz   r   r]  squeezer?   rH   )r;   rI   rL   r   r  r  all_hidden_statesr  activationsrN  i
activationlayerreducer  ru   r   r   r"   r"   r#   r   t  sD   "

$
zCLIPSegDecoder.forward)NNT)rB   rC   rD   r   r]   r?   r   r   r   r   r   r   r"   r"   ro   r#   rP  I  s     .rP  zn
    CLIPSeg model with a Transformer-based decoder on top for zero-shot and one-shot image segmentation.
    )custom_introc                       s   e Zd ZeZdef fddZ					ddee deej	 deej	 deej	 d	eej	 f
d
dZ
e											ddeej deej d	eej deej deej	 deej deej dee dee dedee deeef fddZ  ZS )CLIPSegForImageSegmentationrP   c                    s:   t  | || _t|| _|j| _t|| _|   d S r   )	r\   r]   rP   r   r   r^  rP  decoderr)  rn   ro   r"   r#   r]     s   

z$CLIPSegForImageSegmentation.__init__Nr   r   r   rW   conditional_pixel_valuesc                 C   s   |d ur.t ||krtdt  | jj|||d}W d    |S 1 s'w   Y  |S |d urYt ||kr<tdt  | j|}W d    |S 1 sRw   Y  |S td)Nz@Make sure to pass as many prompt texts as there are query images)r   rW   zAMake sure to pass as many prompt images as there are query imagesz[Invalid conditional, should be either provided as `input_ids` or `conditional_pixel_values`)r!   r   r   no_gradr   rF  rI  )r;   r   r   r   rW   rs  rL   r"   r"   r#   get_conditional_embeddings  s.   



z6CLIPSegForImageSegmentation.get_conditional_embeddingsTr   rL   labelsr   r  r   r  r   c                    s  |dur|n| j j}t Q | jj||d|
|d}| j|d }|r'|jn|d   fdd| jD }|rHt	|j
|j|	rA|jnd|jd}n|	sV|dd |d	d  n|}W d   n1 sbw   Y  |du ry| j|jd
 ||||d}n|jd
 |jd
 krtd|jd | j jkrtd| j||||	|d}|r|jn|d
 }d}|dur||j}t }|||}|s|||||f}|dur|f| S |S t||||||dS )aX  
        conditional_pixel_values (`torch.FloatTensor`, *optional*):
            The pixel values of the conditional images.
        conditional_embeddings (`torch.FloatTensor` of shape `(batch_size, config.projection_dim)`, *optional*):
            The conditional embeddings for the query images. If provided, the model will use this instead of computing
            the embeddings from the conditional_pixel_values.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoProcessor, CLIPSegForImageSegmentation
        >>> from PIL import Image
        >>> import requests

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> texts = ["a cat", "a remote", "a blanket"]
        >>> inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> print(logits.shape)
        torch.Size([3, 352, 352])
        ```NTr8  r   rV   c                    s   g | ]} |d   qS )r   r"   )r8   rl  rI   r"   r#   r    s    z7CLIPSegForImageSegmentation.forward.<locals>.<listcomp>r   r   r   )r   r   r   rW   rs  zWMake sure to pass as many conditional embeddings as there are query images in the batchzcMake sure that the feature dimension of the conditional embeddings matches `config.projection_dim`.)r   r  r  )r+   r   rL   rM   r1   rN   )rP   r  r   rt  r   r7  r   rI   r^  r   r  r!  rJ   ru  rz   r   r@  rr  r   r   r   r   BCEWithLogitsLossrK   )r;   r   r   rs  rL   r   rW   rv  r   r  r   r  rG  rM   rk  decoder_outputsr   r+   loss_fnrN  r"   rw  r#   r     s~   /

z#CLIPSegForImageSegmentation.forwardr  )NNNNNNNNNTN)rB   rC   rD   r   r   r]   r   r   r   r   ru  r   rF   r   r   r   r?   r*   r   r   r"   r"   ro   r#   rq    sp    
	

rq  )r   r   r&  r6  rq  )r   )?rE   r`  rg  dataclassesr   typingr   r   r   r   r   torch.utils.checkpointr   rk  r	   modeling_attn_mask_utilsr
   r   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   utilsr   r   r   r   configuration_clipsegr   r   r   
get_loggerrB   r   r   r$   r)   r*   rH   rK   r2  rO   r   floatr   r   r   r   r   r  r  r&  r3  r6  r   rO  rP  rq  __all__r"   r"   r"   r#   <module>   s   
#T0
Q20W[443 f9d .