o
    i                     @   s  d Z ddlZddlZddlmZ ddlmZmZmZm	Z	 ddl
Z
ddl
mZ ddlmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZmZ ddlmZmZmZmZmZmZ ddl m!Z!m"Z"m#Z# e$e%Z&de
j'de
j'fddZ(de
j'de
j'fddZ)eeG dd deZ*eeG dd deZ+eeG dd deZ,G dd dej-Z.G dd dej-Z/	 dGd!ej-d"e
j'd#e
j'd$e
j'd%ee
j' d&e0d'e0fd(d)Z1G d*d+ d+ej-Z2G d,d- d-ej-Z3G d.d/ d/eZ4eG d0d1 d1eZ5G d2d3 d3ej-Z6G d4d5 d5ej-Z7G d6d7 d7e5Z8G d8d9 d9ej-Z9G d:d; d;e5Z:eG d<d= d=e5Z;G d>d? d?ej-Z<G d@dA dAe5Z=edBdCG dDdE dEe5Z>g dFZ?dS )HzPyTorch CLIPSeg model.    N)	dataclass)AnyCallableOptionalUnion)nn   )ACT2FN) _create_4d_causal_attention_mask_prepare_4d_attention_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)ModelOutputauto_docstringcan_return_tuplefilter_out_non_signature_kwargslogging	torch_int   )CLIPSegConfigCLIPSegTextConfigCLIPSegVisionConfiglogitsreturnc                 C   s   t j| tjt| | jdS )Ndevice)r   
functionalcross_entropytorcharangelenr   )r    r$   `/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/clipseg/modeling_clipseg.pycontrastive_loss'   s   r&   
similarityc                 C   s    t | }t |  }|| d S )Ng       @)r&   t)r'   caption_loss
image_lossr$   r$   r%   clipseg_loss,   s   r+   c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeej ed< dZeej ed< dZeed< dZeed	< d
ee fddZdS )CLIPSegOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPSegTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPSegVisionModel`].
    Nlosslogits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputr   c                       t  fdd  D S )Nc                 3   .    | ]}|d vr | nt  | V  qdS ))r2   r3   Ngetattrto_tuple.0kselfr$   r%   	<genexpr>R   
    
z)CLIPSegOutput.to_tuple.<locals>.<genexpr>tuplekeysr<   r$   r<   r%   r8   Q      zCLIPSegOutput.to_tuple)__name__
__module____qualname____doc__r-   r   r!   FloatTensor__annotations__r.   r/   r0   r1   r2   r   r3   rA   r   r8   r$   r$   r$   r%   r,   2   s   
 r,   c                   @   sP   e Zd ZU dZdZeej ed< dZ	ee
ej  ed< dZee
ej  ed< dS )CLIPSegDecoderOutputz|
    logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
        Classification scores for each pixel.
    Nr   hidden_states
attentions)rD   rE   rF   rG   r   r   r!   rH   rI   rK   rA   rL   r$   r$   r$   r%   rJ   X   s
   
 rJ   c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeej ed< dZeed< dZeed< d	ee fd
dZdS )CLIPSegImageSegmentationOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Binary cross entropy loss for segmentation.
    logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
        Classification scores for each pixel.
    conditional_embeddings (`torch.FloatTensor` of shape `(batch_size, projection_dim)`):
        Conditional embeddings used for segmentation.
    pooled_output (`torch.FloatTensor` of shape `(batch_size, embed_dim)`):
        Pooled output of the [`CLIPSegVisionModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPSegVisionModel`].
    decoder_output (`CLIPSegDecoderOutput`):
        The output of the [`CLIPSegDecoder`].
    Nr-   r   conditional_embeddingspooled_outputr3   decoder_outputr   c                    r4   )Nc                 3   r5   ))r3   rP   Nr6   r9   r<   r$   r%   r>      r?   z:CLIPSegImageSegmentationOutput.to_tuple.<locals>.<genexpr>r@   r<   r$   r<   r%   r8   ~   rC   z'CLIPSegImageSegmentationOutput.to_tuple)rD   rE   rF   rG   r-   r   r!   rH   rI   r   rN   rO   r3   r   rP   rJ   rA   r   r8   r$   r$   r$   r%   rM   e   s   
 rM   c                       sX   e Zd Zdef fddZdejdededejfdd	Zddej	dejfddZ
  ZS )CLIPSegVisionEmbeddingsconfigc                    s   t    || _|j| _|j| _|j| _tt	
| j| _tj|j| j| j| jdd| _| j| j d | _| jd | _t| j| j| _| jdt	| jddd d S )NF)in_channelsout_channelskernel_sizestridebias   r   position_idsr   
persistent)super__init__rR   hidden_size	embed_dim
image_size
patch_sizer   	Parameterr!   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr"   expandr=   rR   	__class__r$   r%   r_      s"   
"z CLIPSegVisionEmbeddings.__init__
embeddingsheightwidthr   c                 C   s  |j d d }| jjd}|j d d }tj s(||kr(||kr(| | jS |ddddf }|ddddf }|j d }	|| j }
|| j }t	|d }|
d|||	}|dddd}tjj||
|fdd	d
}|dddddd|	}tj||fddS )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   Nr[   g      ?r   rX   bicubicF)sizemodealign_cornersdim)shaperm   weight	unsqueezer!   jit
is_tracingrY   rc   r   reshapepermuter   r   interpolateviewcat)r=   rs   rt   ru   rj   rm   rk   class_pos_embedpatch_pos_embedr{   
new_height	new_widthsqrt_num_positionsr$   r$   r%   interpolate_pos_encoding   s*   



z0CLIPSegVisionEmbeddings.interpolate_pos_encodingTpixel_valuesc           
   
   C   s   |j \}}}}|s&|| jks|| jkr&td| d| d| j d| j d	| |}|ddd}| j|dd}tj	||gdd}	|rR|	| 
|	|| }	|	S |	| | j }	|	S )	NzInput image size (*z) doesn't match model ().rX   r   r[   rz   )r|   rb   
ValueErrorri   flatten	transposerf   ro   r!   r   r   rm   rY   )
r=   r   r   
batch_size_rt   ru   patch_embedsclass_embedsrs   r$   r$   r%   forward   s    
zCLIPSegVisionEmbeddings.forwardT)rD   rE   rF   r   r_   r!   Tensorintr   rH   r   __classcell__r$   r$   rq   r%   rQ      s     )rQ   c                	       sX   e Zd Zdef fddZ			ddeej deej deej dej	fd	d
Z
  ZS )CLIPSegTextEmbeddingsrR   c                    sR   t    |j}t|j|| _t|j|| _| j	dt
|jddd d S )NrY   rZ   Fr\   )r^   r_   r`   r   rl   
vocab_sizetoken_embeddingmax_position_embeddingsrm   rn   r!   r"   ro   r=   rR   ra   rq   r$   r%   r_      s   

zCLIPSegTextEmbeddings.__init__N	input_idsrY   inputs_embedsr   c                 C   s   |d ur	|j d n|j d }| jjj d }||kr#td| d| |d u r2| jd d d |f }|d u r;| |}| |}|| }|S )Nr[   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )r|   rm   r}   r   rY   r   )r=   r   rY   r   
seq_lengthmax_position_embeddingposition_embeddingsrs   r$   r$   r%   r      s"   

zCLIPSegTextEmbeddings.forward)NNN)rD   rE   rF   r   r_   r   r!   
LongTensorrH   r   r   r   r$   r$   rq   r%   r      s    r           modulequerykeyvalueattention_maskscalingdropoutc           
      K   s|   t ||dd| }|d ur|| }tjj|dt jd|j}tjj	||| j
d}t ||}	|	dd }	|	|fS )Nr[   r   )r{   dtype)ptrainingr   rX   )r!   matmulr   r   r   softmaxfloat32tor   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputr$   r$   r%   eager_attention_forward  s   
r   c                       sv   e Zd ZdZdeeef f fddZ			ddej	de
ej	 d	e
ej	 d
e
e deej	e
ej	 f f
ddZ  ZS )CLIPSegAttentionz=Multi-headed attention from 'Attention Is All You Need' paperrR   c                    s   t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	|j
| _d| _t| j| j| _t| j| j| _t| j| j| _t| j| j| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: r         F)r^   r_   rR   r`   ra   num_attention_heads	num_headshead_dimr   scaleattention_dropoutr   	is_causalr   Lineark_projv_projq_projout_projrp   rq   r$   r%   r_     s$   

zCLIPSegAttention.__init__NFrK   r   causal_attention_maskoutput_attentionsr   c              
   C   s0  |j \}}}| |}| |}	| |}
|||| j| jdd}|	||| j| jdd}	|
||| j| jdd}
| jj	dkrY|durR|durR|| }n|durX|}n|du| _
t}| jj	dkrlt| jj	 }|| ||	|
|| j
| j| js{dn| jd\}}|||| }| |}|sd}||fS )z#Input shape: Batch x Time x Channelr   rX   flash_attention_2Neagerr   )r   r   r   )r|   r   r   r   r   r   r   r   rR   _attn_implementationr   r   r   r   r   r   r   r   r   )r=   rK   r   r   r   r   r   ra   queriesrB   valuesattention_interfacer   r   r$   r$   r%   r   0  s@   	






zCLIPSegAttention.forward)NNF)rD   rE   rF   rG   r   r   r   r_   r!   r   r   boolrA   r   r   r$   r$   rq   r%   r     s"    r   c                       s2   e Zd Z fddZdejdejfddZ  ZS )
CLIPSegMLPc                    sD   t    || _t|j | _t|j|j	| _
t|j	|j| _d S N)r^   r_   rR   r	   
hidden_actactivation_fnr   r   r`   intermediate_sizefc1fc2rp   rq   r$   r%   r_   e  s
   
zCLIPSegMLP.__init__rK   r   c                 C   s"   |  |}| |}| |}|S r   )r   r   r   )r=   rK   r$   r$   r%   r   l  s   


zCLIPSegMLP.forward)rD   rE   rF   r_   r!   r   r   r   r$   r$   rq   r%   r   d  s    r   c                       sT   e Zd Zdef fddZ	ddejdejdejdee d	e	ej
 f
d
dZ  ZS )CLIPSegEncoderLayerrR   c                    R   t    |j| _t|| _tj| j|jd| _	t
|| _tj| j|jd| _d S N)epsr^   r_   r`   ra   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2rp   rq   r$   r%   r_   u     


zCLIPSegEncoderLayer.__init__FrK   r   r   r   r   c                 C   sd   |}|  |}| j||||d\}}|| }|}| |}| |}|| }|f}|r0||f7 }|S aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )rK   r   r   r   )r   r   r   r   r=   rK   r   r   r   residualr   outputsr$   r$   r%   r   }  s"   




zCLIPSegEncoderLayer.forwardF)rD   rE   rF   r   r_   r!   r   r   r   rA   rH   r   r   r$   r$   rq   r%   r   t  s    r   c                   @   s&   e Zd ZU eed< dZdZdd ZdS )CLIPSegPreTrainedModelrR   clipTc                 C   sV  | j j}t|tr"|jjjjd|d d |jjjjd|d d nt|t	rW| j j}t
jj|jd|jd | d t
jj|jj|j j| d t
jj|jj|j j| d nt|tr| j j}|jd d|j j d  | }|jd | }t
jj|jj|d t
jj|jj|d t
jj|jj|d t
jj|jj|d n_t|tr| j j}|j jd d|j j d  | }d|j j d | }t
jj|jj|d t
jj|jj|d n't|trt
jj|jj|jd | j j d t
jj|jj|jd | j j d t|t
jr|j j!  |jj"d t|t
j#r'|j dur)|j j!  dS dS dS )	zInitialize the weightsr   g{Gz?)meanstdr   )r   rX   g      ?N)$rR   initializer_factor
isinstancer   r   r}   datanormal_rm   rQ   r   initrf   ra   ri   initializer_ranger   num_hidden_layersr   r   r   r   r   r`   r   r   CLIPSegModeltext_projectiontext_embed_dimvisual_projectionvision_embed_dimr   rW   zero_fill_r   )r=   r   factorin_proj_stdout_proj_stdfc_stdr$   r$   r%   _init_weights  sL   



 
z$CLIPSegPreTrainedModel._init_weightsN)rD   rE   rF   r   rI   base_model_prefixsupports_gradient_checkpointingr   r$   r$   r$   r%   r     s
   
 r   c                       sx   e Zd ZdZdef fddZe					ddeej	 deej	 dee
 d	ee
 d
ee
 deeef fddZ  ZS )CLIPSegEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`CLIPSegEncoderLayer`].

    Args:
        config: CLIPSegConfig
    rR   c                    s:   t     | _t fddt jD | _d| _d S )Nc                       g | ]}t  qS r$   )r   r:   r   rR   r$   r%   
<listcomp>      z+CLIPSegEncoder.__init__.<locals>.<listcomp>F)	r^   r_   rR   r   
ModuleListranger   layersgradient_checkpointingrp   rq   r  r%   r_     s   
 
zCLIPSegEncoder.__init__Nr   r   r   output_hidden_statesreturn_dictr   c                 C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|r"dnd}|r(dnd}|}	t| jD ] \}
}|r<||	f }||	|||d}|d }	|rQ||d f }q1|rY||	f }t|	||dS )a  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr$   )r   r   r   )last_hidden_staterK   rL   )rR   r   r  use_return_dict	enumerater
  r   )r=   r   r   r   r   r  r  encoder_statesall_attentionsrK   idxencoder_layerlayer_outputsr$   r$   r%   r     s2   '

zCLIPSegEncoder.forwardNNNNN)rD   rE   rF   rG   r   r_   r   r   r!   r   r   r   rA   r   r   r   r$   r$   rq   r%   r    s,    
r  c                       s   e Zd Zdef fddZe						ddeej deej deej dee	 d	ee	 d
ee	 de
eef fddZ  ZS )CLIPSegTextTransformerrR   c                    sH   t    || _|j}t|| _t|| _tj	||j
d| _|j| _d S r   )r^   r_   rR   r`   r   rs   r  encoderr   r   r   final_layer_normeos_token_idr   rq   r$   r%   r_   .  s   


zCLIPSegTextTransformer.__init__Nr   r   rY   r   r  r  r   c                 C   sj  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u r&td| }|d|d }| j||d}t||j	|j
d}	|d urLt||j	}| j|||	|||d}
|
d }| |}| jdkr|tj|jd |j
d|jtj|j
djdd	f }n|tj|jd |j
d|jtj|j
d| jk jdd	f }|s||f|
d
d   S t|||
j|
jdS )NzYou have to specify input_idsr[   )r   rY   r   )r   r   r   r   r  r  r   rX   )r   r   rz   r   r  pooler_outputrK   rL   )rR   r   r  r  r   rw   r   rs   r
   r   r   r   r  r  r  r!   r"   r|   r   r   argmaxr   rK   rL   )r=   r   r   rY   r   r  r  input_shaperK   r   encoder_outputsr  rO   r$   r$   r%   r   9  s\   

	

	zCLIPSegTextTransformer.forwardNNNNNN)rD   rE   rF   r   r_   r   r   r!   r   r   r   rA   r   r   r   r$   r$   rq   r%   r  -  s0    
r  c                       s   e Zd ZU eed< ddgZdef fddZdejfddZ	d	d
 Z
e						ddeej deej deej dee dee dee deeef fddZ  ZS )CLIPSegTextModelrR   r   r   c                    "   t  | t|| _|   d S r   )r^   r_   r  
text_model	post_initrp   rq   r$   r%   r_        
zCLIPSegTextModel.__init__r   c                 C   
   | j jjS r   r#  rs   r   r<   r$   r$   r%   get_input_embeddings     
z%CLIPSegTextModel.get_input_embeddingsc                 C   s   || j j_d S r   r'  )r=   r   r$   r$   r%   set_input_embeddings  s   z%CLIPSegTextModel.set_input_embeddingsNr   r   rY   r   r  r  c                 C   s   | j ||||||dS )a;  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPSegTextModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegTextModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```r   r   rY   r   r  r  )r#  )r=   r   r   rY   r   r  r  r$   r$   r%   r     s   zCLIPSegTextModel.forwardr   )rD   rE   rF   r   rI   _no_split_modulesr_   r   Moduler(  r*  r   r   r!   r   r   r   rA   r   r   r   r$   r$   rq   r%   r!    s8   
 
r!  c                       sp   e Zd Zdef fddZe				ddeej dee	 dee	 d	ee	 d
ee	 de
eef fddZ  ZS )CLIPSegVisionTransformerrR   c                    sR   t    || _|j}t|| _tj||jd| _	t
|| _tj||jd| _d S r   )r^   r_   rR   r`   rQ   rs   r   r   r   pre_layrnormr  r  post_layernormr   rq   r$   r%   r_     s   


z!CLIPSegVisionTransformer.__init__NTr   r   r  r  r   r   c           
      C   s   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}| j||d}| |}| j||||d}|d }|d d dd d f }	| |	}	|sS||	f|dd   S t||	|j	|j
dS )N)r   )r   r   r  r  r   r   r  )rR   r   r  r  rs   r/  r  r0  r   rK   rL   )
r=   r   r   r  r  r   rK   r  r  rO   r$   r$   r%   r     s.   	

z CLIPSegVisionTransformer.forward)NNNT)rD   rE   rF   r   r_   r   r   r!   rH   r   r   rA   r   r   r   r$   r$   rq   r%   r.    s(    

r.  c                       s   e Zd ZU eed< dZdef fddZdejfddZ	e
						ddeej d
ee dee dee dee deeef fddZ  ZS )CLIPSegVisionModelrR   r   c                    r"  r   )r^   r_   r.  vision_modelr$  rp   rq   r$   r%   r_     r%  zCLIPSegVisionModel.__init__r   c                 C   r&  r   )r2  rs   ri   r<   r$   r$   r%   r(    r)  z'CLIPSegVisionModel.get_input_embeddingsNTr   r  r   r  c                 C   s   | j |||||dS )a  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPSegVisionModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegVisionModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```r   r   r  r   r  )r2  )r=   r   r   r  r   r  r$   r$   r%   r     s   zCLIPSegVisionModel.forward)NNNTN)rD   rE   rF   r   rI   main_input_namer_   r   r-  r(  r   r   r!   rH   r   r   rA   r   r   r   r$   r$   rq   r%   r1    s0   
 
r1  c                       s  e Zd ZU eed< def fddZe e		ddej	de
ej	 de
ej	 dejfd	d
Ze e	ddejdedejfddZe									dde
ej de
ej de
ej	 de
ej de
e de
e de
e dede
e deeef fddZ  ZS )r   rR   c                    s   t  | t|jtstdt|j dt|jts(tdt|j d|j}|j}|j	|_	|j	|_	|j
| _
|j| _|j| _t|| _t|| _tj| j| j
dd| _tj| j| j
dd| _tt| jj| _|   d S )NzNconfig.text_config is expected to be of type CLIPSegTextConfig but is of type .zRconfig.vision_config is expected to be of type CLIPSegVisionConfig but is of type F)rW   )r^   r_   r   text_configr   	TypeErrortypevision_configr   r   projection_dimr`   r   r   r  r#  r.  r2  r   r   r   r   rd   r!   tensorrR   logit_scale_init_valuelogit_scaler$  )r=   rR   r6  r9  rq   r$   r%   r_   '  s4   

zCLIPSegModel.__init__Nr   r   rY   r   c                 C   s$   | j |||d}|j}| |}|S )a  
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPSegTextModel`].

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CLIPSegModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```)r   r   rY   )r#  r  r   )r=   r   r   rY   text_outputsrO   text_featuresr$   r$   r%   get_text_featuresK  s   
zCLIPSegModel.get_text_featuresTr   r   c                 C   s"   | j ||d}|j}| |}|S )an  
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPSegVisionModel`].

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPSegModel
        >>> from transformers.image_utils import load_image

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```)r   r   )r2  r  r   )r=   r   r   vision_outputsrO   image_featuresr$   r$   r%   get_image_featureso  s   
zCLIPSegModel.get_image_featuresreturn_lossr   r  r  c
              	   C   s(  |dur|n| j j}|dur|n| j j}|	dur|	n| j j}	| j|||||	d}
| j||||||	d}|
d }| |}|d }| |}||jdddd }||jdddd }| j	
 }t|| | }| }d}|rtt|}|	s||||||
f}|dur|f| S |S t|||||||
d	S )
a  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPSegModel
        >>> from transformers.image_utils import load_image

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```Nr3  r+  r   rX   r[   T)r   r{   keepdim)r-   r.   r/   r0   r1   r2   r3   )rR   r   r  r  r2  r#  r   r   normr=  expr!   r   r(   r+   r,   )r=   r   r   r   rY   rD  r   r  r   r  rA  r>  r1   r0   r=  r/   r.   r-   outputr$   r$   r%   r     sV   (	


zCLIPSegModel.forward)NNr   )	NNNNNNNTN)rD   rE   rF   r   rI   r_   r   r   r!   r   r   rH   r@  r   rC  r   r   rA   r,   r   r   r$   r$   rq   r%   r   #  sr   
 $"%	

r   c                       sX   e Zd ZdZdef fddZ	ddejdejdejd	ee	 d
e
ej f
ddZ  ZS )CLIPSegDecoderLayerz
    CLIPSeg decoder layer, which is identical to `CLIPSegEncoderLayer`, except that normalization is applied after
    self-attention/MLP, rather than before.
    rR   c                    r   r   r   rp   rq   r$   r%   r_     r   zCLIPSegDecoderLayer.__init__FrK   r   r   r   r   c                 C   sd   |}| j ||||d\}}|| }| |}|}| |}|| }| |}|f}|r0||f7 }|S r   )r   r   r   r   r   r$   r$   r%   r     s"   




zCLIPSegDecoderLayer.forwardr   )rD   rE   rF   rG   r   r_   r!   r   r   r   rA   rH   r   r   r$   r$   rq   r%   rI    s    rI  c                       s\   e Zd Zdef fddZ			ddeej dejdee	 d	ee	 d
ee	 f
ddZ
  ZS )CLIPSegDecoderrR   c                    sX  t     j| _t j j| _t j j| _ j	r` j
jd  j
jd f}ttj j jdddt tj j jd |d |d dt tj jd d|d |d d| _ntj jd j
j j
jd| _t j}t fd	d
t|D | _t j
 j_ j_ j_d_tfdd
tt jD | _d S )N   r   r   )rU   paddingrX   r   )rU   rV   )rV   c                    s   g | ]}t  jj jqS r$   )r   r   r9  r`   
reduce_dimr  r  r$   r%   r  Q  s    z+CLIPSegDecoder.__init__.<locals>.<listcomp>reluc                    r  r$   )rI  r  )decoder_configr$   r%   r  Y  r  )r^   r_   conditional_layerr   r   r:  rM  film_mulfilm_add"use_complex_transposed_convolutionr9  rc   
Sequentialrg   ReLUConvTranspose2dtransposed_convolutionr#   extract_layersr  r	  reducescopydeepcopyr`   decoder_num_attention_headsr   decoder_intermediate_sizer   r   r
  )r=   rR   transposed_kernelsdepthrq   )rR   rO  r%   r_   1  sB   
(zCLIPSegDecoder.__init__NTrK   rN   r   r  r  c                 C   sp  |rdnd }|r
dnd }|d d d }d }	t t|| j| jD ]O\}
\}}}|	d ur1|||	 }	n||}	|
| jkrR| ||	ddd | | }	|	ddd}	||	d d |d}|d }	|re||	f7 }|rn||d f7 }q|	d d dd d d f ddd}	tt	
|	jd }|jd }|	||	jd ||}	| |	d}|stdd |||fD S t|||d	S )
Nr$   r[   r   r   rX   )r   r   r   c                 s   s    | ]	}|d ur|V  qd S r   r$   )r:   vr$   r$   r%   r>     s    z)CLIPSegDecoder.forward.<locals>.<genexpr>)r   rK   rL   )r  zipr
  rY  rP  rQ  r   rR  r   mathsqrtr|   r   rW  squeezerA   rJ   )r=   rK   rN   r   r  r  all_hidden_statesr  activationsrH  i
activationlayerreducer  rw   r   r   r$   r$   r%   r   [  sD   "

$
zCLIPSegDecoder.forward)NNT)rD   rE   rF   r   r_   rA   r!   r   r   r   r   r   r$   r$   rq   r%   rJ  0  s     .rJ  zn
    CLIPSeg model with a Transformer-based decoder on top for zero-shot and one-shot image segmentation.
    )custom_introc                       s  e Zd ZU eed< def fddZ					ddee deej	 deej	 deej	 d	eej	 f
d
dZ
e											ddeej deej d	eej deej deej	 deej deej dee dee dedee deeef fddZ  ZS )CLIPSegForImageSegmentationrR   c                    s:   t  | || _t|| _|j| _t|| _|   d S r   )	r^   r_   rR   r   r   rX  rJ  decoderr$  rp   rq   r$   r%   r_     s   

z$CLIPSegForImageSegmentation.__init__Nr   r   r   rY   conditional_pixel_valuesc                 C   s   |d ur.t ||krtdt  | jj|||d}W d    |S 1 s'w   Y  |S |d urYt ||kr<tdt  | j|}W d    |S 1 sRw   Y  |S td)Nz@Make sure to pass as many prompt texts as there are query images)r   rY   zAMake sure to pass as many prompt images as there are query imagesz[Invalid conditional, should be either provided as `input_ids` or `conditional_pixel_values`)r#   r   r!   no_gradr   r@  rC  )r=   r   r   r   rY   rn  rN   r$   r$   r%   get_conditional_embeddings  s.   



z6CLIPSegForImageSegmentation.get_conditional_embeddingsTr   rN   labelsr   r  r   r  r   c                    s  |dur|n| j j}t Q | jj||d|
|d}| j|d }|r'|jn|d   fdd| jD }|rHt	|j
|j|	rA|jnd|jd}n|	sV|dd |d	d  n|}W d   n1 sbw   Y  |du ry| j|jd
 ||||d}n|jd
 |jd
 krtd|jd | j jkrtd| j||||	|d}|r|jn|d
 }d}|dur||j}t }|||}|s|||||f}|dur|f| S |S t||||||dS )a~  
        conditional_pixel_values (`torch.FloatTensor`, *optional*):
            The pixel values of the conditional images.
        conditional_embeddings (`torch.FloatTensor` of shape `(batch_size, config.projection_dim)`, *optional*):
            The conditional embeddings for the query images. If provided, the model will use this instead of computing
            the embeddings from the conditional_pixel_values.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPSegForImageSegmentation
        >>> from transformers.image_utils import load_image

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> texts = ["a cat", "a remote", "a blanket"]
        >>> inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> print(logits.shape)
        torch.Size([3, 352, 352])
        ```NTr3  r   rX   c                    s   g | ]} |d   qS )r   r$   )r:   rg  rK   r$   r%   r    s    z7CLIPSegForImageSegmentation.forward.<locals>.<listcomp>r  r   r   )r   r   r   rY   rn  zWMake sure to pass as many conditional embeddings as there are query images in the batchzcMake sure that the feature dimension of the conditional embeddings matches `config.projection_dim`.)r   r  r  )r-   r   rN   rO   r3   rP   )rR   r  r!   ro  r   r2  r   rK   rX  r   r  r  rL   rp  r|   r   r:  rm  r   r   r   r   BCEWithLogitsLossrM   )r=   r   r   rn  rN   r   rY   rq  r   r  r   r  rA  rO   rf  decoder_outputsr   r-   loss_fnrH  r$   rr  r%   r     s~   1

z#CLIPSegForImageSegmentation.forwardr  )NNNNNNNNNTN)rD   rE   rF   r   rI   r_   r   r   r!   r   rp  r   rH   r   r   r   rA   r,   r   r   r$   r$   rq   r%   rl    sp   
 
	

rl  )r   r   r!  r1  rl  )r   )@rG   rZ  rb  dataclassesr   typingr   r   r   r   r!   r   rf  r	   modeling_attn_mask_utilsr
   r   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   utilsr   r   r   r   r   r   configuration_clipsegr   r   r   
get_loggerrD   loggerr   r&   r+   r,   rJ   rM   r-  rQ   r   floatr   r   r   r   r   r  r  r!  r.  r1  r   rI  rJ  rl  __all__r$   r$   r$   r%   <module>   s    
#T0
K20V[443 T9d 0