o
    ei                     @   s  d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	Z	ddl	m
Z
 ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZmZ ddlmZ ddlmZmZmZm Z m!Z!m"Z" ddl#m$Z$m%Z%m&Z& e!'e(Z)de	j*de	j*fddZ+de	j*de	j*fddZ,eeG dd deZ-eeG dd deZ.eeG dd deZ/G dd  d e
j0Z1G d!d" d"e
j0Z2	#dJd$e
j0d%e	j*d&e	j*d'e	j*d(e	j*dB d)e3d*e3fd+d,Z4G d-d. d.e
j0Z5G d/d0 d0e
j0Z6G d1d2 d2eZ7eG d3d4 d4eZ8G d5d6 d6e
j0Z9G d7d8 d8e8Z:G d9d: d:e8Z;G d;d< d<e
j0Z<G d=d> d>e8Z=eG d?d@ d@e8Z>G dAdB dBe
j0Z?G dCdD dDe8Z@edEdFG dGdH dHe8ZAg dIZBdS )KzPyTorch CLIPSeg model.    N)Callable)	dataclass)Any)nn   )initialization)ACT2FN)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging	torch_int   )CLIPSegConfigCLIPSegTextConfigCLIPSegVisionConfiglogitsreturnc                 C   s   t j| tjt| | jdS )Ndevice)r   
functionalcross_entropytorcharangelenr   )r    r#   j/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/clipseg/modeling_clipseg.pycontrastive_loss)   s   r%   
similarityc                 C   s    t | }t |  }|| d S )Ng       @)r%   t)r&   caption_loss
image_lossr#   r#   r$   clipseg_loss.   s   r*   c                   @   s   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	ejdB ed< dZ
ejdB ed< dZejdB ed< dZeed< dZeed	< d
ee fddZdS )CLIPSegOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPSegTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPSegVisionModel`].
    Nlosslogits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputr   c                       t  fdd  D S )Nc                 3   .    | ]}|d vr | nt  | V  qdS ))r1   r2   Ngetattrto_tuple.0kselfr#   r$   	<genexpr>T   
    
z)CLIPSegOutput.to_tuple.<locals>.<genexpr>tuplekeysr;   r#   r;   r$   r7   S      zCLIPSegOutput.to_tuple)__name__
__module____qualname____doc__r,   r    FloatTensor__annotations__r-   r.   r/   r0   r1   r   r2   r@   r   r7   r#   r#   r#   r$   r+   4   s   
 r+   c                   @   sP   e Zd ZU dZdZejdB ed< dZe	ej dB ed< dZ
e	ej dB ed< dS )CLIPSegDecoderOutputz|
    logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
        Classification scores for each pixel.
    Nr   hidden_states
attentions)rC   rD   rE   rF   r   r    rG   rH   rJ   r@   rK   r#   r#   r#   r$   rI   Z   s
   
 rI   c                   @   s   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	ejdB ed< dZ
ejdB ed< dZeed< dZeed< d	ee fd
dZdS )CLIPSegImageSegmentationOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Binary cross entropy loss for segmentation.
    logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
        Classification scores for each pixel.
    conditional_embeddings (`torch.FloatTensor` of shape `(batch_size, projection_dim)`):
        Conditional embeddings used for segmentation.
    pooled_output (`torch.FloatTensor` of shape `(batch_size, embed_dim)`):
        Pooled output of the [`CLIPSegVisionModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPSegVisionModel`].
    decoder_output (`CLIPSegDecoderOutput`):
        The output of the [`CLIPSegDecoder`].
    Nr,   r   conditional_embeddingspooled_outputr2   decoder_outputr   c                    r3   )Nc                 3   r4   ))r2   rO   Nr5   r8   r;   r#   r$   r=      r>   z:CLIPSegImageSegmentationOutput.to_tuple.<locals>.<genexpr>r?   r;   r#   r;   r$   r7      rB   z'CLIPSegImageSegmentationOutput.to_tuple)rC   rD   rE   rF   r,   r    rG   rH   r   rM   rN   r2   r   rO   rI   r@   r   r7   r#   r#   r#   r$   rL   g   s   
 rL   c                       sX   e Zd Zdef fddZdejdededejfdd	Zddej	dejfddZ
  ZS )CLIPSegVisionEmbeddingsconfigc                    s   t    || _|j| _|j| _|j| _tt	
| j| _tj|j| j| j| jdd| _| j| j d | _| jd | _t| j| j| _| jdt	| jddd d S )NF)in_channelsout_channelskernel_sizestridebias   r   position_idsr   
persistent)super__init__rQ   hidden_size	embed_dim
image_size
patch_sizer   	Parameterr    randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr!   expandr<   rQ   	__class__r#   r$   r^      s"   
"z CLIPSegVisionEmbeddings.__init__
embeddingsheightwidthr   c                 C   s  |j d d }| jjd}|j d d }tj s(||kr(||kr(| | jS |ddddf }|ddddf }|j d }	|| j }
|| j }t	|d }|
d|||	}|dddd}tjj||
|fdd	d
}|dddddd|	}tj||fddS )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   NrZ   g      ?r   rW   bicubicF)sizemodealign_cornersdim)shaperl   weight	unsqueezer    jit
is_tracingrX   rb   r   reshapepermuter   r   interpolateviewcat)r<   rr   rs   rt   ri   rl   rj   class_pos_embedpatch_pos_embedrz   
new_height	new_widthsqrt_num_positionsr#   r#   r$   interpolate_pos_encoding   s*   



z0CLIPSegVisionEmbeddings.interpolate_pos_encodingTpixel_valuesc           
   
   C   s   |j \}}}}|s&|| jks|| jkr&td| d| d| j d| j d	| |}|ddd}| j|dd}tj	||gdd}	|rR|	| 
|	|| }	|	S |	| | j }	|	S )	NzInput image size (*z) doesn't match model ().rW   r   rZ   ry   )r{   ra   
ValueErrorrh   flatten	transposere   rn   r    r   r   rl   rX   )
r<   r   r   
batch_size_rs   rt   patch_embedsclass_embedsrr   r#   r#   r$   forward   s    
zCLIPSegVisionEmbeddings.forwardT)rC   rD   rE   r   r^   r    Tensorintr   rG   r   __classcell__r#   r#   rp   r$   rP      s     )rP   c                	       sX   e Zd Zdef fddZ			ddejdB dejdB dejdB dejfd	d
Z	  Z
S )CLIPSegTextEmbeddingsrQ   c                    sR   t    |j}t|j|| _t|j|| _| j	dt
|jddd d S )NrX   rY   Fr[   )r]   r^   r_   r   rk   
vocab_sizetoken_embeddingmax_position_embeddingsrl   rm   r    r!   rn   r<   rQ   r`   rp   r#   r$   r^      s   

zCLIPSegTextEmbeddings.__init__N	input_idsrX   inputs_embedsr   c                 C   s   |d ur	|j d n|j d }| jjj d }||kr#td| d| |d u r2| jd d d |f }|d u r;| |}| |}|| }|S )NrZ   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )r{   rl   r|   r   rX   r   )r<   r   rX   r   
seq_lengthmax_position_embeddingposition_embeddingsrr   r#   r#   r$   r      s"   

zCLIPSegTextEmbeddings.forward)NNN)rC   rD   rE   r   r^   r    
LongTensorrG   r   r   r   r#   r#   rp   r$   r      s    r           modulequerykeyvalueattention_maskscalingdropoutc           
      K   s|   t ||dd| }|d ur|| }tjj|dt jd|j}tjj	||| j
d}t ||}	|	dd }	|	|fS )NrZ   r   )rz   dtype)ptrainingr   rW   )r    matmulr   r   r   softmaxfloat32tor   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputr#   r#   r$   eager_attention_forward  s   
r   c                       sf   e Zd ZdZdeeB f fddZ		ddejdejdB d	e	dB d
e
ejejdB f fddZ  ZS )CLIPSegAttentionz=Multi-headed attention from 'Attention Is All You Need' paperrQ   c                    s   t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	|j
| _d| _t| j| j| _t| j| j| _t| j| j| _t| j| j| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: r         F)r]   r^   rQ   r_   r`   num_attention_heads	num_headshead_dimr   scaleattention_dropoutr   	is_causalr   Lineark_projv_projq_projout_projro   rp   r#   r$   r^     s$   

zCLIPSegAttention.__init__NFrJ   r   output_attentionsr   c                 K   s   |j \}}}| |}| |}	| |}
|||| j| jdd}|	||| j| jdd}	|
||| j| jdd}
t	| j
jt}|| ||	|
|f| j| jsUdn| jd|\}}|||| }| |}|sqd}||fS )z#Input shape: Batch x Time x Channelr   rW   r   )r   r   N)r{   r   r   r   r   r   r   r   r   get_interfacerQ   _attn_implementationr   r   r   r   r   r   r   )r<   rJ   r   r   r   r   r   r`   queriesrA   valuesattention_interfacer   r   r#   r#   r$   r   2  s6   	




zCLIPSegAttention.forward)NF)rC   rD   rE   rF   r   r   r^   r    r   boolr@   r   r   r#   r#   rp   r$   r     s    r   c                       s2   e Zd Z fddZdejdejfddZ  ZS )
CLIPSegMLPc                    sD   t    || _t|j | _t|j|j	| _
t|j	|j| _d S N)r]   r^   rQ   r   
hidden_actactivation_fnr   r   r_   intermediate_sizefc1fc2ro   rp   r#   r$   r^   ^  s
   
zCLIPSegMLP.__init__rJ   r   c                 C   s"   |  |}| |}| |}|S r   )r   r   r   )r<   rJ   r#   r#   r$   r   e  s   


zCLIPSegMLP.forward)rC   rD   rE   r^   r    r   r   r   r#   r#   rp   r$   r   ]  s    r   c                       sV   e Zd Zdef fddZ	ddejdejdedB d	ee	 d
e
ej f
ddZ  ZS )CLIPSegEncoderLayerrQ   c                    R   t    |j| _t|| _tj| j|jd| _	t
|| _tj| j|jd| _d S N)epsr]   r^   r_   r`   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2ro   rp   r#   r$   r^   n     


zCLIPSegEncoderLayer.__init__FrJ   r   r   Nr   r   c                 K   sj   |}|  |}| jd|||d|\}}|| }|}| |}| |}|| }|f}|r3||f7 }|S )I  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )rJ   r   r   Nr#   )r   r   r   r   )r<   rJ   r   r   r   residualr   outputsr#   r#   r$   r   v  s$   




zCLIPSegEncoderLayer.forwardF)rC   rD   rE   r   r^   r    r   r   r   r   r@   rG   r   r   r#   r#   rp   r$   r   m  s    r   c                   @   s2   e Zd ZU eed< dZdZdZe	 dd Z
dS )CLIPSegPreTrainedModelrQ   clip)imagetextTc                 C   s~  | j j}t|tr3tj|jjd|d d tj|jjd|d d t	|j
t|j
jd d nt|trs| j j}tj|jd|jd | d tj|jj|j j| d tj|jj|j j| d t	|j
t|jd nt|tr| j j}|jd d|j j d  | }|jd | }tj|jj|d tj|jj|d tj|jj|d tj|jj|d n\t|tr| j j}|j jd d|j j d  | }d|j j d | }tj|jj|d tj|jj|d n&t|trtj|j j|j!d | j j d tj|j"j|j#d | j j d t|t$j%r&t&|j' t(|j t|t$j)r;|j'd	ur=t&|j' d	S d	S d	S )
zInitialize the weightsr   g{Gz?)meanstdrZ   rY   r   )r   rW   N)*rQ   initializer_factor
isinstancer   initnormal_r   r|   rl   copy_rX   r    r!   r{   rn   rP   re   r`   rh   initializer_rangerj   r   num_hidden_layersr   r   r   r   r   r_   r   r   CLIPSegModeltext_projectiontext_embed_dimvisual_projectionvision_embed_dimr   r   zeros_rV   ones_r   )r<   r   factorin_proj_stdout_proj_stdfc_stdr#   r#   r$   _init_weights  sP   
$


 z$CLIPSegPreTrainedModel._init_weightsN)rC   rD   rE   r   rH   base_model_prefixinput_modalitiessupports_gradient_checkpointingr    no_gradr   r#   r#   r#   r$   r     s   
 r   c                       sp   e Zd ZdZdef fddZe				ddejdB de	dB de	dB d	e	dB d
e
e deeB fddZ  ZS )CLIPSegEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`CLIPSegEncoderLayer`].

    Args:
        config: CLIPSegConfig
    rQ   c                    s:   t     | _t fddt jD | _d| _d S )Nc                       g | ]}t  qS r#   )r   r9   r   rQ   r#   r$   
<listcomp>      z+CLIPSegEncoder.__init__.<locals>.<listcomp>F)	r]   r^   rQ   r   
ModuleListranger   layersgradient_checkpointingro   rp   r  r$   r^     s   
 
zCLIPSegEncoder.__init__Nr   r   output_hidden_statesreturn_dictr   r   c                 K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|r"dnd}|r(dnd}|}	t| jD ]#\}
}|r<||	f }||	|fd|i|}|d }	|rT||d f }q1|r\||	f }t|	||dS )ad  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr#   r   r   r   )last_hidden_staterJ   rK   )rQ   r   r  use_return_dict	enumerater
  r   )r<   r   r   r   r  r  r   encoder_statesall_attentionsrJ   idxencoder_layerlayer_outputsr#   r#   r$   r     s6    

zCLIPSegEncoder.forward)NNNN)rC   rD   rE   rF   r   r^   r   r    r   r   r   r   r@   r   r   r   r#   r#   rp   r$   r    s*    r  c                       s|   e Zd Zdef fddZe						ddejdB dejdB dejdB dedB d	edB d
edB de	e
B fddZ  ZS )CLIPSegTextTransformerrQ   c                    sL   t  | |j}t|| _t|| _tj||j	d| _
|j| _|   d S r   )r]   r^   r_   r   rr   r  encoderr   r   r   final_layer_normeos_token_id	post_initr   rp   r#   r$   r^   %  s   

zCLIPSegTextTransformer.__init__Nr   r   rX   r   r  r  r   c              	   K   s~  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u r&td| }|d|d }| j||d}	t| j |	|t	j
|	jd |	jdd d}|dd  | jd|	||||dd	|}
|
d
 }| |}| jdkr|t	j
|jd
 |jd|jt	j|jdjddf }n|t	j
|jd
 |jd|jt	j|jd| jk jddf }|s||f|
dd   S t|||
j|
jdS )NzYou have to specify input_idsrZ   )r   rX   r   r   )rQ   r   r   cache_positionpast_key_valuesr   T)r   r   r   r  r  r   r   rW   )r   r   ry   r  pooler_outputrJ   rK   r#   )rQ   r   r  r  r   rv   r   rr   r	   r    r!   r{   r   popr  r  r  r   r   argmaxr   rJ   rK   )r<   r   r   rX   r   r  r  r   input_shaperJ   encoder_outputsr  rN   r#   r#   r$   r   2  sf   


	zCLIPSegTextTransformer.forwardNNNNNN)rC   rD   rE   r   r^   r   r    r   r   r@   r   r   r   r#   r#   rp   r$   r  $  s0    	r  c                       s   e Zd ZU eed< dZddgZdef fddZdej	fdd	Z
d
d Ze						ddejdB dejdB dejdB dedB dedB dedB deeB fddZ  ZS )CLIPSegTextModelrQ   )r   r   r   c                    "   t  | t|| _|   d S r   )r]   r^   r  
text_modelr  ro   rp   r#   r$   r^        
zCLIPSegTextModel.__init__r   c                 C   
   | j jjS r   r&  rr   r   r;   r#   r#   r$   get_input_embeddings     
z%CLIPSegTextModel.get_input_embeddingsc                 C   s   || j j_d S r   r)  )r<   r   r#   r#   r$   set_input_embeddings  s   z%CLIPSegTextModel.set_input_embeddingsNr   r   rX   r   r  r  c                 K   s   | j ||||||dS )a;  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPSegTextModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegTextModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```r   r   rX   r   r  r  )r&  )r<   r   r   rX   r   r  r  r   r#   r#   r$   r     s   zCLIPSegTextModel.forwardr#  )rC   rD   rE   r   rH   r   _no_split_modulesr^   r   Moduler*  r,  r   r    r   r   r@   r   r   r   r#   r#   rp   r$   r$    s:   
 	r$  c                       sl   e Zd Zdef fddZe				ddejdB dedB dedB d	edB d
edB de	e
B fddZ  ZS )CLIPSegVisionTransformerrQ   c                    sR   t    || _|j}t|| _tj||jd| _	t
|| _tj||jd| _d S r   )r]   r^   rQ   r_   rP   rr   r   r   r   pre_layrnormr  r  post_layernormr   rp   r#   r$   r^     s   


z!CLIPSegVisionTransformer.__init__NTr   r   r  r  r   r   c           
      C   s   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}| j||d}| |}| j||||d}|d }|d d dd d f }	| |	}	|sS||	f|dd   S t||	|j	|j
dS )N)r   )r   r   r  r  r   r   r  )rQ   r   r  r  rr   r1  r  r2  r   rJ   rK   )
r<   r   r   r  r  r   rJ   r"  r  rN   r#   r#   r$   r     s.   	

z CLIPSegVisionTransformer.forward)NNNT)rC   rD   rE   r   r^   r   r    rG   r   r@   r   r   r   r#   r#   rp   r$   r0    s(    
r0  c                       s   e Zd ZU eed< dZdZdef fddZdej	fddZ
e							
		ddejd	B ded	B ded	B ded	B ded	B deeB fddZ  ZS )CLIPSegVisionModelrQ   r   )r   c                    r%  r   )r]   r^   r0  vision_modelr  ro   rp   r#   r$   r^     r'  zCLIPSegVisionModel.__init__r   c                 C   r(  r   )r4  rr   rh   r;   r#   r#   r$   r*    r+  z'CLIPSegVisionModel.get_input_embeddingsNTr   r  r   r  c                 K   s   | j |||||dS )a+  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, CLIPSegVisionModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegVisionModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```r   r   r  r   r  )r4  )r<   r   r   r  r   r  r   r#   r#   r$   r     s    zCLIPSegVisionModel.forward)NNNTN)rC   rD   rE   r   rH   main_input_namer   r^   r   r/  r*  r   r    rG   r   r@   r   r   r   r#   r#   rp   r$   r3    s2   
 r3  c                       s  e Zd ZU eed< def fddZee		ddej	dej	dB dej	dB de
e d	eeB f
d
dZee	ddejdede
e d	eeB fddZe									ddejdB dejdB dej	dB dejdB dedB dedB dedB dededB d	eeB fddZ  ZS )r   rQ   c                    s   t  | t|jtstdt|j dt|jts(tdt|j d|j}|j}|j	|_	|j	|_	|j
| _
|j| _|j| _t|| _t|| _tj| j| j
dd| _tj| j| j
dd| _tt| jj| _|   d S )NzNconfig.text_config is expected to be of type CLIPSegTextConfig but is of type .zRconfig.vision_config is expected to be of type CLIPSegVisionConfig but is of type F)rV   )r]   r^   r   text_configr   	TypeErrortypevision_configr   r   projection_dimr_   r   r   r  r&  r0  r4  r   r   r   r   rc   r    tensorrQ   logit_scale_init_valuelogit_scaler  )r<   rQ   r8  r;  rp   r#   r$   r^   '  s4   

zCLIPSegModel.__init__Nr   r   rX   r   r   c                 K   s0   | j d|||dd|}|j}| ||_|S )a  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CLIPSegModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```T)r   r   rX   r  Nr#   )r&  r  r   )r<   r   r   rX   r   text_outputsrN   r#   r#   r$   get_text_featuresK  s   zCLIPSegModel.get_text_featuresTr   r   c                 K   s.   | j d||dd|}|j}| ||_|S )a  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPSegModel
        >>> from transformers.image_utils import load_image

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```T)r   r   r  Nr#   )r4  r  r   )r<   r   r   r   vision_outputsrN   r#   r#   r$   get_image_featuresn  s   zCLIPSegModel.get_image_featuresreturn_lossr   r  r  c
              	   K   s(  |dur|n| j j}|dur|n| j j}|	dur|	n| j j}	| j|||||	d}| j||||||	d}|d }| |}|d }| |}||jdddd }||jdddd }| j	
 }t|| | }| }d}|rtt|}|	s||||||f}|dur|f| S |S t|||||||d	S )
a  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPSegModel
        >>> from transformers.image_utils import load_image

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```Nr5  r-  r   rW   rZ   T)r   rz   keepdim)r,   r-   r.   r/   r0   r1   r2   )rQ   r   r  r  r4  r&  r   r   normr?  expr    r   r'   r*   r+   )r<   r   r   r   rX   rD  r   r  r   r  r   rB  r@  r0   r/   r?  r.   r-   r,   outputr#   r#   r$   r     sV   )	


zCLIPSegModel.forward)NNr   )	NNNNNNNTN)rC   rD   rE   r   rH   r^   r   r   r    r   r   r   r@   r   rA  rG   r   rC  r   r+   r   r   r#   r#   rp   r$   r   #  sz   
 $!$	
r   c                       sX   e Zd ZdZdef fddZ	ddejdejdejd	ed
B de	ej
 f
ddZ  ZS )CLIPSegDecoderLayerz
    CLIPSeg decoder layer, which is identical to `CLIPSegEncoderLayer`, except that normalization is applied after
    self-attention/MLP, rather than before.
    rQ   c                    r   r   r   ro   rp   r#   r$   r^     r   zCLIPSegDecoderLayer.__init__FrJ   r   causal_attention_maskr   Nr   c                 C   sd   |}| j ||||d\}}|| }| |}|}| |}|| }| |}|f}|r0||f7 }|S )r   )rJ   r   rJ  r   )r   r   r   r   )r<   rJ   r   rJ  r   r   r   r   r#   r#   r$   r     s"   




zCLIPSegDecoderLayer.forwardr   )rC   rD   rE   rF   r   r^   r    r   r   r@   rG   r   r   r#   r#   rp   r$   rI    s    rI  c                       s\   e Zd Zdef fddZ			ddeej dejdedB d	edB d
edB f
ddZ	  Z
S )CLIPSegDecoderrQ   c                    s`  t     j| _t j j| _t j j| _ j	r` j
jd  j
jd f}ttj j jdddt tj j jd |d |d dt tj jd d|d |d d| _ntj jd j
j j
jd| _t j}t fd	d
t|D | _t j
 j_ j_ j_d_tfdd
tt jD | _|   d S )N   r   r   )rT   paddingrW   r   )rT   rU   )rU   c                    s   g | ]}t  jj jqS r#   )r   r   r;  r_   
reduce_dimr  r  r#   r$   r  P  s    z+CLIPSegDecoder.__init__.<locals>.<listcomp>reluc                    r  r#   )rI  r  )decoder_configr#   r$   r  X  r  ) r]   r^   conditional_layerr   r   r<  rN  film_mulfilm_add"use_complex_transposed_convolutionr;  rb   
Sequentialrf   ReLUConvTranspose2dtransposed_convolutionr"   extract_layersr  r	  reducescopydeepcopyr_   decoder_num_attention_headsr   decoder_intermediate_sizer   r   r
  r  )r<   rQ   transposed_kernelsdepthrp   )rQ   rP  r$   r^   0  sD   
$zCLIPSegDecoder.__init__NTrJ   rM   r   r  r  c                 K   sp  |rdnd }|r
dnd }|d d d }	d }
t t|	| j| jD ]O\}\}}}|
d ur1|||
 }
n||}
|| jkrR| ||
ddd | | }
|
ddd}
||
d d |d}|d }
|re||
f7 }|rn||d f7 }q|
d d dd d d f ddd}
tt	
|
jd }|jd }|
||
jd ||}
| |
d}|stdd |||fD S t|||d	S )
Nr#   rZ   r   r   rW   )r   rJ  r   c                 s   s    | ]	}|d ur|V  qd S r   r#   )r9   vr#   r#   r$   r=     s    z)CLIPSegDecoder.forward.<locals>.<genexpr>)r   rJ   rK   )r  zipr
  rZ  rQ  rR  r   rS  r   mathsqrtr{   r   rX  squeezer@   rI   )r<   rJ   rM   r   r  r  r   all_hidden_statesr  activationsrH  i
activationlayerreducer  rv   r   r   r#   r#   r$   r   \  sD   	"

$
zCLIPSegDecoder.forward)NNT)rC   rD   rE   r   r^   r@   r    r   r   r   r   r#   r#   rp   r$   rK  /  s     0rK  zn
    CLIPSeg model with a Transformer-based decoder on top for zero-shot and one-shot image segmentation.
    )custom_introc                       s  e Zd ZU eed< def fddZ					ddedB dejdB dejdB dejdB d	ejdB f
d
dZ	e
											ddejdB dejdB d	ejdB dejdB dejdB dejdB dejdB dedB dedB dededB deeB fddZ  ZS )CLIPSegForImageSegmentationrQ   c                    s:   t  | || _t|| _|j| _t|| _|   d S r   )	r]   r^   rQ   r   r   rY  rK  decoderr  ro   rp   r#   r$   r^     s   

z$CLIPSegForImageSegmentation.__init__Nr   r   r   rX   conditional_pixel_valuesc                 C   s   |d ur/t ||krtdt  | jj|||dj}W d    |S 1 s(w   Y  |S |d ur[t ||kr=tdt  | j|j}W d    |S 1 sTw   Y  |S td)Nz@Make sure to pass as many prompt texts as there are query images)r   rX   zAMake sure to pass as many prompt images as there are query imagesz[Invalid conditional, should be either provided as `input_ids` or `conditional_pixel_values`)r"   r   r    r  r   rA  r  rC  )r<   r   r   r   rX   ro  rM   r#   r#   r$   get_conditional_embeddings  s2   



z6CLIPSegForImageSegmentation.get_conditional_embeddingsTr   rM   labelsr   r  r   r  r   c                    s  |dur|n| j j}t Q | jj||d|
|d}| j|d }|r'|jn|d   fdd| jD }|rHt	|j
|j|	rA|jnd|jd}n|	sV|dd |d	d  n|}W d   n1 sbw   Y  |du ry| j|jd
 ||||d}n|jd
 |jd
 krtd|jd | j jkrtd| j||||	|d}|r|jn|d
 }d}|dur||j}t }|||}|s|||||f}|dur|f| S |S t||||||dS )a~  
        conditional_pixel_values (`torch.FloatTensor`, *optional*):
            The pixel values of the conditional images.
        conditional_embeddings (`torch.FloatTensor` of shape `(batch_size, config.projection_dim)`, *optional*):
            The conditional embeddings for the query images. If provided, the model will use this instead of computing
            the embeddings from the conditional_pixel_values.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPSegForImageSegmentation
        >>> from transformers.image_utils import load_image

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> texts = ["a cat", "a remote", "a blanket"]
        >>> inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> print(logits.shape)
        torch.Size([3, 352, 352])
        ```NTr5  r   rW   c                    s   g | ]} |d   qS )r   r#   )r9   rh  rJ   r#   r$   r  	  s    z7CLIPSegForImageSegmentation.forward.<locals>.<listcomp>r  r   r   )r   r   r   rX   ro  zWMake sure to pass as many conditional embeddings as there are query images in the batchzcMake sure that the feature dimension of the conditional embeddings matches `config.projection_dim`.)r   r  r  )r,   r   rM   rN   r2   rO   )rQ   r  r    r  r   r4  r   rJ   rY  r   r  r  rK   rp  r{   r   r<  rn  r   r   r   r   BCEWithLogitsLossrL   )r<   r   r   ro  rM   r   rX   rq  r   r  r   r  r   rB  rN   rg  decoder_outputsr   r,   loss_fnrH  r#   rr  r$   r     s~   2

z#CLIPSegForImageSegmentation.forward)NNNNN)NNNNNNNNNTN)rC   rD   rE   r   rH   r^   r   r    r   rp  r   rG   r   r   r@   r+   r   r   r#   r#   rp   r$   rm    sp   
 
	
rm  )r   r   r$  r3  rm  )r   )CrF   r[  rc  collections.abcr   dataclassesr   typingr   r    r    r   r   rg  r   masking_utilsr	   modeling_layersr
   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r   configuration_clipsegr   r   r   
get_loggerrC   loggerr   r%   r*   r+   rI   rL   r/  rP   r   floatr   r   r   r   r   r  r  r$  r0  r3  r   rI  rK  rm  __all__r#   r#   r#   r$   <module>   s    
#T0
B24P^647 S9g 1