o
    i                     @   s  d Z ddlZddlmZ ddlmZmZmZ ddl	Z
ddlZddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZmZmZmZmZ ddlm Z m!Z!m"Z" e#e$Z%dej&dej&fddZ'dej&dej&fddZ(dej&de)fddZ*dLdej&de+de,de)dej&f
ddZ-dMdd Z.d!d" Z/G d#d$ d$ej0Z1G d%d& d&ej0Z2G d'd( d(ej0Z3eeG d)d* d*eZ4G d+d, d,ej0Z5G d-d. d.ej0Z6G d/d0 d0ej0Z7G d1d2 d2ej0Z8G d3d4 d4ej0Z9G d5d6 d6e9Z:G d7d8 d8ej0Z;G d9d: d:eZ<eG d;d< d<eZ=G d=d> d>ej0Z>G d?d@ d@ej0Z?G dAdB dBej0Z@G dCdD dDe=ZAG dEdF dFej0ZBG dGdH dHe=ZCeG dIdJ dJe=ZDg dKZEdS )NzPyTorch GroupViT model.    N)	dataclass)AnyOptionalUnion)nn   )ACT2FN) _create_4d_causal_attention_mask_prepare_4d_attention_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)PreTrainedModel)ModelOutputauto_docstringfilter_out_non_signature_kwargslogging	torch_int   )GroupViTConfigGroupViTTextConfigGroupViTVisionConfiglogitsreturnc                 C   s   t j| tjt| | jdS )Ndevice)r   
functionalcross_entropytorcharangelenr   )r    r!   k/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/groupvit/modeling_groupvit.pycontrastive_loss'   s   r#   
similarityc                 C   s    t | }t |  }|| d S )Ng       @)r#   t)r$   caption_loss
image_lossr!   r!   r"   groupvit_loss,   s   r(   dimc                 C   sJ   |  |}|j|ddd }tj| tjd||d}||  | }|S )NTkeepdimr   memory_format      ?)softmaxmaxr   
zeros_likelegacy_contiguous_formatscatter_detach)r   r)   y_softindexy_hardretr!   r!   r"   hard_softmax2   s
   
r9   Ftauhardc           
      C   s   t jjt jd| j| jdt jd| j| jd}|| j}| | | }|	|}|rK|j
|ddd }t j| t jd||d}||  | }	|	S |}	|	S )N        )r   dtyper.   Tr*   r   r,   )r   distributionsgumbelGumbeltensorr   r>   sampleshaper/   r0   r1   r2   r3   r4   )
r   r;   r<   r)   gumbel_distgumbelsr5   r6   r7   r8   r!   r!   r"   gumbel_softmax<   s   
rG   c           	      C   s   || | j d  d }||kr tt|| }| j d | }ntt|| }| j d | }| j d }| j d }| ||||} tjj| ||fd|d} | S )a  
    Args:
        attentions (`torch.Tensor`): attention map of shape [batch_size, groups, feat_height*feat_width]
        height (`int`): height of the output attention map
        width (`int`): width of the output attention map
        align_corners (`bool`, *optional*): the `align_corner` argument for `nn.functional.interpolate`.

    Returns:
        `torch.Tensor`: resized attention map of shape [batch_size, groups, height, width]
             ?r   r   bilinearsizemodealign_corners)rD   intnproundreshaper   r   interpolate)	
attentionsheightwidthrN   scale
feat_widthfeat_height
batch_sizegroupsr!   r!   r"   resize_attention_mapR   s   

r\   c                 C   s   g }t  7 d}| D ]*}|ddd }|du r|}n|| }t|ddd g|R  }|| qW d   n1 s@w   Y  |d }|S )a1  
    Args:
        attentions (`tuple(torch.FloatTensor)`: tuple of attention maps returned by `GroupViTVisionTransformer`
        hw_shape (`tuple(int)`): height and width of the output attention map
    Returns:
        `torch.Tensor`: the attention map of shape [batch_size, groups, height, width]
    Nr   rH   r   r:   )r   no_gradpermute
contiguousr\   append)rT   hw_shape	attn_mapsprev_attn_masks
attn_maskscur_attn_mapfinal_groupingr!   r!   r"   get_grouping_from_attentionsp   s   	
rg   c                       s*   e Zd Zdef fddZdd Z  ZS )GroupViTCrossAttentionLayerconfigc                    sJ   t    t|| _tj|j|jd| _t	|| _
tj|j|jd| _d S Neps)super__init__GroupViTAttentionattnr   	LayerNormhidden_sizelayer_norm_epsnorm2GroupViTMLPmlp	norm_postselfri   	__class__r!   r"   rn      s
   


z$GroupViTCrossAttentionLayer.__init__c                 C   s<   |}|| j ||dd  }|| | | }| |}|S )N)encoder_hidden_statesr   )rp   rv   rt   rw   )ry   querykeyxr!   r!   r"   forward   s
   
z#GroupViTCrossAttentionLayer.forward)__name__
__module____qualname__r   rn   r   __classcell__r!   r!   rz   r"   rh      s    rh   c                       s4   e Zd Zdef fddZd	ddZdd Z  ZS )
GroupViTAssignAttentionri   c                    sj   t    |jd | _t|j|j| _t|j|j| _t|j|j| _t|j|j| _	|j
| _
d S )N      )rm   rn   rr   rW   r   Linearq_projk_projv_projproj
assign_epsrx   rz   r!   r"   rn      s   
z GroupViTAssignAttention.__init__Tc                 C   sD   |r| j rt|d|d}|S |rt|dd}|S tjj|dd}|S )N)r)   r<   r)   )trainingrG   r9   r   r   r/   )ry   rp   r@   r<   r!   r!   r"   get_attn   s   
z GroupViTAssignAttention.get_attnc                 C   s   |}|  |}| |}| |}||dd | j }| |}| j|ddd}||jddd| j  }|| }| |}||fS )Nr   r:   F)r@   r<   Tr)   r+   )	r   r   r   	transposerW   r   sumr   r   )ry   r}   r~   valueraw_attnrp   	soft_attnoutr!   r!   r"   r      s   




zGroupViTAssignAttention.forward)TT)r   r   r   r   rn   r   r   r   r!   r!   rz   r"   r      s    

r   c                       s2   e Zd Zdef fddZdd Zdd Z  ZS )GroupViTTokenAssignri   c                    s   t    || _tj j jd| _t j	t
jjr j	n j	 j	f} fdd|D \}}t |||| _tj j jd| _tj j jd| _t | _t | _tj j jd| _t  j| j| _d S )Nrk   c                    s   g | ]	}t | j qS r!   )rO   rr   ).0r   ri   r!   r"   
<listcomp>   s    z0GroupViTTokenAssign.__init__.<locals>.<listcomp>)rm   rn   num_output_groupr   rq   rr   rs   norm_tokens
isinstanceassign_mlp_ratiocollectionsabcIterableGroupViTMixerMLP	mlp_internorm_post_tokensnorm_xrh   pre_assign_attnr   assign
norm_new_xru   mlp_channels)ry   ri   num_group_tokenr   r   
tokens_dimchannels_dimrz   r   r"   rn      s   



zGroupViTTokenAssign.__init__c                 C   s   |  |}| |}|S )z
        Args:
            group_tokens (torch.Tensor): group tokens, [batch_size, num_group_tokens, channels]

        Returns:
            projected_group_tokens (torch.Tensor): [batch_size, num_output_groups, channels]
        )r   r   )ry   group_tokensprojected_group_tokensr!   r!   r"   project_group_token   s   
	
z'GroupViTTokenAssign.project_group_tokenc                 C   s^   |  |}| |}| |}| ||}| ||\}}||7 }|| | | }||fS )z
        Args:
            image_tokens (`torch.Tensor`): image tokens, of shape [batch_size, input_length, channels]
            group_tokens (`torch.Tensor`): group tokens, [batch_size, num_group_tokens, channels]
        )r   r   r   r   r   r   r   )ry   image_tokensr   r   new_image_tokens	attentionr!   r!   r"   r      s   


zGroupViTTokenAssign.forward)r   r   r   r   rn   r   r   r   r!   r!   rz   r"   r      s    r   c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeej ed< dZeej ed< dZeej ed< dZeed	< dZeed
< dee fddZdS )GroupViTModelOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    segmentation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
        Classification scores for each pixel.

        <Tip warning={true}>

        The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
        to avoid doing two interpolations and lose some quality when a user needs to resize the logits to the
        original image size as post-processing. You should always check your logits shape and resize as needed.

        </Tip>
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of
        [`GroupViTTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of
        [`GroupViTVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`GroupViTTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`GroupViTVisionModel`].
    Nlosslogits_per_imagelogits_per_textsegmentation_logitstext_embedsimage_embedstext_model_outputvision_model_outputr   c                    s   t  fdd  D S )Nc                 3   s.    | ]}|d vr | nt  | V  qdS ))r   r   N)getattrto_tuple)r   kry   r!   r"   	<genexpr>0  s
    
z/GroupViTModelOutput.to_tuple.<locals>.<genexpr>)tuplekeysr   r!   r   r"   r   /  s   zGroupViTModelOutput.to_tuple)r   r   r   __doc__r   r   r   FloatTensor__annotations__r   r   r   r   r   r   r   r   r   r   r   r!   r!   r!   r"   r     s   
 r   c                	       sh   e Zd ZdZ				ddedeeeeef f ded	ef fd
dZddej	de
dej	fddZ  ZS )GroupViTPatchEmbeddingsz#
    Image to Patch Embedding.
          r      
image_size
patch_sizenum_channels	embed_dimc                    s   t    t|tjjr|n||f}t|tjjr|n||f}|d |d  |d |d   }|| _|| _|| _t	j
||||d| _d S )Nr   r   )kernel_sizestride)rm   rn   r   r   r   r   r   r   num_patchesr   Conv2d
projection)ry   r   r   r   r   r   rz   r!   r"   rn   ;  s   
 z GroupViTPatchEmbeddings.__init__Fpixel_valuesinterpolate_pos_encodingr   c              
   C   sx   |j \}}}}|s.|| jd ks|| jd kr.td| d| d| jd  d| jd  d	| |ddd}|S )Nr   r   zInput image size (*z) doesn't match model ().rH   )rD   r   
ValueErrorr   flattenr   )ry   r   r   rZ   r   rU   rV   r   r!   r!   r"   r   L  s   zGroupViTPatchEmbeddings.forward)r   r   r   r   F)r   r   r   r   rO   r   r   rn   r   Tensorboolr   r   r!   r!   rz   r"   r   6  s     $r   c                       s\   e Zd Zdef fddZdejdededejfdd	Zddejde	dejfddZ
  ZS )GroupViTVisionEmbeddingsri   c                    sx   t    t|j|j|j|jd| _| jj}t	
td||j| _t	|j| _t	j|j|jd| _|j| _|| _d S )N)r   r   r   r   r   rk   )rm   rn   r   r   r   r   rr   patch_embeddingsr   r   	Parameterr   zerosposition_embeddingsDropoutdropoutrq   rs   	layernormri   )ry   ri   r   rz   r!   r"   rn   Y  s   

z!GroupViTVisionEmbeddings.__init__
embeddingsrU   rV   r   c                 C   s   |j d }| jj d }tj s||kr||kr| jS | j}|j d }|| j }|| j }	t|d }
|d|
|
|}|dddd}t	j
j|||	fddd	}|dddddd|}|S )
a  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing and no class embeddings.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r:   rI   r   r   rH   bicubicFrK   )rD   r   r   jit
is_tracingr   r   rR   r^   r   r   rS   view)ry   r   rU   rV   r   num_positionspatch_pos_embedr)   
new_height	new_widthsqrt_num_positionsr!   r!   r"   r   i  s&   




z1GroupViTVisionEmbeddings.interpolate_pos_encodingFr   r   c           
      C   sd   |j \}}}}| j||d}| |}| \}}}	|r&|| ||| }n|| j }| |}|S )N)r   )rD   r   r   rL   r   r   r   )
ry   r   r   rZ   r   rU   rV   r   seq_len_r!   r!   r"   r     s   


z GroupViTVisionEmbeddings.forwardr   )r   r   r   r   rn   r   r   rO   r   r   r   r   r!   r!   rz   r"   r   X  s    $&r   c                	       sX   e Zd Zdef fddZ			ddeej deej deej dej	fd	d
Z
  ZS )GroupViTTextEmbeddingsri   c                    sR   t    |j}t|j|| _t|j|| _| j	dt
|jddd d S )Nposition_ids)r   r:   F)
persistent)rm   rn   rr   r   	Embedding
vocab_sizetoken_embeddingmax_position_embeddingsposition_embeddingregister_bufferr   r   expandry   ri   r   rz   r!   r"   rn     s   

zGroupViTTextEmbeddings.__init__N	input_idsr   inputs_embedsr   c                 C   s   |d ur	|j d n|j d }| jjj d }||kr#td| d| |d u r2| jd d d |f }|d u r;| |}| |}|| }|S )Nr:   r   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )rD   r   weightr   r   r   )ry   r   r   r   
seq_lengthmax_position_embeddingr   r   r!   r!   r"   r     s"   

zGroupViTTextEmbeddings.forwardNNN)r   r   r   r   rn   r   r   
LongTensorr   r   r   r   r!   r!   rz   r"   r     s    r   c                
       s   e Zd ZdZdededededef
 fddZed	d
 Zdd Z	dde
jdee
j de
jfddZ		dde
jdee
j dee dee
j fddZ  ZS )GroupViTStagezMThis corresponds to the `GroupingLayer` class in the GroupViT implementation.ri   depthnum_prev_group_tokenr   r   c                    s   t    || _|| _|dkrttd| j| _	nd | _	t
 fddt|D | _|dkr;t ||d| _nd | _|dkr^|dkr^ttj j jdt | jd || _d S d | _d S )Nr   r   c                       g | ]}t  qS r!   GroupViTEncoderLayerr   r   r   r!   r"   r         z*GroupViTStage.__init__.<locals>.<listcomp>)ri   r   r   rk   rH   )rm   rn   r  r   r   r   r   r   rr   group_token
ModuleListrangelayersr   
downsample
Sequentialrq   rs   r   group_projector)ry   ri   r  r  r   r   rz   r   r"   rn     s(   



zGroupViTStage.__init__c                 C   s
   | j d uS N)r  r   r!   r!   r"   with_group_token  s   
zGroupViTStage.with_group_tokenc                 C   s>   | j r|d d d | j f |d d | j d f fS |d fS r  )r  r   ry   r   r!   r!   r"   split_x  s   0zGroupViTStage.split_xNr   r  r   c                 C   s   |d u r|S t j||gddS )Nr   r   )r   cat)ry   r   r  r!   r!   r"   concat_x  s   zGroupViTStage.concat_xFhidden_statesprev_group_tokenoutput_attentionsc                 C   s   | j r| j|ddd}| jdur|| | }nd}|}| ||}| jD ]}||ddd}|d }q(| |\}}d}	| jdurL| ||\}}	||f}
|rW|
|	f }
|
S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the grouping tensors of Grouping block.
        r   r:   N)attention_maskcausal_attention_mask)	r  r  r   rL   r  r  r  r  r  )ry   r  r  r  r  r   cat_xlayer	layer_outr   outputsr!   r!   r"   r     s&   




zGroupViTStage.forwardr  )NF)r   r   r   r   r   rO   rn   propertyr  r  r   r   r   r  r   r   r   r   r   r!   r!   rz   r"   r    s8    "
"r  c                
       sX   e Zd Z			ddedee dee dee f fddZdejd	ejfd
dZ	  Z
S )ru   Nri   rr   intermediate_sizeoutput_sizec                    sp   t    || _t|j | _|d ur|n|j}|d ur|n|j}|d ur&|n|}t	||| _
t	||| _d S r  )rm   rn   ri   r   
hidden_actactivation_fnrr   r"  r   r   fc1fc2)ry   ri   rr   r"  r#  rz   r!   r"   rn   *  s   
zGroupViTMLP.__init__r  r   c                 C   s"   |  |}| |}| |}|S r  )r&  r%  r'  )ry   r  r!   r!   r"   r   :  s   


zGroupViTMLP.forwardr  )r   r   r   r   r   rO   rn   r   r   r   r   r!   r!   rz   r"   ru   )  s    ru   c                       s   e Zd Z fddZ  ZS )r   c                    s    t  |dd}|ddS Nr   rH   )rm   r   r   r  rz   r!   r"   r   B  s   zGroupViTMixerMLP.forward)r   r   r   r   r   r!   r!   rz   r"   r   A  s    r   c                       s   e Zd ZdZ fddZdejdedefddZ							
ddejde	ej de	ej de	ej
 de	e deeje	ej e	eej  f fddZ  ZS )ro   z=Multi-headed attention from 'Attention Is All You Need' paperc                    s   t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	|j
| _t| j| j| _t| j| j| _t| j| j| _t| j| j| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: r   r   )rm   rn   ri   rr   r   num_attention_heads	num_headshead_dimr   rW   attention_dropoutr   r   r   r   r   r   out_projrx   rz   r!   r"   rn   J  s"   

zGroupViTAttention.__init__rB   r   bszc                 C   s    | ||| j| jdd S r(  )r   r*  r+  r   r_   )ry   rB   r   r.  r!   r!   r"   _shape]  s    zGroupViTAttention._shapeNFr  r  r  r|   r  r   c                 C   s  |  \}}}|du}	| || j }
|	r*| | |d|}| | |d|}n| | |d|}| | |d|}|| j d| jf}| |
||j| }
|j| }|j| }| d}t	
|
|dd}|  || j ||fkrtd|| j ||f d|   |dur|  |d||fkrtd|d||f d|   ||| j||| }||| j ||}|dur|  |d||fkrtd|d||f d|   ||| j||| }||| j ||}tjj|dd}|r||| j||}||| j ||}nd}tjj|| j| jd	}t	
||}|  || j || jfkr@td
|| j|| jf d|   ||| j|| j}|dd}||||}| |}||fS )z#Input shape: Batch x Time x ChannelNr:   r   rH   z$Attention weights should be of size z	, but is z!Attention mask should be of size r   )pr   z `attn_output` should be of size )rL   r   rW   r/  r   r   r*  r+  r   r   bmmr   r   r   r   r/   r   r   rR   r-  )ry   r  r  r  r|   r  r.  tgt_lenr   is_cross_attentionquery_states
key_statesvalue_states
proj_shapesrc_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputr!   r!   r"   r   `  sl   




zGroupViTAttention.forward)NNNF)r   r   r   r   rn   r   r   rO   r/  r   r   r   r   r   r   r!   r!   rz   r"   ro   G  s*    ro   c                       sT   e Zd Zdef fddZ	ddejdejdejdee d	e	ej
 f
d
dZ  ZS )r  ri   c                    sR   t    |j| _t|| _tj| j|jd| _	t
|| _tj| j|jd| _d S rj   )rm   rn   rr   r   ro   	self_attnr   rq   rs   layer_norm1ru   rv   layer_norm2rx   rz   r!   r"   rn     s   


zGroupViTEncoderLayer.__init__Fr  r  r  r  r   c                 C   sd   |}|  |}| j||||d\}}|| }|}| |}| |}|| }|f}|r0||f7 }|S )aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r  r  r  r  )r>  r=  r?  rv   )ry   r  r  r  r  residualr9  r   r!   r!   r"   r     s"   




zGroupViTEncoderLayer.forwardr   )r   r   r   r   rn   r   r   r   r   r   r   r   r   r!   r!   rz   r"   r    s    r  c                   @   s&   e Zd ZU eed< dZdZdd ZdS )GroupViTPreTrainedModelri   groupvitTc                 C   s  | j j}t|tjtjfr"|jjjd|d |j	dur!|j	j
  nt|tjr5|j	j
  |jjd | j j}t|trX|jjjjd|d d |jjjjd|d d dS t|tr| j j}|jd d|j j d  | }|jd | }tjj|jj|d tjj|jj|d tjj|jj|d tjj|jj|d dS t|tr| j j}|j jd d|j j d  | }d|j j d | }tjj|jj|d tjj|jj|d dS dS )	zInitialize the weightsr=   )meanstdNr.   g{Gz?r   rH   )rD  )ri   initializer_ranger   r   r   r   r   datanormal_biaszero_rq   fill_initializer_factorr   r   r   ro   r   num_hidden_layersinitr   r   r   r-  ru   rr   r&  r'  )ry   module
init_rangefactorin_proj_stdout_proj_stdfc_stdr!   r!   r"   _init_weights  s8   



 z%GroupViTPreTrainedModel._init_weightsN)r   r   r   r   r   base_model_prefixsupports_gradient_checkpointingrT  r!   r!   r!   r"   rA    s
   
 rA  c                       sb   e Zd Zdeddf fddZ			ddejdee dee d	ee de	e
ef f
d
dZ  ZS )GroupViTVisionEncoderri   r   Nc                    s>   t     | _t fddtt jD | _d| _	d S )Nc              
      sF   g | ]}t   j|  j|  j| |d kr j|d  nd dqS )r   r   )ri   r  r   r   r  )r  depthsnum_group_tokensnum_output_groups)r   ir   r!   r"   r     s    z2GroupViTVisionEncoder.__init__.<locals>.<listcomp>F)
rm   rn   ri   r   r  r  r    rX  stagesgradient_checkpointingrx   rz   r   r"   rn     s   


zGroupViTVisionEncoder.__init__r  output_hidden_statesr  return_dictc                 C   s   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|r"dnd }|r(dnd }d }t| jD ](\}}	|r<||f }|	|||}
|
d }|
d }|rY|
d d urY||
d f }q1|ra||f }|sotdd |||fD S t|||dS )Nr!   r   r   rH   c                 s       | ]	}|d ur|V  qd S r  r!   r   vr!   r!   r"   r   D      z0GroupViTVisionEncoder.forward.<locals>.<genexpr>last_hidden_stater  rT   )ri   r  r^  use_return_dict	enumerater\  r   r   )ry   r  r^  r  r_  all_hidden_statesall_groupingsr   r[  stagelayer_outputsr!   r!   r"   r   "  s.   

zGroupViTVisionEncoder.forwardr  )r   r   r   r   rn   r   r   r   r   r   r   r   r   r   r!   r!   rz   r"   rW    s     
rW  c                       st   e Zd ZdZdef fddZ					ddeej deej dee	 d	ee	 d
ee	 de
eef fddZ  ZS )GroupViTTextEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self-attention layers. Each layer is a
    [`GroupViTEncoderLayer`].

    Args:
        config: GroupViTTextConfig
    ri   c                    s:   t     | _t fddt jD | _d| _d S )Nc                    r  r!   r  r	  r   r!   r"   r   V  r
  z0GroupViTTextEncoder.__init__.<locals>.<listcomp>F)	rm   rn   ri   r   r  r  rL  r  r]  rx   rz   r   r"   rn   S  s   
 
zGroupViTTextEncoder.__init__Nr  r  r  r^  r_  r   c                 C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|r"dnd}|r(dnd}|}	t| jD ] \}
}|r<||	f }||	|||d}|d }	|rQ||d f }q1|rY||	f }|sgtdd |	||fD S t|	||dS )	a  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr!   )r  r   r   c                 s   r`  r  r!   ra  r!   r!   r"   r     rc  z.GroupViTTextEncoder.forward.<locals>.<genexpr>rd  )ri   r  r^  rf  rg  r  r   r   )ry   r   r  r  r  r^  r_  encoder_statesall_attentionsr  idxencoder_layerrk  r!   r!   r"   r   Y  s6   &

zGroupViTTextEncoder.forward)NNNNN)r   r   r   r   r   rn   r   r   r   r   r   r   r   r   r   r!   r!   rz   r"   rl  J  s*    	
rl  c                       s   e Zd Zdef fddZe						ddeej deej deej dee	 d	ee	 d
ee	 de
eef fddZ  ZS )GroupViTTextTransformerri   c                    sH   t    || _|j}t|| _t|| _tj	||j
d| _|j| _d S rj   )rm   rn   ri   rr   r   r   rl  encoderr   rq   rs   final_layer_normeos_token_idr   rz   r!   r"   rn     s   


z GroupViTTextTransformer.__init__Nr   r  r   r  r^  r_  r   c                 C   sj  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u r&td| }|d|d }| j||d}t||j	|j
d}	|d urLt||j	}| j|||	|||d}
|
d }| |}| jdkr|tj|jd |j
d|jtj|j
djdd	f }n|tj|jd |j
d|jtj|j
d| jk jdd	f }|s||f|
d
d   S t|||
j|
jdS )NzYou have to specify input_idsr:   )r   r   r   )r   r  r  r  r^  r_  r   rH   )r>   r   r   r   re  pooler_outputr  rT   )ri   r  r^  rf  r   rL   r   r   r	   r>   r   r
   rr  rs  rt  r   r   rD   torO   argmaxr   r  rT   )ry   r   r  r   r  r^  r_  input_shaper  r  encoder_outputsre  pooled_outputr!   r!   r"   r     s\   

	

	zGroupViTTextTransformer.forwardNNNNNN)r   r   r   r   rn   r   r   r   r   r   r   r   r   r   r   r!   r!   rz   r"   rq    s0    
rq  c                       s   e Zd ZU eed< def fddZdejfddZdd Z	e
												dd
eej deej deej dee dee dee deeef fddZ  ZS )GroupViTTextModelri   c                    "   t  | t|| _|   d S r  )rm   rn   rq  
text_model	post_initrx   rz   r!   r"   rn        
zGroupViTTextModel.__init__r   c                 C   
   | j jjS r  r  r   r   r   r!   r!   r"   get_input_embeddings     
z&GroupViTTextModel.get_input_embeddingsc                 C   s   || j j_d S r  r  )ry   r   r!   r!   r"   set_input_embeddings
  s   z&GroupViTTextModel.set_input_embeddingsNr   r  r   r  r^  r_  c                 C   s   | j ||||||dS )a9  
        Examples:

        ```python
        >>> from transformers import CLIPTokenizer, GroupViTTextModel

        >>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> model = GroupViTTextModel.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```r   r  r   r  r^  r_  )r  )ry   r   r  r   r  r^  r_  r!   r!   r"   r     s   zGroupViTTextModel.forwardr|  )r   r   r   r   r   rn   r   Moduler  r  r   r   r   r   r   r   r   r   r   r   r!   r!   rz   r"   r}    s6   
 
r}  c                       sh   e Zd Zdef fddZe				ddeej dee	 dee	 dee	 d	e
eef f
d
dZ  ZS )GroupViTVisionTransformerri   c                    s@   t    || _|j}t|| _t|| _tj	||j
d| _d S rj   )rm   rn   ri   rr   r   r   rW  rr  r   rq   rs   r   r   rz   r!   r"   rn   1  s   


z"GroupViTVisionTransformer.__init__Nr   r^  r  r_  r   c           	      C   s   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u r&td| |}| j||||d}|d }| |}|jdd}|sO||f|dd   S t	|||j
|jdS )Nz You have to specify pixel_values)r  r^  r  r_  r   r   r   ru  )ri   r  r^  rf  r   r   rr  r   rC  r   r  rT   )	ry   r   r^  r  r_  r  rz  re  r{  r!   r!   r"   r   :  s0   

z!GroupViTVisionTransformer.forwardNNNN)r   r   r   r   rn   r   r   r   r   r   r   r   r   r   r   r!   r!   rz   r"   r  0  s$    	
r  c                       s   e Zd ZU eed< dZdef fddZdefddZe					dde
ej d	e
e d
e
e de
e deeef f
ddZ  ZS )GroupViTVisionModelri   r   c                    r~  r  )rm   rn   r  vision_modelr  rx   rz   r!   r"   rn   i  r  zGroupViTVisionModel.__init__r   c                 C   r  r  )r  r   r   r   r!   r!   r"   r  o  r  z(GroupViTVisionModel.get_input_embeddingsNr  r^  r_  c                 C   s   | j ||||dS )a  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, GroupViTVisionModel

        >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> model = GroupViTVisionModel.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```r   r  r^  r_  )r  )ry   r   r  r^  r_  r!   r!   r"   r   r  s   zGroupViTVisionModel.forwardr  )r   r   r   r   r   main_input_namern   r   r  r   r   r   r   r   r   r   r   r   r   r!   r!   rz   r"   r  e  s*   
 
r  c                       s  e Zd ZU eed< def fddZe e		ddej	de
ej	 de
ej	 dejfd	d
Ze edej	dejfddZe									dde
ej de
ej de
ej	 de
ej de
e de
e de
e de
e de
e deeef fddZ  ZS )GroupViTModelri   c              
      s6  t  | t|jtstdt|j dt|jts(tdt|j d|j}|j}|j	| _	|j
| _
|j| _|j| _t|| _t|| _ttj| j| j
ddt| j
tjddtj| j
| j	dd| _ttj| j| j
ddt| j
tjddtj| j
| j	dd| _tt| jj| _|   d S )NzOconfig.text_config is expected to be of type GroupViTTextConfig but is of type .zSconfig.vision_config is expected to be of type GroupViTVisionConfig but is of type T)rH  )inplace) rm   rn   r   text_configr   	TypeErrortypevision_configr   projection_dimprojection_intermediate_dimrr   text_embed_dimvision_embed_dimrq  r  r  r  r   r  r   BatchNorm1dReLUvisual_projectiontext_projectionr   r   rB   ri   logit_scale_init_valuelogit_scaler  )ry   ri   r  r  rz   r!   r"   rn     sF   





zGroupViTModel.__init__Nr   r  r   r   c                 C   s    | j |||d}| |j}|S )a  
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`GroupViTTextModel`].

        Examples:

        ```python
        >>> import torch
        >>> from transformers import CLIPTokenizer, GroupViTModel

        >>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```)r   r  r   )r  r  rv  )ry   r   r  r   text_outputstext_featuresr!   r!   r"   get_text_features  s   zGroupViTModel.get_text_featuresr   c                 C   s   |  |}| |j}|S )am  
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`GroupViTVisionModel`].

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, GroupViTModel
        >>> from transformers.image_utils import load_image

        >>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```)r  r  rv  )ry   r   vision_outputsimage_featuresr!   r!   r"   get_image_features  s   
z GroupViTModel.get_image_featuresreturn_lossr  r^  output_segmentationr_  c
              
   C   sP  |dur|n| j j}|dur|n| j j}|rd}|dur|n| j j}|	dur(|	n| j j}	| j||||	d}
| j||||||	d}|
d }| |}|d }| |}||j	ddd }||j	ddd }| j
 }t|| | }| }d}|r|
d }| |d|jd }|r|
d	 }n|
d
 }t||jd
d }||j	ddd }t|| | }||jd d|jd dd
d}||jd |jd d}t||| }||jd |jd |jd
 |jd	 }d}|rt|}|	s|dur|||||||
f}n||||||
f}|dur|f| S |S t||||||||
dS )aM  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        output_segmentation (`bool`, *optional*):
            Whether or not to return the segmentation logits.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, GroupViTModel

        >>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```NTr  r  r   r:   r   r   r   rH   )r   r   r   r   r   r   r   r   )ri   r  r  r^  rf  r  r  r  r  normr  expr   matmulr%   rR   rD   rg   r^   r(   r   )ry   r   r   r  r   r  r  r^  r  r_  r  r  r   r   r  r   r   
seg_logitsimage_group_embedsrT   groupinglogits_per_image_groupflatten_groupingr   outputr!   r!   r"   r     s   )	




 

zGroupViTModel.forward)NN)	NNNNNNNNN)r   r   r   r   r   rn   r   r   r   r   r   r   r  r  r  r   r   r   r   r   r   r!   r!   rz   r"   r    sd   
 + 	

r  )r  rA  r}  r  )r   Fr:   r   )Fr   collections.abcr   dataclassesr   typingr   r   r   numpyrP   r   r   activationsr   modeling_attn_mask_utilsr	   r
   modeling_layersr   modeling_outputsr   r   modeling_utilsr   utilsr   r   r   r   r   configuration_groupvitr   r   r   
get_loggerr   loggerr   r#   r(   rO   r9   floatr   rG   r\   rg   r  rh   r   r   r   r   r   r   r  ru   r   ro   r  rA  rW  rl  rq  r}  r  r  r  __all__r!   r!   r!   r"   <module>   s`   
$

070"K(^o2':X\251  