o
    ei                     @   s  d Z ddlZddlmZ ddlmZ ddlZddl	Z	ddl	m
Z
 ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZmZmZm Z m!Z! ddl"m#Z#m$Z$m%Z% e &e'Z(de	j)de	j)fddZ*de	j)de	j)fddZ+de	j)de,fddZ-dNde	j)de.de/de,de	j)f
dd Z0dOd!d"Z1d#d$ Z2G d%d& d&e
j3Z4G d'd( d(e
j3Z5G d)d* d*e
j3Z6eeG d+d, d,eZ7G d-d. d.e
j3Z8G d/d0 d0e
j3Z9G d1d2 d2e
j3Z:G d3d4 d4e
j3Z;G d5d6 d6e
j3Z<G d7d8 d8e<Z=G d9d: d:e
j3Z>G d;d< d<eZ?eG d=d> d>eZ@G d?d@ d@e
j3ZAG dAdB dBe
j3ZBG dCdD dDe@ZCG dEdF dFe@ZDG dGdH dHe
j3ZEG dIdJ dJe@ZFeG dKdL dLe@ZGg dMZHdS )PzPyTorch GroupViT model.    N)	dataclass)Any)nn   )initialization)ACT2FN)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)PreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging	torch_int   )GroupViTConfigGroupViTTextConfigGroupViTVisionConfiglogitsreturnc                 C   s   t j| tjt| | jdS )Ndevice)r   
functionalcross_entropytorcharangelenr   )r    r!   l/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/groupvit/modeling_groupvit.pycontrastive_loss(   s   r#   
similarityc                 C   s    t | }t |  }|| d S )Ng       @)r#   t)r$   caption_loss
image_lossr!   r!   r"   groupvit_loss-   s   r(   dimc                 C   sJ   |  |}|j|ddd }tj| tjd||d}||  | }|S )NTkeepdimr   memory_format      ?)softmaxmaxr   
zeros_likelegacy_contiguous_formatscatter_detach)r   r)   y_softindexy_hardretr!   r!   r"   hard_softmax3   s
   
r9   Ftauhardc           
      C   s   t jjt jd| j| jdt jd| j| jd}|| j}| | | }|	|}|rK|j
|ddd }t j| t jd||d}||  | }	|	S |}	|	S )N        )r   dtyper.   Tr*   r   r,   )r   distributionsgumbelGumbeltensorr   r>   sampleshaper/   r0   r1   r2   r3   r4   )
r   r;   r<   r)   gumbel_distgumbelsr5   r6   r7   r8   r!   r!   r"   gumbel_softmax=   s   
rG   c           	      C   s   || | j d  d }||kr tt|| }| j d | }ntt|| }| j d | }| j d }| j d }| ||||} tjj| ||fd|d} | S )a  
    Args:
        attentions (`torch.Tensor`): attention map of shape [batch_size, groups, feat_height*feat_width]
        height (`int`): height of the output attention map
        width (`int`): width of the output attention map
        align_corners (`bool`, *optional*): the `align_corner` argument for `nn.functional.interpolate`.

    Returns:
        `torch.Tensor`: resized attention map of shape [batch_size, groups, height, width]
             ?r   r   bilinearsizemodealign_corners)rD   intnproundreshaper   r   interpolate)	
attentionsheightwidthrN   scale
feat_widthfeat_height
batch_sizegroupsr!   r!   r"   resize_attention_mapS   s   

r\   c                 C   s   g }t  7 d}| D ]*}|ddd }|du r|}n|| }t|ddd g|R  }|| qW d   n1 s@w   Y  |d }|S )a1  
    Args:
        attentions (`tuple(torch.FloatTensor)`: tuple of attention maps returned by `GroupViTVisionTransformer`
        hw_shape (`tuple(int)`): height and width of the output attention map
    Returns:
        `torch.Tensor`: the attention map of shape [batch_size, groups, height, width]
    Nr   rH   r   r:   )r   no_gradpermute
contiguousr\   append)rT   hw_shape	attn_mapsprev_attn_masks
attn_maskscur_attn_mapfinal_groupingr!   r!   r"   get_grouping_from_attentionsq   s   	
rg   c                       s*   e Zd Zdef fddZdd Z  ZS )GroupViTCrossAttentionLayerconfigc                    sJ   t    t|| _tj|j|jd| _t	|| _
tj|j|jd| _d S Neps)super__init__GroupViTAttentionattnr   	LayerNormhidden_sizelayer_norm_epsnorm2GroupViTMLPmlp	norm_postselfri   	__class__r!   r"   rn      s
   


z$GroupViTCrossAttentionLayer.__init__c                 C   s<   |}|| j ||dd  }|| | | }| |}|S )N)encoder_hidden_statesr   )rp   rv   rt   rw   )ry   querykeyxr!   r!   r"   forward   s
   
z#GroupViTCrossAttentionLayer.forward)__name__
__module____qualname__r   rn   r   __classcell__r!   r!   rz   r"   rh      s    rh   c                       s4   e Zd Zdef fddZd	ddZdd Z  ZS )
GroupViTAssignAttentionri   c                    sj   t    |jd | _t|j|j| _t|j|j| _t|j|j| _t|j|j| _	|j
| _
d S )N      )rm   rn   rr   rW   r   Linearq_projk_projv_projproj
assign_epsrx   rz   r!   r"   rn      s   
z GroupViTAssignAttention.__init__Tc                 C   sD   |r| j rt|d|d}|S |rt|dd}|S tjj|dd}|S )N)r)   r<   r)   )trainingrG   r9   r   r   r/   )ry   rp   r@   r<   r!   r!   r"   get_attn   s   
z GroupViTAssignAttention.get_attnc                 C   s   |}|  |}| |}| |}||dd | j }| |}| j|ddd}||jddd| j  }|| }| |}||fS )Nr   r:   F)r@   r<   Tr)   r+   )	r   r   r   	transposerW   r   sumr   r   )ry   r}   r~   valueraw_attnrp   	soft_attnoutr!   r!   r"   r      s   




zGroupViTAssignAttention.forward)TT)r   r   r   r   rn   r   r   r   r!   r!   rz   r"   r      s    

r   c                       s2   e Zd Zdef fddZdd Zdd Z  ZS )GroupViTTokenAssignri   c                    s   t    || _tj j jd| _t j	t
jjr j	n j	 j	f} fdd|D \}}t |||| _tj j jd| _tj j jd| _t | _t | _tj j jd| _t  j| j| _d S )Nrk   c                    s   g | ]	}t | j qS r!   )rO   rr   ).0r   ri   r!   r"   
<listcomp>   s    z0GroupViTTokenAssign.__init__.<locals>.<listcomp>)rm   rn   num_output_groupr   rq   rr   rs   norm_tokens
isinstanceassign_mlp_ratiocollectionsabcIterableGroupViTMixerMLP	mlp_internorm_post_tokensnorm_xrh   pre_assign_attnr   assign
norm_new_xru   mlp_channels)ry   ri   num_group_tokenr   r   
tokens_dimchannels_dimrz   r   r"   rn      s   



zGroupViTTokenAssign.__init__c                 C   s   |  |}| |}|S )z
        Args:
            group_tokens (torch.Tensor): group tokens, [batch_size, num_group_tokens, channels]

        Returns:
            projected_group_tokens (torch.Tensor): [batch_size, num_output_groups, channels]
        )r   r   )ry   group_tokensprojected_group_tokensr!   r!   r"   project_group_token   s   
	
z'GroupViTTokenAssign.project_group_tokenc                 C   s^   |  |}| |}| |}| ||}| ||\}}||7 }|| | | }||fS )z
        Args:
            image_tokens (`torch.Tensor`): image tokens, of shape [batch_size, input_length, channels]
            group_tokens (`torch.Tensor`): group tokens, [batch_size, num_group_tokens, channels]
        )r   r   r   r   r   r   r   )ry   image_tokensr   r   new_image_tokens	attentionr!   r!   r"   r      s   


zGroupViTTokenAssign.forward)r   r   r   r   rn   r   r   r   r!   r!   rz   r"   r      s    r   c                   @   s   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	ejdB ed< dZ
ejdB ed< dZejdB ed< dZejdB ed< dZeed	< dZeed
< dee fddZdS )GroupViTModelOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    segmentation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
        Classification scores for each pixel.

        <Tip warning={true}>

        The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
        to avoid doing two interpolations and lose some quality when a user needs to resize the logits to the
        original image size as post-processing. You should always check your logits shape and resize as needed.

        </Tip>
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of
        [`GroupViTTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of
        [`GroupViTVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`GroupViTTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`GroupViTVisionModel`].
    Nlosslogits_per_imagelogits_per_textsegmentation_logitstext_embedsimage_embedstext_model_outputvision_model_outputr   c                    s   t  fdd  D S )Nc                 3   s.    | ]}|d vr | nt  | V  qdS ))r   r   N)getattrto_tuple)r   kry   r!   r"   	<genexpr>1  s
    
z/GroupViTModelOutput.to_tuple.<locals>.<genexpr>)tuplekeysr   r!   r   r"   r   0  s   zGroupViTModelOutput.to_tuple)r   r   r   __doc__r   r   FloatTensor__annotations__r   r   r   r   r   r   r   r   r   r   r   r!   r!   r!   r"   r     s   
 r   c                	       sd   e Zd ZdZ				ddedeeeef B ded	ef fd
dZddejde	dejfddZ
  ZS )GroupViTPatchEmbeddingsz#
    Image to Patch Embedding.
          r      
image_size
patch_sizenum_channels	embed_dimc                    s   t    t|tjjr|n||f}t|tjjr|n||f}|d |d  |d |d   }|| _|| _|| _t	j
||||d| _d S )Nr   r   )kernel_sizestride)rm   rn   r   r   r   r   r   r   num_patchesr   Conv2d
projection)ry   r   r   r   r   r   rz   r!   r"   rn   <  s   
 z GroupViTPatchEmbeddings.__init__Fpixel_valuesinterpolate_pos_encodingr   c              
   C   sx   |j \}}}}|s.|| jd ks|| jd kr.td| d| d| jd  d| jd  d	| |ddd}|S )Nr   r   zInput image size (*z) doesn't match model ().rH   )rD   r   
ValueErrorr   flattenr   )ry   r   r   rZ   r   rU   rV   r   r!   r!   r"   r   M  s   zGroupViTPatchEmbeddings.forward)r   r   r   r   F)r   r   r   r   rO   r   rn   r   Tensorboolr   r   r!   r!   rz   r"   r   7  s     $r   c                       s\   e Zd Zdef fddZdejdededejfdd	Zddejde	dejfddZ
  ZS )GroupViTVisionEmbeddingsri   c                    sx   t    t|j|j|j|jd| _| jj}t	
td||j| _t	|j| _t	j|j|jd| _|j| _|| _d S )N)r   r   r   r   r   rk   )rm   rn   r   r   r   r   rr   patch_embeddingsr   r   	Parameterr   zerosposition_embeddingsDropoutdropoutrq   rs   	layernormri   )ry   ri   r   rz   r!   r"   rn   Z  s   

z!GroupViTVisionEmbeddings.__init__
embeddingsrU   rV   r   c                 C   s   |j d }| jj d }tj s||kr||kr| jS | j}|j d }|| j }|| j }	t|d }
|d|
|
|}|dddd}t	j
j|||	fddd	}|dddddd|}|S )
a  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing and no class embeddings.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r:   rI   r   r   rH   bicubicFrK   )rD   r   r   jit
is_tracingr   r   rR   r^   r   r   rS   view)ry   r   rU   rV   r   num_positionspatch_pos_embedr)   
new_height	new_widthsqrt_num_positionsr!   r!   r"   r   j  s&   




z1GroupViTVisionEmbeddings.interpolate_pos_encodingFr   r   c           
      C   sd   |j \}}}}| j||d}| |}| \}}}	|r&|| ||| }n|| j }| |}|S )N)r   )rD   r   r   rL   r   r   r   )
ry   r   r   rZ   r   rU   rV   r   seq_len_r!   r!   r"   r     s   


z GroupViTVisionEmbeddings.forwardr   )r   r   r   r   rn   r   r   rO   r   r   r   r   r!   r!   rz   r"   r   Y  s    $&r   c                	       sX   e Zd Zdef fddZ			ddejdB dejdB dejdB dejfd	d
Z	  Z
S )GroupViTTextEmbeddingsri   c                    sR   t    |j}t|j|| _t|j|| _| j	dt
|jddd d S )Nposition_idsr   r:   F)
persistent)rm   rn   rr   r   	Embedding
vocab_sizetoken_embeddingmax_position_embeddingsposition_embeddingregister_bufferr   r   expandry   ri   r   rz   r!   r"   rn     s   

zGroupViTTextEmbeddings.__init__N	input_idsr   inputs_embedsr   c                 C   s   |d ur	|j d n|j d }| jjj d }||kr#td| d| |d u r2| jd d d |f }|d u r;| |}| |}|| }|S )Nr:   r   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )rD   r   weightr   r   r   )ry   r   r   r   
seq_lengthmax_position_embeddingr   r   r!   r!   r"   r     s"   

zGroupViTTextEmbeddings.forwardNNN)r   r   r   r   rn   r   
LongTensorr   r   r   r   r!   r!   rz   r"   r     s    r   c                
       s   e Zd ZdZdededededef
 fddZed	d
 Zdd Z	dde
jde
jdB de
jfddZ		dde
jde
jdB dedB dee
j fddZ  ZS )GroupViTStagezMThis corresponds to the `GroupingLayer` class in the GroupViT implementation.ri   depthnum_prev_group_tokenr   r   c                    s   t    || _|| _|dkrttd| j| _	nd | _	t
 fddt|D | _|dkr;t ||d| _nd | _|dkr^|dkr^ttj j jdt | jd || _d S d | _d S )Nr   r   c                       g | ]}t  qS r!   GroupViTEncoderLayerr   r   r   r!   r"   r         z*GroupViTStage.__init__.<locals>.<listcomp>)ri   r   r   rk   rH   )rm   rn   r  r   r   r   r   r   rr   group_token
ModuleListrangelayersr   
downsample
Sequentialrq   rs   r   group_projector)ry   ri   r  r  r   r   rz   r   r"   rn     s(   



zGroupViTStage.__init__c                 C   s
   | j d uS N)r  r   r!   r!   r"   with_group_token  s   
zGroupViTStage.with_group_tokenc                 C   s>   | j r|d d d | j f |d d | j d f fS |d fS r  )r  r   ry   r   r!   r!   r"   split_x  s   0zGroupViTStage.split_xNr   r  r   c                 C   s   |d u r|S t j||gddS )Nr   r   )r   cat)ry   r   r  r!   r!   r"   concat_x  s   zGroupViTStage.concat_xFhidden_statesprev_group_tokenoutput_attentionsc                 C   s   | j r| j|ddd}| jdur|| | }nd}|}| ||}| jD ]}||dd}|d }q(| |\}}d}	| jdurK| ||\}}	||f}
|rV|
|	f }
|
S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the grouping tensors of Grouping block.
        r   r:   N)attention_mask)	r  r  r   rL   r  r  r  r  r  )ry   r  r  r  r  r   cat_xlayer	layer_outr   outputsr!   r!   r"   r      s&   




zGroupViTStage.forwardr  )NF)r   r   r   r   r   rO   rn   propertyr  r  r   r   r  r   r   r   r   r   r!   r!   rz   r"   r    s8    "
"r  c                
       sX   e Zd Z			ddededB dedB dedB f fddZdejd	ejfd
dZ  Z	S )ru   Nri   rr   intermediate_sizeoutput_sizec                    sp   t    || _t|j | _|d ur|n|j}|d ur|n|j}|d ur&|n|}t	||| _
t	||| _d S r  )rm   rn   ri   r   
hidden_actactivation_fnrr   r"  r   r   fc1fc2)ry   ri   rr   r"  r#  rz   r!   r"   rn   +  s   
zGroupViTMLP.__init__r  r   c                 C   s"   |  |}| |}| |}|S r  )r&  r%  r'  )ry   r  r!   r!   r"   r   ;  s   


zGroupViTMLP.forwardr  )
r   r   r   r   rO   rn   r   r   r   r   r!   r!   rz   r"   ru   *  s    ru   c                       s   e Zd Z fddZ  ZS )r   c                    s    t  |dd}|ddS Nr   rH   )rm   r   r   r  rz   r!   r"   r   C  s   zGroupViTMixerMLP.forward)r   r   r   r   r   r!   r!   rz   r"   r   B  s    r   c                       s   e Zd ZdZ fddZdejdedefddZ					
ddejdejd	B dej	d	B de
d	B deejejd	B eej d	B f f
ddZ  ZS )ro   z=Multi-headed attention from 'Attention Is All You Need' paperc                    s   t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	|j
| _t| j| j| _t| j| j| _t| j| j| _t| j| j| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: r   r   )rm   rn   ri   rr   r   num_attention_heads	num_headshead_dimr   rW   attention_dropoutr   r   r   r   r   r   out_projrx   rz   r!   r"   rn   K  s"   

zGroupViTAttention.__init__rB   r   bszc                 C   s    | ||| j| jdd S r(  )r   r*  r+  r   r_   )ry   rB   r   r.  r!   r!   r"   _shape^  s    zGroupViTAttention._shapeNFr  r  r|   r  r   c                 K   sX  |  \}}}|du}	| || j }
|	r*| | |d|}| | |d|}n| | |d|}| | |d|}|| j d| jf}| |
||j| }
|j| }|j| }| d}t	
|
|dd}|  || j ||fkrtd|| j ||f d|   |dur|  |d||fkrtd|d||f d|   ||| j||| }||| j ||}tjj|dd}|r||| j||}||| j ||}nd}tjj|| j| jd	}t	
||}|  || j || jfkrtd
|| j|| jf d|   ||| j|| j}|dd}||||}| |}||fS )z#Input shape: Batch x Time x ChannelNr:   r   rH   z$Attention weights should be of size z	, but is z!Attention mask should be of size r   )pr   z `attn_output` should be of size )rL   r   rW   r/  r   r   r*  r+  r   r   bmmr   r   r   r   r/   r   r   rR   r-  )ry   r  r  r|   r  kwargsr.  tgt_lenr   is_cross_attentionquery_states
key_statesvalue_states
proj_shapesrc_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputr!   r!   r"   r   a  sZ   




zGroupViTAttention.forward)NNF)r   r   r   r   rn   r   r   rO   r/  r   r   r   r   r   r!   r!   rz   r"   ro   H  s$    ro   c                       sV   e Zd Zdef fddZ	ddejdejdedB d	ee	 d
e
ej f
ddZ  ZS )r	  ri   c                    sR   t    |j| _t|| _tj| j|jd| _	t
|| _tj| j|jd| _d S rj   )rm   rn   rr   r   ro   	self_attnr   rq   rs   layer_norm1ru   rv   layer_norm2rx   rz   r!   r"   rn     s   


zGroupViTEncoderLayer.__init__Fr  r  r  Nr2  r   c                 K   sj   |}|  |}| jd|||d|\}}|| }|}| |}| |}|| }|f}|r3||f7 }|S )aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r  r  r  Nr!   )r?  r>  r@  rv   )ry   r  r  r  r2  residualr:  r   r!   r!   r"   r     s$   




zGroupViTEncoderLayer.forwardr   )r   r   r   r   rn   r   r   r   r   r   r   r   r   r   r!   r!   rz   r"   r	    s    r	  c                   @   s2   e Zd ZU eed< dZdZdZe	 dd Z
dS )GroupViTPreTrainedModelri   groupvit)imagetextTc                 C   s  | j j}t|tjtjfr"tj|jd|d |j	dur!t
|j	 n/t|tjtjfrQt
|j	 t|j t|dddurQt
|j t|j t
|j | j j}t|trtj|jjd|d d tj|jjd|d d t|jt|jjd d dS t|tr| j j}|jd d	|j j d  | }|jd | }tj|jj|d
 tj|j j|d
 tj|j!j|d
 tj|j"j|d
 dS t|t#r| j j}|j j$d d	|j j d  | }d	|j j$ d | }tj|j%j|d
 tj|j&j|d
 dS dS )zInitialize the weightsr=   )meanstdNrunning_meang{Gz?r:   r   r   rH   )rG  )'ri   initializer_ranger   r   r   r   initnormal_r   biaszeros_rq   BatchNorm1dones_r   rH  running_varnum_batches_trackedinitializer_factorr   r   r   copy_r   r   r   rD   r   ro   r   num_hidden_layersr   r   r   r-  ru   rr   r&  r'  )ry   module
init_rangefactorin_proj_stdout_proj_stdfc_stdr!   r!   r"   _init_weights  sB   

&
 z%GroupViTPreTrainedModel._init_weightsN)r   r   r   r   r   base_model_prefixinput_modalitiessupports_gradient_checkpointingr   r]   r[  r!   r!   r!   r"   rB    s   
 rB  c                       s^   e Zd Zdeddf fddZ			ddejdedB dedB d	edB dee	B f
d
dZ
  ZS )GroupViTVisionEncoderri   r   Nc                    s>   t     | _t fddtt jD | _d| _	d S )Nc              
      sF   g | ]}t   j|  j|  j| |d kr j|d  nd dqS )r   r   )ri   r  r   r   r  )r  depthsnum_group_tokensnum_output_groups)r   ir   r!   r"   r     s    z2GroupViTVisionEncoder.__init__.<locals>.<listcomp>F)
rm   rn   ri   r   r  r  r    r`  stagesgradient_checkpointingrx   rz   r   r"   rn     s   


zGroupViTVisionEncoder.__init__r  output_hidden_statesr  return_dictc                 C   s   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|r"dnd }|r(dnd }d }t| jD ](\}}	|r<||f }|	|||}
|
d }|
d }|rY|
d d urY||
d f }q1|ra||f }|sotdd |||fD S t|||dS )Nr!   r   r   rH   c                 s       | ]	}|d ur|V  qd S r  r!   r   vr!   r!   r"   r   @      z0GroupViTVisionEncoder.forward.<locals>.<genexpr>last_hidden_stater  rT   )ri   r  rf  use_return_dict	enumeraterd  r   r
   )ry   r  rf  r  rg  all_hidden_statesall_groupingsr   rc  stagelayer_outputsr!   r!   r"   r     s.   

zGroupViTVisionEncoder.forwardr  )r   r   r   r   rn   r   r   r   r   r
   r   r   r!   r!   rz   r"   r_    s     r_  c                       sd   e Zd ZdZdef fddZ				ddejdB dedB dedB d	edB d
e	e
B f
ddZ  ZS )GroupViTTextEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self-attention layers. Each layer is a
    [`GroupViTEncoderLayer`].

    Args:
        config: GroupViTTextConfig
    ri   c                    s:   t     | _t fddt jD | _d| _d S )Nc                    r  r!   r  r
  r   r!   r"   r   R  r  z0GroupViTTextEncoder.__init__.<locals>.<listcomp>F)	rm   rn   ri   r   r  r  rT  r  re  rx   rz   r   r"   rn   O  s   
 
zGroupViTTextEncoder.__init__Nr  r  rf  rg  r   c                 K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|r"dnd}|r(dnd}|}	t| jD ]#\}
}|r<||	f }||	|fd|i|}|d }	|rT||d f }q1|r\||	f }|sjtdd |	||fD S t|	||dS )	ad  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr!   r  r   r   c                 s   rh  r  r!   ri  r!   r!   r"   r     rk  z.GroupViTTextEncoder.forward.<locals>.<genexpr>rl  )ri   r  rf  rn  ro  r  r   r
   )ry   r   r  r  rf  rg  r2  encoder_statesall_attentionsr  idxencoder_layerrs  r!   r!   r"   r   U  s:   

zGroupViTTextEncoder.forwardNNNN)r   r   r   r   r   rn   r   r   r   r   r
   r   r   r!   r!   rz   r"   rt  F  s$    	rt  c                       s|   e Zd Zdef fddZe						ddejdB dejdB dejdB dedB d	edB d
edB de	e
B fddZ  ZS )GroupViTTextTransformerri   c                    sL   t  | |j}t|| _t|| _tj||j	d| _
|j| _|   d S rj   )rm   rn   rr   r   r   rt  encoderr   rq   rs   final_layer_normeos_token_id	post_initr   rz   r!   r"   rn     s   

z GroupViTTextTransformer.__init__Nr   r  r   r  rf  rg  r   c              	   K   s~  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u r&td| }|d|d }| j||d}	t| j |	|t	j
|	jd |	jdd d}|dd  | jd|	||||dd	|}
|
d
 }| |}| jdkr|t	j
|jd
 |jd|jt	j|jdjddf }n|t	j
|jd
 |jd|jt	j|jd| jk jddf }|s||f|
dd   S t|||
j|
jdS )NzYou have to specify input_idsr:   )r   r   r   r   )ri   r   r  cache_positionpast_key_values	is_causalT)r   r  r  rf  rg  r  r   rH   )r>   r   r   rm  pooler_outputr  rT   r!   )ri   r  rf  rn  r   rL   r   r   r   r   r   rD   r   popr{  r|  r}  torO   argmaxr   r  rT   )ry   r   r  r   r  rf  rg  r2  input_shaper  encoder_outputsrm  pooled_outputr!   r!   r"   r     sf   


	zGroupViTTextTransformer.forwardNNNNNN)r   r   r   r   rn   r   r   r   r   r   r   r   r   r!   r!   rz   r"   rz    s0    	rz  c                       s   e Zd ZU eed< dZdef fddZdejfddZ	dd	 Z
e	
	
	
	
	
	
ddejd
B dejd
B dejd
B ded
B ded
B ded
B deeB fddZ  ZS )GroupViTTextModelri   )rE  c                    "   t  | t|| _|   d S r  )rm   rn   rz  
text_modelr~  rx   rz   r!   r"   rn        
zGroupViTTextModel.__init__r   c                 C   
   | j jjS r  r  r   r   r   r!   r!   r"   get_input_embeddings     
z&GroupViTTextModel.get_input_embeddingsc                 C   s   || j j_d S r  r  )ry   r   r!   r!   r"   set_input_embeddings  s   z&GroupViTTextModel.set_input_embeddingsNr   r  r   r  rf  rg  c                 K   s   | j ||||||dS )a9  
        Examples:

        ```python
        >>> from transformers import CLIPTokenizer, GroupViTTextModel

        >>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> model = GroupViTTextModel.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```r   r  r   r  rf  rg  )r  )ry   r   r  r   r  rf  rg  r2  r!   r!   r"   r     s   zGroupViTTextModel.forwardr  )r   r   r   r   r   r]  rn   r   Moduler  r  r   r   r   r   r   r   r   r   r!   r!   rz   r"   r    s8   
 	r  c                       sd   e Zd Zdef fddZe				ddejdB dedB dedB dedB d	e	e
B f
d
dZ  ZS )GroupViTVisionTransformerri   c                    s@   t    || _|j}t|| _t|| _tj	||j
d| _d S rj   )rm   rn   ri   rr   r   r   r_  r{  r   rq   rs   r   r   rz   r!   r"   rn   )  s   


z"GroupViTVisionTransformer.__init__Nr   rf  r  rg  r   c           	      C   s   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u r&td| |}| j||||d}|d }| |}|jdd}|sO||f|dd   S t	|||j
|jdS )Nz You have to specify pixel_values)r  rf  r  rg  r   r   r   r  )ri   r  rf  rn  r   r   r{  r   rF  r   r  rT   )	ry   r   rf  r  rg  r  r  rm  r  r!   r!   r"   r   2  s0   

z!GroupViTVisionTransformer.forwardry  )r   r   r   r   rn   r   r   r   r   r   r   r   r   r!   r!   rz   r"   r  (  s$    	r  c                       s   e Zd ZU eed< dZdZdef fddZdefddZ	e
								ddejd	B d
ed	B ded	B ded	B deeB f
ddZ  ZS )GroupViTVisionModelri   r   )rD  c                    r  r  )rm   rn   r  vision_modelr~  rx   rz   r!   r"   rn   b  r  zGroupViTVisionModel.__init__r   c                 C   r  r  )r  r   r   r   r!   r!   r"   r  h  r  z(GroupViTVisionModel.get_input_embeddingsNr  rf  rg  c                 K   s   | j ||||dS )a)  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, GroupViTVisionModel

        >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> model = GroupViTVisionModel.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```r   r  rf  rg  )r  )ry   r   r  rf  rg  r2  r!   r!   r"   r   k  s   zGroupViTVisionModel.forwardry  )r   r   r   r   r   main_input_namer]  rn   r   r  r   r   r   r   r   r   r   r   r!   r!   rz   r"   r  ]  s,   
 r  c                       s  e Zd ZU eed< def fddZee		ddej	dej	dB dej	dB de
e d	eeB f
d
dZeedej	de
e d	eeB fddZe									ddejdB dejdB dej	dB dejdB dedB dedB dedB dedB dedB d	eeB fddZ  ZS )GroupViTModelri   c              
      s6  t  | t|jtstdt|j dt|jts(tdt|j d|j}|j}|j	| _	|j
| _
|j| _|j| _t|| _t|| _ttj| j| j
ddt| j
tjddtj| j
| j	dd| _ttj| j| j
ddt| j
tjddtj| j
| j	dd| _tt| jj| _|   d S )NzOconfig.text_config is expected to be of type GroupViTTextConfig but is of type .zSconfig.vision_config is expected to be of type GroupViTVisionConfig but is of type T)rL  )inplace) rm   rn   r   text_configr   	TypeErrortypevision_configr   projection_dimprojection_intermediate_dimrr   text_embed_dimvision_embed_dimrz  r  r  r  r   r  r   rN  ReLUvisual_projectiontext_projectionr   r   rB   ri   logit_scale_init_valuelogit_scaler~  )ry   ri   r  r  rz   r!   r"   rn     sF   





zGroupViTModel.__init__Nr   r  r   r2  r   c                 K   s0   | j d|||dd|}|j}| ||_|S )a  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import CLIPTokenizer, GroupViTModel

        >>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```T)r   r  r   rg  Nr!   )r  r  r  )ry   r   r  r   r2  text_outputsr  r!   r!   r"   get_text_features  s   zGroupViTModel.get_text_featuresr   c                 K   s(   | j |fddi|}| |j|_|S )a  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, GroupViTModel
        >>> from transformers.image_utils import load_image

        >>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```rg  T)r  r  r  )ry   r   r2  vision_outputsr!   r!   r"   get_image_features  s   z GroupViTModel.get_image_featuresreturn_lossr  rf  output_segmentationrg  c
              
   K   sP  |dur|n| j j}|dur|n| j j}|rd}|dur|n| j j}|	dur(|	n| j j}	| j||||	d}| j||||||	d}|d }| |}|d }| |}||j	ddd }||j	ddd }| j
 }t|| | }| }d}|r|d }| |d|jd }|r|d	 }n|d
 }t||jd
d }||j	ddd }t|| | }||jd d|jd dd
d}||jd |jd d}t||| }||jd |jd |jd
 |jd	 }d}|rt|}|	s|dur|||||||f}n||||||f}|dur|f| S |S t||||||||dS )a  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        output_segmentation (`bool`, *optional*):
            Whether or not to return the segmentation logits.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, GroupViTModel

        >>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```NTr  r  r   r:   r   r   r   rH   )r   r   r   r   r   r   r   r   )ri   r  r  rf  rn  r  r  r  r  normr  expr   matmulr%   rR   rD   rg   r^   r(   r   )ry   r   r   r  r   r  r  rf  r  rg  r2  r  r  r   r   r  r   r   
seg_logitsimage_group_embedsrT   groupinglogits_per_image_groupflatten_groupingr   outputr!   r!   r"   r     s   ,	




 

zGroupViTModel.forward)NN)	NNNNNNNNN)r   r   r   r   r   rn   r   r   r   r   r   r   r   r   r  r  r  r   r   r   r   r   r!   r!   rz   r"   r    st   
 +!	
r  )r  rB  r  r  )r   Fr:   r   )Ir   collections.abcr   dataclassesr   typingr   numpyrP   r   r    r   rJ  activationsr   masking_utilsr   modeling_layersr	   modeling_outputsr
   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   r   r   configuration_groupvitr   r   r   
get_loggerr   loggerr   r#   r(   rO   r9   floatr   rG   r\   rg   r  rh   r   r   r   r   r   r   r  ru   r   ro   r	  rB  r_  rt  rz  r  r  r  r  __all__r!   r!   r!   r"   <module>   sd    
$

070"K(^e2,:Q]455  