o
    wi                     @   s  d Z ddlZddlmZ ddlmZmZmZ ddlZddl	Zddlm
Z
 ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlmZmZ ddlmZm Z m!Z! ddl"m#Z#m$Z$m%Z%m&Z& ddl'm(Z(m)Z) e%*e+Z,ee$ddG dd de#Z-G dd de
j.Z/G dd de
j.Z0G dd de
j.Z1de0iZ2G dd de
j.Z3G dd  d e
j.Z4G d!d" d"e
j.Z5G d#d$ d$eZ6G d%d& d&e
j.Z7e$G d'd( d(eZ8G d)d* d*e
j.Z9G d+d, d,e
j.Z:	-dKd.e
j.d/ej;d0ej;d1ej;d2eej; d3e<d4e<fd5d6Z=G d7d8 d8e
j.Z>G d9d: d:eZ?G d;d< d<e
j.Z@G d=d> d>e
j.ZAe$d?dG d@dA dAe8ZBG dBdC dCe
j.ZCe$dDdG dEdF dFe8ZDe$dGdG dHdI dIe8eZEg dJZFdS )LzPyTorch GIT model.    N)	dataclass)CallableOptionalUnion)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)_prepare_4d_attention_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputauto_docstringlogging	torch_int   )	GitConfigGitVisionConfigz}
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    )custom_introc                   @   sj   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeejdf  ed< dZeeejdf  ed< dS )GitVisionModelOutputz
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    Nimage_embedslast_hidden_state.hidden_states
attentions)__name__
__module____qualname____doc__r    r   torchFloatTensor__annotations__r!   r"   tupler#    r,   r,   a/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/git/modeling_git.pyr   3   s   
 r   c                       s\   e Zd ZdZ fddZ				ddeej deej deej d	e	d
ej
f
ddZ  ZS )GitEmbeddingsz;Construct the embeddings from word and position embeddings.c                    s   t    tj|j|j|jd| _t|j|j| _	tj
|j|jd| _
t|j| _t|dd| _| jdt|jddd d S )	N)padding_idxepsposition_embedding_typeabsoluteposition_idsr   F
persistent)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutgetattrr2   register_bufferr(   arangeexpandselfconfig	__class__r,   r-   r:   I   s   

zGitEmbeddings.__init__Nr   	input_idsr4   inputs_embedspast_key_values_lengthreturnc           	      C   s   |d ur	|  }n|  d d }|d }|d u r&| jd d ||| f }|d u r0| |}n|}| jdkr@| |}||7 }| |}| |}|S )Nr6   r   r3   )sizer4   r?   r2   rA   rB   rF   )	rL   rP   r4   rQ   rR   input_shape
seq_length
embeddingsrA   r,   r,   r-   forwardX   s   




zGitEmbeddings.forward)NNNr   )r$   r%   r&   r'   r:   r   r(   
LongTensorr)   intTensorrX   __classcell__r,   r,   rN   r-   r.   F   s$    r.   c                       s   e Zd Zd fdd	ZdejdejfddZ					dd	ejd
eej deej dee	 dee
 dee
 deej fddZ  ZS )GitSelfAttentionNc                    sV  t    |j|j dkrt|dstd|j d|j d|| _|d u r1td| j	j
 d |j| _t|j|j | _| j| j | _t|jj|jj d d	 | _|jd ura|  j|j9  _t|j| j| _t|j| j| _t|j| j| _t|j| _|pt|d
d| _| jdks| jdkr|j| _td|j d	 | j| _d S d S )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()zInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.   r   r2   r3   relative_keyrelative_key_query) r9   r:   r=   num_attention_headshasattr
ValueError	layer_idxloggerwarning_oncerO   r$   rZ   attention_head_sizeall_head_sizevision_config
image_size
patch_sizeimage_patch_tokensnum_image_with_embeddingr   LinearquerykeyvaluerD   attention_probs_dropout_probrF   rG   r2   r@   r;   distance_embeddingrL   rM   r2   rf   rN   r,   r-   r:   w   s:   


zGitSelfAttention.__init__xrS   c                 C   s6   |  d d | j| jf }||}|ddddS )Nr6   r   r`   r   r   )rT   rc   ri   viewpermute)rL   rw   new_x_shaper,   r,   r-   transpose_for_scores   s   
z%GitSelfAttention.transpose_for_scoresFr"   attention_mask	head_maskpast_key_valueoutput_attentionspixel_values_presentc              	   C   s  |  |}|r
| jnd}| | |}	| | |}
|d urt||	d d d d |d d d f |
d d d d |d d d f | j\}}tj|	d d d d d |d d f |gdd}	tj|
d d d d d |d d f |gdd}
| |}t	||	
dd}| jdks| jdkr
|jd |	jd }}|d urtj|d tj|jd	dd}ntj|tj|jd	dd}tj|tj|jd	dd}|| }| || j d }|j|jd
}| jdkrtd||}|| }n| jdkr
td||}td|	|}|| | }|t| j }|d ur|| }tjj|dd}| |}|d ur1|| }t	||
}|dddd }|  d d | j!f }||}|rY||fn|f}||f }|S )Nr   r`   dimr6   ra   rb   r   dtypedevicer   zbhld,lrd->bhlrzbhrd,lrd->bhlrr   )"rq   rn   r{   rr   rs   updaterf   r(   catmatmul	transposer2   shapetensorlongr   rx   rI   ru   r@   tor   einsummathsqrtri   r   
functionalsoftmaxrF   ry   
contiguousrT   rj   )rL   r"   r|   r}   r~   r   r   mixed_query_layercutoff	key_layervalue_layerkey_layer_pastvalue_layer_pastquery_layerattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapeoutputsr,   r,   r-   rX      sX   
	@..







zGitSelfAttention.forwardNNNNNFF)r$   r%   r&   r:   r(   r[   r{   r   r)   r	   boolr+   rX   r\   r,   r,   rN   r-   r]   v   s.    "r]   c                       8   e Zd Z fddZdejdejdejfddZ  ZS )GitSelfOutputc                    sB   t    t|j|j| _tj|j|jd| _t|j	| _
d S Nr0   )r9   r:   r   rp   r=   denserB   rC   rD   rE   rF   rK   rN   r,   r-   r:         
zGitSelfOutput.__init__r"   input_tensorrS   c                 C   &   |  |}| |}| || }|S Nr   rF   rB   rL   r"   r   r,   r,   r-   rX         

zGitSelfOutput.forwardr$   r%   r&   r:   r(   r[   rX   r\   r,   r,   rN   r-   r          $r   eagerc                       sx   e Zd Zd fdd	Zdd Z					ddejdeej d	eej d
ee	 dee
 dee
 deej fddZ  ZS )GitAttentionNc                    s6   t    t|j |||d| _t|| _t | _d S )N)r2   rf   )	r9   r:   GIT_SELF_ATTENTION_CLASSES_attn_implementationrL   r   outputsetpruned_headsrv   rN   r,   r-   r:      s   

zGitAttention.__init__c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   r   )lenr   rL   rc   ri   r   r   rq   rr   rs   r   r   rj   union)rL   headsindexr,   r,   r-   prune_heads	  s   zGitAttention.prune_headsFr"   r|   r}   r~   r   r   rS   c           
      C   s:   |  ||||||}| |d |}|f|dd   }	|	S )Nr   r   )rL   r   )
rL   r"   r|   r}   r~   r   r   self_outputsattention_outputr   r,   r,   r-   rX     s   	zGitAttention.forwardr   r   )r$   r%   r&   r:   r   r(   r[   r   r)   r	   r   r+   rX   r\   r,   r,   rN   r-   r      s.    	r   c                       2   e Zd Z fddZdejdejfddZ  ZS )GitIntermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r   )r9   r:   r   rp   r=   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnrK   rN   r,   r-   r:   3  s
   
zGitIntermediate.__init__r"   rS   c                 C   s   |  |}| |}|S r   )r   r   rL   r"   r,   r,   r-   rX   ;  s   

zGitIntermediate.forwardr   r,   r,   rN   r-   r   2  s    r   c                       r   )	GitOutputc                    sB   t    t|j|j| _tj|j|jd| _t	|j
| _d S r   )r9   r:   r   rp   r   r=   r   rB   rC   rD   rE   rF   rK   rN   r,   r-   r:   C  r   zGitOutput.__init__r"   r   rS   c                 C   r   r   r   r   r,   r,   r-   rX   I  r   zGitOutput.forwardr   r,   r,   rN   r-   r   B  r   r   c                       sx   e Zd Zd fdd	Z					ddejdeej deej dee d	ee	 d
ee	 de
ej fddZdd Z  ZS )GitLayerNc                    s>   t    |j| _d| _t||d| _t|| _t|| _	d S )Nr   )rf   )
r9   r:   chunk_size_feed_forwardseq_len_dimr   	attentionr   intermediater   r   )rL   rM   rf   rN   r,   r-   r:   Q  s   

zGitLayer.__init__Fr"   r|   r}   r~   r   r   rS   c                 C   s^   | j ||||||d}|d }|dd }	|d }
t| j| j| j|}|f|	 }	|	|
f }	|	S )N)r   r~   r   r   r   r6   )r   r   feed_forward_chunkr   r   )rL   r"   r|   r}   r~   r   r   self_attention_outputsr   r   present_key_valuelayer_outputr,   r,   r-   rX   Y  s"   


zGitLayer.forwardc                 C   s   |  |}| ||}|S r   )r   r   )rL   r   intermediate_outputr   r,   r,   r-   r   {  s   
zGitLayer.feed_forward_chunkr   r   )r$   r%   r&   r:   r(   r[   r   r)   r	   r   r+   rX   r   r\   r,   r,   rN   r-   r   P  s.    
"r   c                       s   e Zd Z fddZ								ddejdeej deej d	eee	e
e
ej  f  d
ee dee dee dee dee dee
ej ef fddZ  ZS )
GitEncoderc                    :   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  |qS r,   )r   ).0irM   r,   r-   
<listcomp>  s    z'GitEncoder.__init__.<locals>.<listcomp>F)	r9   r:   rM   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingrK   rN   r   r-   r:        
 
zGitEncoder.__init__NFTr"   r|   r}   past_key_values	use_cacher   output_hidden_statesr   return_dictrS   c
                 C   s6  | j r| jr|rtd d}d}
|r,t|ts,d}
|d u r"t }n
t|}td |r0dnd }|r6dnd }d }t| j	D ]1\}}|rJ||f }|d urR|| nd }|||||||}|d }|rg|d }|rp||d f }q?|rx||f }|r||nd }|
r|
 }|	std	d
 ||||fD S t||||dS )NzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FTzWe detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class (https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)r,   r   r6   r   c                 s       | ]	}|d ur|V  qd S r   r,   r   vr,   r,   r-   	<genexpr>  s    z%GitEncoder.forward.<locals>.<genexpr>r!   r   r"   r#   )r   trainingrg   rh   r   r	   r
   from_legacy_cache	enumerater   to_legacy_cacher+   r   )rL   r"   r|   r}   r   r   r   r   r   r   return_legacy_cacheall_hidden_statesall_self_attentionsnext_decoder_cacher   layer_modulelayer_head_masklayer_outputs
next_cacher,   r,   r-   rX     sn   

	

zGitEncoder.forward)NNNNFFFT)r$   r%   r&   r:   r(   r[   r   r)   r   r	   r+   r   r   rX   r\   r,   r,   rN   r-   r     s>    		
r   c                   @   s(   e Zd ZeZdZdZdZdZdd Z	dS )GitPreTrainedModelgitTc                 C   s  t |tr)tjj|jd| jjd tjj|jj	| jjd tjj|j
j	| jjd t |tjrI|j	jjd| jjd |jdurG|jj  dS dS t |tjrl|j	jjd| jjd |jdurj|j	j|j   dS dS t |tjr|jj  |j	jd dS dS )zInitialize the weights        )meanstd)r  Ng      ?)r   GitVisionEmbeddingsr   initnormal_class_embeddingrM   initializer_rangepatch_embeddingweightposition_embeddingrp   databiaszero_r;   r/   rB   fill_)rL   moduler,   r,   r-   _init_weights  s$   


z GitPreTrainedModel._init_weightsN)
r$   r%   r&   r   config_classbase_model_prefixsupports_gradient_checkpointing_supports_cache_class_supports_quantized_cacher  r,   r,   r,   r-   r     s    r   c                       sX   e Zd Zdef fddZdejdededejfdd	Zddej	dejfddZ
  ZS )r  rM   c                    s   t    || _|j| _|j| _|j| _tt	
| j| _tj|j| j| j| jdd| _| j| j d | _| jd | _t| j| j| _| jdt	| jddd d S )NF)in_channelsout_channelskernel_sizestrider  r`   r   r4   r5   r7   )r9   r:   rM   r=   	embed_dimrl   rm   r   	Parameterr(   randnr  Conv2dnum_channelsr  num_patchesnum_positionsr;   r	  rH   rI   rJ   rK   rN   r,   r-   r:     s"   
"zGitVisionEmbeddings.__init__rW   heightwidthrS   c                 C   s  |j d d }| jjd}|j d d }tj s(||kr(||kr(| | jS |ddddf }|ddddf }|j d }	|| j }
|| j }t	|d }|
d|||	}|dddd}tjj||
|fdd	d
}|dddddd|	}tj||fddS )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   Nr6   g      ?r   r`   bicubicF)rT   modealign_cornersr   )r   r	  r  	unsqueezer(   jit
is_tracingr4   rm   r   reshapery   r   r   interpolaterx   r   )rL   rW   r   r!  r  r	  r  class_pos_embedpatch_pos_embedr   
new_height	new_widthsqrt_num_positionsr,   r,   r-   interpolate_pos_encoding  s*   



z,GitVisionEmbeddings.interpolate_pos_encodingFpixel_valuesc              
   C   s   |j \}}}}|s&|| jks|| jkr&td| d| d| j d| j d	| jjj}| |j|d}|ddd}| j	
|dd}	tj|	|gdd	}
|r[|
| |
|| }
|
S |
| | j }
|
S )
NzInput image size (*z) doesn't match model ().r   r`   r   r6   r   )r   rl   re   r  r  r   r   flattenr   r  rJ   r(   r   r/  r	  r4   )rL   r0  r/  
batch_size_r   r!  target_dtypepatch_embedsclass_embedsrW   r,   r,   r-   rX   9  s    
zGitVisionEmbeddings.forwardF)r$   r%   r&   r   r:   r(   r[   rZ   r/  r)   rX   r\   r,   r,   rN   r-   r    s     )r  c                       r   )GitVisionMLPc                    sD   t    || _t|j | _t|j|j	| _
t|j	|j| _d S r   )r9   r:   rM   r   r   activation_fnr   rp   r=   r   fc1fc2rK   rN   r,   r-   r:   M  s
   
zGitVisionMLP.__init__r"   rS   c                 C   s"   |  |}| |}| |}|S r   )r<  r;  r=  r   r,   r,   r-   rX   T  s   


zGitVisionMLP.forwardr   r,   r,   rN   r-   r:  L  s    r:  r   r  rq   rr   rs   r|   scalingrF   c           
      K   s|   t ||dd| }|d ur|| }tjj|dt jd|j}tjj	||| j
d}t ||}	|	dd }	|	|fS )Nr6   r   )r   r   )pr   r   r`   )r(   r   r   r   r   r   float32r   r   rF   r   r   )
r  rq   rr   rs   r|   r>  rF   kwargsattn_weightsattn_outputr,   r,   r-   eager_attention_forward\  s   
rD  c                       sh   e Zd ZdZ fddZ			ddejdeej deej d	ee d
e	ejeej f f
ddZ
  ZS )GitVisionAttentionz=Multi-headed attention from 'Attention Is All You Need' paperc                    s   t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	|j
| _d| _t| j| j| _t| j| j| _t| j| j| _t| j| j| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: r2  g      F)r9   r:   rM   r=   r  rc   	num_headshead_dimre   scaleattention_dropoutrF   	is_causalr   rp   k_projv_projq_projout_projrK   rN   r,   r-   r:   u  s$   

zGitVisionAttention.__init__NFr"   r|   causal_attention_maskr   rS   c              
   C   sL  |j \}}}| |}| |}	| |}
|||| j| jdd}|	||| j| jdd}	|
||| j| jdd}
| jj	dkrY|durR|durR|| }n|durX|}n|du| _
t}| jj	dkrz| jj	dkrt|rttd nt| jj	 }|| ||	|
|| j
| j| jsdn| jd	\}}|||| }| |}|sd}||fS )
z#Input shape: Batch x Time x Channelr   r`   flash_attention_2Nr   sdpaz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.r   )rJ  r>  rF   )r   rM  rK  rL  rx   rF  rG  r   rM   r   rJ  rD  rg   rh   r   rH  r   rF   r(  r   rN  )rL   r"   r|   rO  r   r4  rV   r  querieskeysvaluesattention_interfacerC  rB  r,   r,   r-   rX     sH   	






zGitVisionAttention.forward)NNF)r$   r%   r&   r'   r:   r(   r[   r   r   r+   rX   r\   r,   r,   rN   r-   rE  r  s"    rE  c                       sT   e Zd Zdef fddZ	ddejdejdejdee d	e	ej
 f
d
dZ  ZS )GitVisionEncoderLayerrM   c                    sR   t    |j| _t|| _tj| j|jd| _	t
|| _tj| j|jd| _d S r   )r9   r:   r=   r  rE  	self_attnr   rB   rC   layer_norm1r:  mlplayer_norm2rK   rN   r,   r-   r:     s   


zGitVisionEncoderLayer.__init__Fr"   r|   rO  r   rS   c                 C   sd   |}|  |}| j||||d\}}|| }|}| |}| |}|| }|f}|r0||f7 }|S )aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r"   r|   rO  r   )rX  rW  rZ  rY  )rL   r"   r|   rO  r   residualrB  r   r,   r,   r-   rX     s"   




zGitVisionEncoderLayer.forwardr9  )r$   r%   r&   r   r:   r(   r[   r   r   r+   r)   rX   r\   r,   r,   rN   r-   rV    s    rV  c                       st   e Zd ZdZdef fddZ					ddeej deej dee	 d	ee	 d
ee	 de
eef fddZ  ZS )GitVisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`GitVisionEncoderLayer`].

    Args:
        config: GitVisionConfig
    rM   c                    r   )Nc                    s   g | ]}t  qS r,   )rV  r   r5  r   r,   r-   r     s    z-GitVisionEncoder.__init__.<locals>.<listcomp>F)	r9   r:   rM   r   r   r   r   layersr   rK   rN   r   r-   r:     r   zGitVisionEncoder.__init__Nr|   rO  r   r   r   rS   c                 C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|r"dnd}|r(dnd}|}	t| jD ] \}
}|r<||	f }||	|||d}|d }	|rQ||d f }q1|rY||	f }|sgtdd |	||fD S t|	||dS )	a  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr,   )r   r   r   c                 s   r   r   r,   r   r,   r,   r-   r   G  s    z+GitVisionEncoder.forward.<locals>.<genexpr>r!   r"   r#   )rM   r   r   use_return_dictr   r^  r+   r   )rL   rQ   r|   rO  r   r   r   encoder_statesall_attentionsr"   idxencoder_layerr   r,   r,   r-   rX     s6   &

zGitVisionEncoder.forward)NNNNN)r$   r%   r&   r'   r   r:   r   r(   r[   r   r   r+   r   rX   r\   r,   r,   rN   r-   r\    s*    	
r\  c                       sr   e Zd Zdef fddZe					ddeej dee	 dee	 d	ee	 d
ee	 de
eef fddZ  ZS )GitVisionTransformerrM   c                    sR   t    || _|j}t|| _tj||jd| _	t
|| _tj||jd| _d S r   )r9   r:   rM   r=   r  rW   r   rB   rC   pre_layrnormr\  encoderpost_layernorm)rL   rM   r  rN   r,   r-   r:   O  s   


zGitVisionTransformer.__init__NFr0  r   r   r/  r   rS   c           	      C   s   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u r&td| j||d}| |}| j||||d}|d }| |}|sO|f|dd   S t	||j
|jdS )Nz You have to specify pixel_valuesr/  )rQ   r   r   r   r   r   r_  )rM   r   r   r`  re   rW   rf  rg  rh  r   r"   r#   )	rL   r0  r   r   r/  r   r"   encoder_outputsr!   r,   r,   r-   rX   Y  s.   	

zGitVisionTransformer.forwardNNNFN)r$   r%   r&   r   r:   r   r   r(   r)   r   r   r+   r   rX   r\   r,   r,   rN   r-   re  M  s*    

re  zY
    The vision model from CLIP, used in GIT, without any head or projection on top.
    c                       s   e Zd ZeZdZdef fddZdejfddZ	e
						ddeej d
ee dee dedee deeef fddZ  ZS )GitVisionModelr0  rM   c                    s"   t  | t|| _|   d S r   )r9   r:   re  vision_model	post_initrK   rN   r,   r-   r:     s   
zGitVisionModel.__init__rS   c                 C   s
   | j jjS r   )rm  rW   r  rL   r,   r,   r-   get_input_embeddings     
z#GitVisionModel.get_input_embeddingsNFr   r   r/  r   c                 C   s(   |dur|n| j j}| j|||||dS )a{  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, GitVisionModel

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base")
        >>> model = GitVisionModel.from_pretrained("microsoft/git-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        ```N)r0  r   r   r/  r   )rM   r`  rm  )rL   r0  r   r   r/  r   r,   r,   r-   rX     s   zGitVisionModel.forwardrk  )r$   r%   r&   r   r  main_input_namer:   r   Modulerp  r   r   r(   r)   r   r   r+   r   rX   r\   r,   r,   rN   r-   rl    s0    
rl  c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )GitProjectionrM   c                    s@   t    || _tt|jj|jtj|j|jj	d| _
d S r   )r9   r:   rM   r   
Sequentialrp   rk   r=   rB   rC   visual_projectionrK   rN   r,   r-   r:     s   

zGitProjection.__init__rW   rS   c                 C   s
   |  |S r   )rv  )rL   rW   r,   r,   r-   rX     rq  zGitProjection.forward)	r$   r%   r&   r   r:   r(   r[   rX   r\   r,   r,   rN   r-   rt    s    rt  zy
    The bare GIT Model transformer consisting of a CLIP image encoder and text decoder outputting raw hidden-states
    c                       s  e Zd Z fddZdd Zdd Zdd Zd	ed
ej	dej
dejfddZd!ddZe												d"deej deej deej deej deej deej deeeeej f  dee dee dee dedee deeej ef fdd Z  ZS )#GitModelc                    sr   t     | _t | _t j| _t | _	t
 | _ jd ur3t fddt jD | _|   d S )Nc                 3   s(    | ]}t td d  jjV  qdS )r   N)r   r  r(   zerosrk   r=   r]  r   r,   r-   r     s
    
z$GitModel.__init__.<locals>.<genexpr>)r9   r:   rM   r.   rW   rl  rk   image_encoderr   rg  rt  rv  ro   r   ParameterListr   img_temperal_embeddingrn  rK   rN   r   r-   r:     s   




zGitModel.__init__c                 C   s   | j jS r   rW   r?   ro  r,   r,   r-   rp    s   zGitModel.get_input_embeddingsc                 C   s   || j _d S r   r|  )rL   rs   r,   r,   r-   set_input_embeddings  s   zGitModel.set_input_embeddingsc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsrg  r   r   r   )rL   heads_to_pruner   r   r,   r,   r-   _prune_heads  s   zGitModel._prune_headsrT   r   r   rS   c                 C   s4   t jt j||||ddd}||dktd}|S )Nr   r   r   )diagonal-inf)r(   triuonesmasked_fillfloat)rL   rT   r   r   maskr,   r,   r-   _generate_future_mask  s   zGitModel._generate_future_maskNc                 C   s  |j d }|j d }|j}|j}	tj||f||	d}
tj||| ftd|j|	d}tj||f|	|jd}|dkrLtj|j d |j d | f|	|jd}tj|
|fdd}tj|||	fdd}tj||fddd d d f }|d u rtj|j d |j d fd|d}|jtj	krt
d	tj||jd
}td||< ||j d || || | f}| }|d d d d d |f }|d d d d d f }|| |d d d d d |f< |d d d d d d d f }|S )Nr   r  r  r   r   r   F)
fill_valuer   z1Memory key padding mask must be a boolean tensor.r   )r   r   r   r(   rx  fullr  r   r   r   re   
zeros_likerJ   clone)rL   tgtmemorytgt_maskrR   memory_key_padding_masknum_tgt
num_memoryr   r   top_left	top_rightbottom_leftleftrightfull_attention_maskzero_negative_infinityorigin_leftr   r,   r,   r-   create_attention_mask  sP   


 zGitModel.create_attention_maskFrP   r|   r4   r0  r}   rQ   r   r   r   r   r/  r   c                 C   s  |	dur|	n| j j}	|
dur|
n| j j}
|dur|n| j j}|dur$|n| j j}|dur4|dur4td|durC| || | }n|durP| dd }ntd|d }d}|durpt|t	sl|d d j
d n| }| || j j}d}|dur|jdkr| j||d	j}n=|jd
krg }t|j
d D ]"}| j|dd|ddddf |d	j}|| j| 7 }|| qtj|dd}ntd| |}| j||||d}|du rtj|j
d d|j
d f|j|jd}||d|d dd}tj||fdd}| ||j|j}| j||||d}|dur\t||j|d d|j}|dkrB|dddd| dddf }n|dddd|d  d|d  df  |7  < | j ||||||	|
||dud	}|d }|s||f|dd  S t!||j"|j#|j$dS )a  
        Examples:

        ```python
        >>> from transformers import AutoProcessor, AutoModel
        >>> import requests
        >>> from PIL import Image

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base")
        >>> model = AutoModel.from_pretrained("microsoft/git-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> text = "this is an image of two cats"

        >>> inputs = processor(images=image, text=text, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        ```NzDYou cannot specify both input_ids and inputs_embeds at the same timer6   z5You have to specify either input_ids or inputs_embedsr   r   r`      ri     r   z#pixel_values must be of rank 4 or 5)rP   r4   rQ   rR   r   )r  r  r  rR   )tgt_len)r|   r}   r   r   r   r   r   r   r   )%rM   r   r   r   r`  re   %warn_if_padding_and_no_attention_maskrT   r   r	   r   get_seq_lengthget_head_maskr   ndimry  r!   r   r{  appendr(   r   rv  rW   rx  r   r   repeatr  r  r   r   rg  r   r   r"   r#   )rL   rP   r|   r4   r0  r}   rQ   r   r   r   r   r/  r   rU   rV   rR   projected_visual_featuresvisual_features	frame_idxvisual_features_frameembedding_outputr"   r  combined_attention_maskexpanded_attn_maskrj  sequence_outputr,   r,   r-   rX   )  s   %





$4zGitModel.forwardr   )NNNNNNNNNNFN)r$   r%   r&   r:   rp  r}  r  rZ   r(   r   r   r[   r  r  r   r   r   r	   listr)   r   r+   r   rX   r\   r,   r,   rN   r-   rw    s^     
2	
rw  z`
    GIT Model with a `language modeling` head on top for autoregressive language modeling.
    c                        s  e Zd ZdgZ fddZdd Zdd Ze														dd
ee	j
 dee	j
 dee	j
 dee	j
 dee	j
 dee	j
 dee	j
 deeeee	j
 f  dee dee dee dedee deee	j
 ef fddZ	dddZdd Z  ZS ) GitForCausalLMzoutput.weightc                    s4   t  | t|| _t|j|j| _| 	  d S r   )
r9   r:   rw  r   r   rp   r=   r<   r   rn  rK   rN   r,   r-   r:     s   
zGitForCausalLM.__init__c                 C   s   | j S r   r   ro  r,   r,   r-   get_output_embeddings  s   z$GitForCausalLM.get_output_embeddingsc                 C   s
   || _ d S r   r  )rL   new_embeddingsr,   r,   r-   set_output_embeddings  rq  z$GitForCausalLM.set_output_embeddingsNFrP   r|   r4   r0  r}   rQ   labelsr   r   r   r   r/  r   rS   c                 K   s  |dur|n| j j}|durd}	| j||||||||	|
|||d}|d }| |}d}|durl| jjjd jjj}|dd|dddf 	 }|ddddf 	 }| j
|d| j j|dfd| j ji|}|s|f|dd  }|dur|f| S |S t|||j|j|jdS )	a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`

        Examples:

        Image captioning example:

        ```python
        >>> from transformers import AutoProcessor, AutoModelForCausalLM
        >>> import requests
        >>> from PIL import Image

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-coco")
        >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-coco")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> pixel_values = processor(images=image, return_tensors="pt").pixel_values

        >>> generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
        >>> generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> print(generated_caption)
        two cats sleeping on a pink blanket next to remotes.
        ```

        Visual question answering (VQA) example:

        ```python
        >>> from transformers import AutoProcessor, AutoModelForCausalLM
        >>> from huggingface_hub import hf_hub_download
        >>> from PIL import Image

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-textvqa")
        >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-textvqa")

        >>> file_path = hf_hub_download(repo_id="nielsr/textvqa-sample", filename="bus.png", repo_type="dataset")
        >>> image = Image.open(file_path).convert("RGB")

        >>> pixel_values = processor(images=image, return_tensors="pt").pixel_values

        >>> question = "what does the front of the bus say at the top?"

        >>> input_ids = processor(text=question, add_special_tokens=False).input_ids
        >>> input_ids = [processor.tokenizer.cls_token_id] + input_ids
        >>> input_ids = torch.tensor(input_ids).unsqueeze(0)

        >>> generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)
        >>> print(processor.batch_decode(generated_ids, skip_special_tokens=True))
        ['what does the front of the bus say at the top? special']
        ```

        Video captioning example:

        ```python
        >>> import av
        >>> import numpy as np
        >>> from PIL import Image
        >>> from huggingface_hub import hf_hub_download
        >>> from transformers import AutoProcessor, AutoModelForCausalLM

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-vatex")
        >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-vatex")

        >>> # set seed for reproducibility
        >>> np.random.seed(45)


        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`list[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
        ...     '''
        ...     Sample a given number of frame indices from the video.
        ...     Args:
        ...         clip_len (`int`): Total number of frames to sample.
        ...         frame_sample_rate (`int`): Sample every n-th frame.
        ...         seg_len (`int`): Maximum allowed index of sample's last frame.
        ...     Returns:
        ...         indices (`list[int]`): List of sampled frame indices
        ...     '''
        ...     converted_len = int(clip_len * frame_sample_rate)
        ...     end_idx = np.random.randint(converted_len, seg_len)
        ...     start_idx = end_idx - converted_len
        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
        ...     return indices


        >>> # load video
        >>> file_path = hf_hub_download(
        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample frames
        >>> num_frames = model.config.num_image_with_embedding
        >>> indices = sample_frame_indices(
        ...     clip_len=num_frames, frame_sample_rate=4, seg_len=container.streams.video[0].frames
        ... )
        >>> frames = read_video_pyav(container, indices)

        >>> pixel_values = processor(images=list(frames), return_tensors="pt").pixel_values

        >>> generated_ids = model.generate(pixel_values=pixel_values, max_length=50)

        >>> print("Generated caption:", processor.batch_decode(generated_ids, skip_special_tokens=True))
        Generated caption: ['a woman is sitting at a table and she is talking about the food she is holding.']
        ```
        NF)r|   r4   r0  r}   rQ   r   r   r   r   r/  r   r   r6   r   r<   )losslogitsr   r"   r#   )rM   r`  r   r   rg  r   r   rL   rn   r   loss_functionrx   r<   r   r   r"   r#   )rL   rP   r|   r4   r0  r}   rQ   r  r   r   r   r   r/  r   rA  r   r  r  r  num_image_tokensshifted_logitsr   r,   r,   r-   rX     sV    
zGitForCausalLM.forwardc           	      K   sv   |d ur#|  }|jd |kr|}n|jd d }|d d |d f }|j}|d u r/||}|||dd ||dS )Nr   r0  )rP   r|   r0  r   r   )r  r   new_onesget)	rL   rP   r   r|   r   rA  past_lengthremove_prefix_lengthrU   r,   r,   r-   prepare_inputs_for_generation  s   

z,GitForCausalLM.prepare_inputs_for_generationc                    s.   d}|D ]}|t  fdd|D f7 }q|S )Nr,   c                 3   s$    | ]}| d  |jV  qdS )r   N)index_selectr   r   )r   
past_statebeam_idxr,   r-   r     s   " z0GitForCausalLM._reorder_cache.<locals>.<genexpr>)r+   )rL   r   r  reordered_past
layer_pastr,   r  r-   _reorder_cache  s   zGitForCausalLM._reorder_cache)NNNNNNNNNNNFN)NNN)r$   r%   r&   _tied_weights_keysr:   r  r  r   r   r(   r[   r   r	   r  r   r+   r   rX   r  r  r\   r,   r,   rN   r-   r    sh    		
 E
r  )r  rw  r   rl  )r   )Gr'   r   dataclassesr   typingr   r   r   r(   torch.utils.checkpointr   activationsr   cache_utilsr	   r
   
generationr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   pytorch_utilsr   r   r   utilsr   r   r   r   configuration_gitr   r   
get_loggerr$   rg   r   rs  r.   r]   r   r   r   r   r   r   r   r   r  r:  r[   r  rD  rE  rV  r\  re  rl  rt  rw  r  __all__r,   r,   r,   r-   <module>   s   
0v31ZS
P3X65   