o
    i                    @   s  d Z ddlZddlmZmZ ddlZddlmZ ddlmZ ddl	m
Z
mZmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZmZ ddlmZ ddlmZmZmZmZm Z m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'm(Z( e rddl)m*Z* ddl+m,Z, e"-e.Z/G dd dej0Z1zddl2m3Z3 e3Z1e/4d W n e5y   Y n e6y   e/7d Y nw G dd dej0Z8G dd dej0Z9G dd dej0Z:G dd deZ;G d d! d!ej0Z<eG d"d# d#eZ=eG d$d% d%e=Z>G d&d' d'ej0Z?G d(d) d)ej0Z@G d*d+ d+ej0ZAG d,d- d-ej0ZBG d.d/ d/ej0ZCG d0d1 d1eZDed2d3G d4d5 d5e=ZEed6d3G d7d8 d8e=eZFg d9ZGdS ):zPix2Struct modeling file    N)OptionalUnion)nn   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)AttentionMaskConverter)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling!CausalLMOutputWithCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)PreTrainedModel)DUMMY_INPUTS
DUMMY_MASKauto_docstringis_torch_flex_attn_availableis_torch_fx_proxyis_torchdynamo_compilinglogging)deprecate_kwarg   )Pix2StructConfigPix2StructTextConfigPix2StructVisionConfig)	BlockMask)make_flex_block_causal_maskc                       s&   e Zd Zd fdd	Zdd Z  ZS )Pix2StructLayerNormư>c                    s&   t    tt|| _|| _dS )zc
        Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__ f/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/pix2struct/modeling_pix2struct.pyr$   >   s   

zPix2StructLayerNorm.__init__c                 C   s\   | tjdjddd}|t|| j  }| jjtj	tj
fv r)| | jj}| j| S )N   T)keepdim)tor&   float32powmeanrsqrtr)   r(   dtypefloat16bfloat16)r*   hidden_statesvariancer/   r/   r0   forwardF   s
   
zPix2StructLayerNorm.forward)r"   __name__
__module____qualname__r$   r>   __classcell__r/   r/   r-   r0   r!   =   s    r!   )FusedRMSNormzWDiscovered apex.normalization.FusedRMSNorm - will use it instead of Pix2StructLayerNormzJDiscovered apex but it failed to load, falling back to Pix2StructLayerNormc                       s@   e Zd ZdZdeddf fddZdejdejfdd	Z  Z	S )
Pix2StructVisionEmbeddingsa-  
    Construct the embeddings from patch. In `Pix2Struct` the input is different from classic Vision-transformer models.
    Here the input is a sequence of `seq_len` flattened patches that also combines padding patches (tokens). Each patch
    is represented by a vector of `hidden_size` values.
    configreturnNc                    sR   t    t|j|j| _t|j|j| _	t|j|j| _
t|j| _d S N)r#   r$   r   Linearpatch_embed_hidden_sizer+   patch_projection	Embeddingseq_lenrow_embeddercolumn_embedderDropoutdropout_ratedropoutr*   rF   r-   r/   r0   r$   k   s
   
z#Pix2StructVisionEmbeddings.__init__flattened_patchesc                 C   s   |d d d d df   }|d d d d df   }|d d d d dd f }| |}| |}| |}|| | }| |}|S )Nr   r   r1   )longrK   rN   rO   rR   )r*   rT   row_indicescol_indices
embeddingsrow_embeddingscol_embeddingsr/   r/   r0   r>   t   s   



z"Pix2StructVisionEmbeddings.forward)
r@   rA   rB   __doc__r   r$   r&   Tensorr>   rC   r/   r/   r-   r0   rE   d   s    	rE   c                       s.   e Zd Z fddZ				dddZ  ZS )Pix2StructVisionAttentionc                    s   t    |j| _|j| _|j| _|j| _| j| j | _	t
j| j| j	dd| _t
j| j| j	dd| _t
j| j| j	dd| _t
j| j	| jdd| _d| _d S NFbias)r#   r$   r+   d_kvkey_value_proj_dimnum_attention_headsn_headsattention_dropoutrR   	inner_dimr   rI   querykeyvalueoutputgradient_checkpointingrS   r-   r/   r0   r$      s   

z"Pix2StructVisionAttention.__init__NFc                    s  |j dd \ } fdd}||}||}	||}
t||	dd}|du rtjdj||f|j	|j
d}jrKjrKd|_| dkrd||ddddddf |j	 }n$|durq|||j	 }nt stj |f|j	|j
d}|||j	 }d| }||dkt|j
j}||7 }t|tt|j
j}tjj|d	tjd
|}tjj|jjd}|dur|| }t||
}|dd  d	j} |}|f|f }|r||f }|S )z&
        Self-attention block
        Nr1   c                    s    |    djjddS )
projectionr2   r   r1   )
contiguousviewrd   rb   	transpose)states
batch_sizer*   r/   r0   to_projection_shape   s    z>Pix2StructVisionAttention.forward.<locals>.to_projection_shaper   r   devicer9   Tr2   )dimr9   ptraining)!shaperg   rh   ri   r&   matmulro   zerosrd   ru   r9   rk   ry   requires_gradrv   r4   r   r'   masked_fillfinfominmaxtensorr   
functionalsoftmaxr5   type_asrR   rm   rn   rf   rj   )r*   r<   attention_maskposition_biaslayer_head_maskoutput_attentions
seq_lengthrs   query_states
key_statesvalue_statesscoresposition_bias_maskedattn_weightsattn_outputoutputsr/   rq   r0   r>      sH   &

z!Pix2StructVisionAttention.forward)NNNFr?   r/   r/   r-   r0   r]      s    r]   c                       *   e Zd Zdef fddZdd Z  ZS )Pix2StructVisionMlprF   c                    j   t    tj|j|jdd| _tj|j|jdd| _tj|j|jdd| _t	|j
| _t|j | _d S r^   r#   r$   r   rI   r+   d_ffwi_0wi_1worP   rQ   rR   r   dense_act_fnactrS   r-   r/   r0   r$         
zPix2StructVisionMlp.__init__c                 C   z   |  | |}| |}|| }| |}t| jjtjr6|j	| jjj	kr6| jjj	tj
kr6|| jjj	}| |}|S rH   r   r   r   rR   
isinstancer   r(   r&   r\   r9   int8r4   r*   r<   hidden_geluhidden_linearr/   r/   r0   r>         


zPix2StructVisionMlp.forward)r@   rA   rB   r   r$   r>   rC   r/   r/   r-   r0   r          r   c                       st   e Zd Zdeddf fddZ			ddejdeej d	eej d
ede	e
ejejf e
ej f f
ddZ  ZS )Pix2StructVisionLayerrF   rG   Nc                    sT   t    |j| _d| _t|| _t|| _t|j	|j
d| _t|j	|j
d| _d S )Nr   r,   )r#   r$   chunk_size_feed_forwardseq_len_dimr]   	attentionr   mlpr!   r+   layer_norm_epspre_mlp_layer_normpre_attention_layer_normrS   r-   r/   r0   r$     s   


zPix2StructVisionLayer.__init__Fr<   r   	head_maskr   c           
      C   sb   |}|  |}| j||||d}|d }|dd  }|| }| |}	| |	| }	|	f| }|S )N)r   r   r   r   r   )r   r   r   r   )
r*   r<   r   r   r   residualself_attention_outputsattention_outputr   layer_outputr/   r/   r0   r>     s   


zPix2StructVisionLayer.forward)NNF)r@   rA   rB   r   r$   r&   r\   r   boolr   tupler>   rC   r/   r/   r-   r0   r     s     r   c                       sn   e Zd Zdeddf fddZ					ddejd	eej d
eej dededede	e
ef fddZ  ZS )Pix2StructVisionEncoderrF   rG   Nc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r/   )r   ).0_rF   r/   r0   
<listcomp>6  s    z4Pix2StructVisionEncoder.__init__.<locals>.<listcomp>F)	r#   r$   rF   r   
ModuleListrangenum_hidden_layerslayerrk   rS   r-   r   r0   r$   3  s   
 
z Pix2StructVisionEncoder.__init__FTr<   r   r   r   output_hidden_statesreturn_dictc                 C   s   |rdnd }|r
dnd }t | jD ])\}	}
|r||f }|d ur$||	 nd }|
||||}|d }|r:||d f }q|rB||f }|sPtdd |||fD S t|||dS )Nr/   r   r   c                 s       | ]	}|d ur|V  qd S rH   r/   r   vr/   r/   r0   	<genexpr>V  s    z2Pix2StructVisionEncoder.forward.<locals>.<genexpr>last_hidden_stater<   
attentions)	enumerater   r   r   )r*   r<   r   r   r   r   r   all_hidden_statesall_self_attentionsilayer_moduler   layer_outputsr/   r/   r0   r>   9  s(   	

zPix2StructVisionEncoder.forward)NNFFT)r@   rA   rB   r   r$   r&   r\   r   r   r   r   r   r>   rC   r/   r/   r-   r0   r   2  s,    	
r   c                   @   s6   e Zd ZU eed< dZedd Zdd Zdd Z	d	S )
Pix2StructPreTrainedModelrF   Fc                 C   s$   t t}t t}|||d}|S )N)decoder_input_ids	input_idsdecoder_attention_mask)r&   r   r   r   )r*   r   
input_maskdummy_inputsr/   r/   r0   r   d  s   

z&Pix2StructPreTrainedModel.dummy_inputsc                 C   s  | j j}t|tr|jj|d  dS t|trt| j tr$| j j	j
n| j j
}t| j tr3| j j	jn| j j}|jjjjd||d  d t|jdrX|jjdurX|jjj  |jjjjd||d  d t|jdry|jjdury|jjj  |jjjjd||d  d t|jdr|jjdur|jjj  dS dS dS t|tr%t| j tr| j j	j
n| j j
}t| j tr| j j	jn| j j
}t| j tr| j j	jn| j j}|jjjjd||| d  d |jjjjd||d  d |jjjjd||d  d |jjjjd||| d  d |jr#|jjjjd||d  d dS dS t|tjr\t| j tr8| j j	j
n| j j
}|jjjd||d  d |jdurZ|jj|j   dS dS t|trt| j trn| j j	j
n| j j
}|j jjjd||d  d dS t|tj!tj"frtj#j$|jj%t&j'd| j j(d%|jj)|j_|jdur|jj  dS dS t|tr|jdur|jjd dS dS t|tjr|jjjd| j j(d |jdur|jj|j   dS dS dS )zInitialize the weights      ?        g      )r7   stdr`   N)*rF   initializer_factorr   r!   r(   datafill_ Pix2StructTextDenseGatedActDenser   text_configr+   r   r   normal_hasattrr`   zero_r   r   Pix2StructTextAttentionra   	num_headsrg   rh   ri   rj   has_relative_attention_biasrelative_attention_biasr   rL   padding_idxPix2StructTextModellm_headrI   Conv2dinittrunc_normal_r4   r&   r5   initializer_ranger9   )r*   modulefactorr+   r   rb   rd   r/   r/   r0   _init_weightso  s   




   

 
z'Pix2StructPreTrainedModel._init_weightsc                 C   s   | j j}| j j}|d u rtdt|r1t|jd d d |}tj||dd df gdd}n|	|j}|dd df 
 |ddd f< ||d< |d u rStd||d	k| |S )
Nzself.model.config.decoder_start_token_id has to be defined. In Pix2Struct it is usually set to the pad_token_id. See Pix2Struct docs for more information.r2   )r   .rv   r   ).r   z1self.model.config.pad_token_id has to be defined.)rF   decoder_start_token_idpad_token_id
ValueErrorr   r&   fullrz   cat	new_zerosclonemasked_fill_)r*   r   r   r   shifted_input_idsr/   r/   r0   _shift_right  s      z&Pix2StructPreTrainedModel._shift_rightN)
r@   rA   rB   r   __annotations___can_compile_fullgraphpropertyr   r   r   r/   r/   r/   r0   r   ^  s   
 

Pr   c                       s   e Zd ZU eed< dZdZdgZdef fddZdd Z	d	e
eee f d
dfddZe						ddeej deej deej dee dee dee d
eeef fddZ  ZS )Pix2StructVisionModelrF   rT   Tr   c                    sD   t  | || _t|| _t|| _t|j|j	d| _
|   d S Nr   )r#   r$   rF   rE   rX   r   encoderr!   r+   r   	layernorm	post_initrS   r-   r/   r0   r$     s   

zPix2StructVisionModel.__init__c                 C   s   | j jS rH   )rX   rK   r*   r/   r/   r0   get_input_embeddings  s   z*Pix2StructVisionModel.get_input_embeddingsheads_to_prunerG   Nc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr   r   r   prune_heads)r*   r  r   headsr/   r/   r0   _prune_heads  s   z"Pix2StructVisionModel._prune_headsr   r   r   r   r   c                 C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|du r&td|du r4|jdddk }| || j j}| 	|}| j
||||||d}|d }	| |	}	|sb|	f}
|
|dd  S t|	|j|jdS )	a  
        flattened_patches (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_channels x patch_height x patch_width)`):
            Flattened and padded pixel values. These values can be obtained using [`AutoImageProcessor`]. See
            [`Pix2StructVisionImageProcessor.__call__`] for details. Check the [original
            paper](https://huggingface.co/papers/2210.03347) (figure 5) for more details.

        Example:

        ```python
        >>> import requests
        >>> from PIL import Image
        >>> from transformers import AutoProcessor, Pix2StructVisionModel

        >>> image_processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
        >>> model = Pix2StructVisionModel.from_pretrained("google/pix2struct-textcaps-base")

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 2048, 768]
        ```
        Nz%You have to specify flattened_patchesr2   r   r   )r   r   r   r   r   r   r   )rF   r   r   use_return_dictr   sumfloatget_head_maskr   rX   r   r   r   r<   r   )r*   rT   r   r   r   r   r   embedding_outputencoder_outputssequence_outputhead_outputsr/   r/   r0   r>     s8   &

zPix2StructVisionModel.forward)NNNNNN)r@   rA   rB   r   r   main_input_namesupports_gradient_checkpointing_no_split_modulesr$   r  dictintlistr  r   r   r&   r\   r   r   r   r   r>   rC   r/   r/   r-   r0   r     s<   
 
r   c                       r   )r   rF   c                    r   r^   r   rS   r-   r/   r0   r$   M  r   z)Pix2StructTextDenseGatedActDense.__init__c                 C   r   rH   r   r   r/   r/   r0   r>   U  r   z(Pix2StructTextDenseGatedActDense.forwardr@   rA   rB   r   r$   r>   rC   r/   r/   r-   r0   r   L  r   r   c                       r   )Pix2StructTextLayerFFrF   c                    s8   t    t|| _t|j|jd| _t	|j
| _d S r   )r#   r$   r   DenseReluDenser!   r+   layer_norm_epsilon
layer_normr   rP   rQ   rR   rS   r-   r/   r0   r$   j  s   

zPix2StructTextLayerFF.__init__c                 C   s&   |  |}| |}|| | }|S rH   )r  r  rR   )r*   r<   forwarded_statesr/   r/   r0   r>   r  s   

zPix2StructTextLayerFF.forwardr  r/   r/   r-   r0   r  i  r   r  c                       sp   e Zd Z	ddedee f fddZedd
dZdddZ	e
dddd									dddZ  ZS )r   FNrF   	layer_idxc                    s   t    || _|j| _|j| _|j| _|j| _|j| _	|j
| _| j	| j | _|| _|d u r9td| jj d tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _| jrqt| j| j	| _t | _d| _d S )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.Fr_   )r#   r$   r   relative_attention_num_bucketsrelative_attention_max_distancer+   ra   rb   r   rd   rQ   rR   rf   r  loggerwarning_oncer.   r@   r   rI   rg   rh   ri   rj   rL   r   setpruned_headsrk   r*   rF   r   r  r-   r/   r0   r$   z  s,   

z Pix2StructTextAttention.__init__T       c                 C   s   d}|r|d }|| dk tj| 7 }t| } n
t| t|  } |d }| |k }|t|  | t||  ||   tj }t|t	||d }|t
|| |7 }|S )a  
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        r   r1   r   )r4   r&   rU   absr   
zeros_likelogr  math	full_likewhere)relative_positionbidirectionalnum_bucketsmax_distancerelative_buckets	max_exactis_smallrelative_position_if_larger/   r/   r0   _relative_position_bucket  s*   z1Pix2StructTextAttention._relative_position_bucketc           
      C   s   |du r	| j jj}|du rtj|tj|ddddf }n|dddf |}tj|tj|ddddf }|| }| j|d| j| j	d}|  |}	|	
g dd}	|	S )z%Compute binned relative position biasN)r9   ru   F)r.  r/  r0  )r1   r   r   r   )r   r(   ru   r&   arangerU   r4   r5  r  r  permute	unsqueeze)
r*   query_length
key_lengthru   cache_positioncontext_positionmemory_positionr-  relative_position_bucketvaluesr/   r/   r0   compute_bias  s    
 
z$Pix2StructTextAttention.compute_biaspast_key_valuepast_key_values4.58new_nameversionc                 C   s  |j dd \}}|du}| |}||d| j| jdd}|dur:t|tr:|j	| j
}|r6|j}n|j}n|}|r@|n|}|rW|rW|rW|j| j
 j}|j| j
 j}nE| |}| |}||d| j| jdd}||d| j| jdd}|dur|s|
nd}
|||| j
d|
i\}}|rd|j| j
< t||dd}|du r|j d }|dur|n|
d d }| jstjd| j||f|j|jd	}| jr| jrd|_n| j|||j|
d
}|dddd| dddf }|dur|ddddddd|j d f }|| }| jr-t|j d }d|t| j< |dd|  f }n|}||7 }t!j"j#|$ dd%|}t!j"j&|| j&| jd}|durT|| }t||}|dd' }||d| j(}| )|}||f}|	r{||f }|S )z
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
        Nr1   r2   r   r;  Tr   rt   )ru   r;  r   r   rw   )*rz   rg   rn   rd   rb   ro   r   r	   
is_updatedgetr  cross_attention_cacheself_attention_cachelayerskeysr?  rh   ri   updater&   r{   r   r|   ru   r9   rk   ry   r}   r@  r#  r'   r  r   r   r   r   r  r   rR   rm   rf   rj   )r*   r<   maskkey_value_statesr   rB  r   r9  	use_cacher   r;  rr   r   is_cross_attentionr   rH  curr_past_key_valuecurrent_statesr   r   r   r:  real_seq_lengthcausal_maskr   r   r   r   r/   r/   r0   r>     sz   





"
&


zPix2StructTextAttention.forwardFN)Tr%  r&  )NN)	NNNNNNFFN)r@   rA   rB   r   r   r  r$   staticmethodr5  r@  r   r>   rC   r/   r/   r-   r0   r   y  s*    
0r   c                       sN   e Zd Zddee f fddZedddd								dd
dZ  ZS ) Pix2StructTextLayerSelfAttentionFNr  c                    s>   t    t|||d| _t|j|jd| _t	|j
| _d S )Nr   r  r   r#   r$   r   r   r!   r+   r  r  r   rP   rQ   rR   r$  r-   r/   r0   r$   R  s   
z)Pix2StructTextLayerSelfAttention.__init__rA  rB  rC  rD  c	              
   C   sL   |  |}	| j|	|||||||d}
|| |
d  }|f|
dd   }|S )N)rO  r   r   rB  rQ  r   r;  r   r   r  r   rR   )r*   r<   r   r   r   rB  rQ  r   r;  normed_hidden_statesr   r   r/   r/   r0   r>   Z  s   

z(Pix2StructTextLayerSelfAttention.forwardrW  )NNNNFFN	r@   rA   rB   r   r  r$   r   r>   rC   r/   r/   r-   r0   rY  Q  s    rY  c                       sP   e Zd Zddee f fddZedddd										dd
dZ  ZS )!Pix2StructTextLayerCrossAttentionNr  c                    s>   t    t|d|d| _t|j|jd| _t	|j
| _d S )NFrZ  r   r[  )r*   rF   r  r-   r/   r0   r$   x  s   
z*Pix2StructTextLayerCrossAttention.__init__rA  rB  rC  rD  Fc                 C   sP   |  |}| j|||||||||	|
d
}|| |d  }|f|dd   }|S )N)	rO  rP  r   r   rB  rQ  r9  r   r;  r   r   r\  )r*   r<   rP  r   r   r   rB  rQ  r9  r   r;  r]  r   r   r   r/   r/   r0   r>   ~  s    
z)Pix2StructTextLayerCrossAttention.forwardrH   )NNNNFNFNr^  r/   r/   r-   r0   r_  w  s    r_  c                       sX   e Zd Zddee f fddZedddd												
	dddZ  ZS )Pix2StructTextBlockFNr  c                    s6   t    t|||d| _t||d| _t|| _d S )NrZ  )r  )r#   r$   rY  self_attentionr_  encoder_decoder_attentionr  r   r$  r-   r/   r0   r$     s   
zPix2StructTextBlock.__init__rA  rB  rC  rD  Tc                 C   sH  | j |||||	|
||d}|d }|dd  }|jtjkr6t| r6t|jjd }tj|| |d}|d u}|ry| j	||||||	|d d |
|d	}|d }|jtjkrqt| rqt|jjd }tj|| |d}||dd   }| 
|}|jtjkrt| rt|jjd }tj|| |d}|f}|| S )N)r   r   r   rB  rQ  r   r;  r   r   i  )r   r   r2   )rP  r   r   r   rB  r9  rQ  r   )ra  r9   r&   r:   isinfanyr   r   clamprb  r   )r*   r<   r   r   encoder_hidden_statesencoder_attention_maskencoder_decoder_position_biasr   cross_attn_layer_head_maskrB  rQ  r   r   r;  r   attention_outputsclamp_valuedo_cross_attentioncross_attention_outputsr   r/   r/   r0   r>     sN   


zPix2StructTextBlock.forwardrW  )NNNNNNNNFFTNr^  r/   r/   r-   r0   r`    s     r`  z3
    The standalone text decoder of Pix2Struct
    )custom_introc                #       s^  e Zd ZU eed< dgZdgZdZ fddZdd Z	e
																												d'd
eej deej deej deej deej deej deej dee dee dee dee deej dee deej deeejdf ef fddZ	d(deejdf dejdejdedef
dd Zedejd!ed"ed#ejdejd$efd%d&Z  ZS ))r   rF   r`  zlm_head.weightTc                    s   t    t j j| _t fddt j	D | _
t j jd| _t j| _tj j jdd| _|   d| _d S )Nc                    s"   g | ]}t  t|d k|dqS )r   rZ  )r`  r   )r   r   r   r/   r0   r     s    z0Pix2StructTextModel.__init__.<locals>.<listcomp>r   Fr_   )r#   r$   r   rL   
vocab_sizer+   embed_tokensr   r   
num_layersr   r!   r  final_layer_normrP   rQ   rR   rI   r   r  rk   rS   r-   r   r0   r$     s   

zPix2StructTextModel.__init__c                 C   s
   || _ d S rH   )rp  r*   new_embeddingsr/   r/   r0   set_input_embeddings     
z(Pix2StructTextModel.set_input_embeddingsNr   r   rf  rg  inputs_embedsr   cross_attn_head_maskrB  rQ  r   r   labelsr   r;  rG   .c           )      K   s  |	dur|	n| j j}	|
dur|
n| j j}
|dur|n| j j}|dur$|n| j j}| jr7| jr7|	r7td d}	|durC|durCt	d|durT|
 }|d|d }n|dura|
 dd }nt	d|du rw| jdusrJ d| |}|\}}|	r|du r| j jrtt| j dt| j d}nt| j d}d	}|dur|d	 }n|dur| }|du rtj||| |jd
}|du r|dur| | n|}tj|||jd
}| j jr| |||t|tr|jn||
}n|ddddddf }|j|jd}d| t|jj }|dur+|
 \}}}||f}|du r%tj||jd
}| |}nd}| || j j}| || j j}|rBdnd}|
rIdnd}|
rPdnd}d}d}|  |} t!| j"D ]Q\}!}"||! }#||! }$|rt|| f }|"| ||||||#|$||	|
|d}%|%d	 } |%d }|dur|%|
rdnd }|
r||%d f }|dur||%d f }q`| #| } |  | } | $| }&|r|| f }d}'|dur||&j}t%j&ddd}(|(|&' d|&
d|' d}'|st(dd |'|&||||fD S t)|'|&||||dS )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Pix2StructText is a model with relative position
            embeddings so you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [Pix2StructText
            Training](./t5#training).
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        ```python
        >>> from transformers import AutoProcessor, Pix2StructTextModel

        >>> processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
        >>> model = Pix2StructTextModel.from_pretrained("google/pix2struct-textcaps-base")

        >>> inputs = processor(text="Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> loss = outputs.loss
        ```
        NzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timer2   zEYou have to specify either decoder_input_ids or decoder_inputs_embedsz<You have to initialize the model with valid token embeddingsr   r   ru   )r9   r   r/   )r   ri  rB  rQ  r   r;  r   r   r1      r   r7   )ignore_index	reductionc                 s   r   rH   r/   r   r/   r/   r0   r     s    	z.Pix2StructTextModel.forward.<locals>.<genexpr>)losslogitsrB  r<   r   cross_attentions)*rF   rQ  r   r   r	  rk   ry   r   warningr   sizern   rp  is_encoder_decoderr	   r   get_seq_lengthr&   r6  ru   r'   
is_decoder_update_causal_maskr   rK  r4   r9   r   r   invert_attention_maskr  rq  rR   r   r   rr  r   r   CrossEntropyLossrm   r   r   ))r*   r   r   rf  rg  rw  r   rx  rB  rQ  r   r   ry  r   r;  kwargsinput_shaperr   r   past_key_values_lengthmask_seq_lengthrV  encoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapeencoder_extended_attention_maskr   all_attentionsall_cross_attentionsr   rh  r<   r   r   r   ri  r   r  r~  loss_fctr/   r/   r0   r>     s   3













&zPix2StructTextModel.forwardFr   input_tensorc                 C   s:  | j jdkr|d ur|dk r|S d S | j jdkr&t|tjr$t|}|S |d ur.| nd}|d ur7|jnd}| j jdkrO|sO|sOt	j
|||| jdrOd S |j}|jd }	|r^| }
nt|tjri|jd	 n||	 d }
| j||	|
|||jd d
}| j jdkr|d ur|jjdv r|st|j}t	||}|S )Nflash_attention_2r   flex_attentionr   Fsdpa)rw  r  is_trainingr   r2   )sequence_lengthtarget_lengthr9   r;  rr   )cudaxpunpu)rF   _attn_implementationrd  r   r&   r\   r    r  is_compileabler   _ignore_causal_mask_sdpary   r9   rz   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionru   typer   r   _unmask_unattended)r*   r   r  r;  rB  r   past_seen_tokensusing_compilable_cacher9   r  r  rV  	min_dtyper/   r/   r0   r    sT   




z'Pix2StructTextModel._update_causal_maskr  r  r9   rr   c                 K   sD  | dur|   dkr| }|S t|j}tj||f|||jd}|dkr+tj|dd}|tj||jd|ddk9 }|ddddddf 	|ddd}| dur|
 }| jd }	|ddddddd|	f | ddddddf |j }
|
dk}
|ddddddd|	f |
||ddddddd|	f< |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        Nr{  )
fill_valuer9   ru   r   )diagonalrz  r2   r   )rv   r&   r   r   r   ru   triur6  reshapeexpandr   rz   r4   r~   )r   r  r  r9   r;  rr   r  rV  r  mask_lengthpadding_maskr/   r/   r0   r  5  s,    $
6  zIPix2StructTextModel._prepare_4d_causal_attention_mask_with_cache_position)NNNNNNNNNNNNNN)F)r@   rA   rB   r   r   r  _tied_weights_keysr  r$   ru  r   r   r&   
LongTensorFloatTensorr\   r   r   r   r   r   r>   r  rX  r  r9   r  rC   r/   r/   r-   r0   r     s   
 	
 `
Dr   zr
    A conditional generation model with a language modeling head. Can be used for sequence generation tasks.
    c                &       sF  e Zd ZU eed< dZdgZdef fddZdd Zdd	 Z	d
e
jfddZdd Zdd Ze																d#deej deej deej deej deej deej deej deeeej   dee deej deej dee dee dee dee d eej d
eeej ef f"d!d"Z  ZS )$"Pix2StructForConditionalGenerationrF   rT   zdecoder.lm_head.weightc                    s8   t  | t|j| _t|j| _|j| _| 	  d S rH   )
r#   r$   r   vision_configr   r   r   decoderis_vqar  rS   r-   r/   r0   r$   x  s
   z+Pix2StructForConditionalGeneration.__init__c                 C   
   | j  S rH   )r  r  r  r/   r/   r0   r    rv  z7Pix2StructForConditionalGeneration.get_input_embeddingsc                 C      | j | d S rH   )r  ru  rs  r/   r/   r0   ru       z7Pix2StructForConditionalGeneration.set_input_embeddingsrG   c                 C   r  rH   )r  get_output_embeddingsr  r/   r/   r0   r    rv  z8Pix2StructForConditionalGeneration.get_output_embeddingsc                 C   r  rH   )r  set_output_embeddingsrs  r/   r/   r0   r    r  z8Pix2StructForConditionalGeneration.set_output_embeddingsc                 C   s   | j S rH   )r   r  r/   r/   r0   get_encoder  s   z.Pix2StructForConditionalGeneration.get_encoderNr   r   r   r   decoder_head_maskrx  r  rB  ry  decoder_inputs_embedsrQ  r   r   r   r;  c                 C   sH  |dur|n| j jj}|dur|n| j j}|du r%| j||||||d}n$|rIt|tsIt|d t|dkr:|d ndt|dkrE|d ndd}|d }|
duru|du ru|du ru| |
}|durd|n|	| j j
 }d|dddf< | j||||	||||||||
||d}|s|| S t|j|j|j|j|j|j|j|j|jd	S )	a  
        flattened_patches (`torch.FloatTensor` of shape `(batch_size, seq_length, hidden_size)`):
            Flattened pixel patches. the `hidden_size` is obtained by the following formula: `hidden_size` =
            `num_channels` * `patch_size` * `patch_size`

            The process of flattening the pixel patches is done by `Pix2StructProcessor`.
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Pix2StructText uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [Pix2StructText
            Training](./t5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss for the decoder.

        Example:

        Inference:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Pix2StructForConditionalGeneration

        >>> processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
        >>> model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-textcaps-base")

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> # autoregressive generation
        >>> generated_ids = model.generate(**inputs, max_new_tokens=50)
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> print(generated_text)
        A stop sign is on a street corner.

        >>> # conditional generation
        >>> text = "A picture of"
        >>> inputs = processor(text=text, images=image, return_tensors="pt", add_special_tokens=False)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=50)
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> print(generated_text)
        A picture of a stop sign with a red stop sign
        ```

        Training:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Pix2StructForConditionalGeneration

        >>> processor = AutoProcessor.from_pretrained("google/pix2struct-base")
        >>> model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-base")

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "A stop sign is on the street corner."

        >>> inputs = processor(images=image, return_tensors="pt")
        >>> labels = processor(text=text, return_tensors="pt").input_ids

        >>> # forward pass
        >>> outputs = model(**inputs, labels=labels)
        >>> loss = outputs.loss
        >>> print(f"{loss.item():.5f}")
        5.94282
        ```N)rT   r   r   r   r   r   r   r   r1   r   )r   r   rw  rB  rf  rg  r   rx  rQ  r   r   ry  r   r;  )	r~  r  rB  decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_staterf  encoder_attentions)rF   r   rQ  r	  r   r   r   lenr   ner   r  r  r   r~  r  rB  r<   r   r  r   )r*   rT   r   r   r   r   r  rx  r  rB  ry  r  rQ  r   r   r   r;  r<   decoder_outputsr/   r/   r0   r>     sl   r
z*Pix2StructForConditionalGeneration.forward)NNNNNNNNNNNNNNNN)r@   rA   rB   r   r   r  r  r$   r  ru  r   Moduler  r  r  r   r   r&   r  r  
BoolTensorr\   r   r   r   r   r   r>   rC   r/   r/   r-   r0   r  n  s|   
 	
r  )r   r  r   r   )Hr[   r*  typingr   r   r&   r   activationsr   cache_utilsr   r   r	   
generationr
   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   utilsr   r   r   r   r   r   r   utils.deprecationr   configuration_pix2structr   r   r   !torch.nn.attention.flex_attentionr   integrations.flex_attentionr    
get_loggerr@   r   r  r!   apex.normalizationrD   infoImportError	Exceptionr  rE   r]   r   r   r   r   r   r   r  r   rY  r_  r`  r   r  __all__r/   r/   r/   r0   <module>   st   $	

$b+,|p Y&'X  u U