o
    eib                     @   s\  d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z
mZmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZmZ ddlmZ ddlmZmZmZmZm Z  ddl!m"Z"m#Z#m$Z$ e %e&Z'G dd dej(Z)zddl*m+Z+ e+Z)e',d W n e-y   Y n e.y   e'/d Y nw G dd dej(Z0G dd dej(Z1G dd dej(Z2G dd deZ3G dd dej(Z4eG dd  d eZ5eG d!d" d"e5Z6G d#d$ d$ej(Z7G d%d& d&ej(Z8G d'd( d(ej(Z9G d)d* d*ej(Z:G d+d, d,ej(Z;G d-d. d.eZ<ed/d0G d1d2 d2e5Z=ed3d0G d4d5 d5e5eZ>g d6Z?dS )7zPix2Struct modeling file    N)nn   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling!CausalLMOutputWithCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)PreTrainedModel)DUMMY_INPUTS
DUMMY_MASKauto_docstringis_torchdynamo_compilinglogging   )Pix2StructConfigPix2StructTextConfigPix2StructVisionConfigc                       s&   e Zd Zd fdd	Zdd Z  ZS )Pix2StructLayerNormư>c                    s&   t    tt|| _|| _dS )zc
        Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__ p/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/pix2struct/modeling_pix2struct.pyr   4   s   

zPix2StructLayerNorm.__init__c                 C   s\   | tjdjddd}|t|| j  }| jjtj	tj
fv r)| | jj}| j| S )N   T)keepdim)tor    float32powmeanrsqrtr#   r"   dtypefloat16bfloat16)r$   hidden_statesvariancer)   r)   r*   forward<   s
   
zPix2StructLayerNorm.forward)r   __name__
__module____qualname__r   r8   __classcell__r)   r)   r'   r*   r   3   s    r   )FusedRMSNormzWDiscovered apex.normalization.FusedRMSNorm - will use it instead of Pix2StructLayerNormzJDiscovered apex but it failed to load, falling back to Pix2StructLayerNormc                       s@   e Zd ZdZdeddf fddZdejdejfdd	Z  Z	S )
Pix2StructVisionEmbeddingsa-  
    Construct the embeddings from patch. In `Pix2Struct` the input is different from classic Vision-transformer models.
    Here the input is a sequence of `seq_len` flattened patches that also combines padding patches (tokens). Each patch
    is represented by a vector of `hidden_size` values.
    configreturnNc                    sR   t    t|j|j| _t|j|j| _	t|j|j| _
t|j| _d S N)r   r   r   Linearpatch_embed_hidden_sizer%   patch_projection	Embeddingseq_lenrow_embeddercolumn_embedderDropoutdropout_ratedropoutr$   r@   r'   r)   r*   r   `   s
   
z#Pix2StructVisionEmbeddings.__init__flattened_patchesc                 C   s   |d d d d df   }|d d d d df   }|d d d d dd f }| |}| |}| |}|| | }| |}|S )Nr   r   r+   )longrE   rH   rI   rL   )r$   rN   row_indicescol_indices
embeddingsrow_embeddingscol_embeddingsr)   r)   r*   r8   i   s   



z"Pix2StructVisionEmbeddings.forward)
r:   r;   r<   __doc__r   r   r    Tensorr8   r=   r)   r)   r'   r*   r?   Y   s    	r?   c                       s,   e Zd Z fddZ			dddZ  ZS )Pix2StructVisionAttentionc                    s   t    |j| _|j| _|j| _|j| _| j| j | _	t
j| j| j	dd| _t
j| j| j	dd| _t
j| j| j	dd| _t
j| j	| jdd| _d| _d S NFbias)r   r   r%   d_kvkey_value_proj_dimnum_attention_headsn_headsattention_dropoutrL   	inner_dimr   rC   querykeyvalueoutputgradient_checkpointingrM   r'   r)   r*   r   ~   s   

z"Pix2StructVisionAttention.__init__NFc                    s  |j dd \ } fdd}||}||}||}	t||dd}
|du rtjdj||f|
j	|
j
d}jrKjrKd|_| dkrd||ddddddf |j	 }n$|durq|||j	 }nt stj |f|j	|j
d}|||j	 }d| }||dkt|
j
j}|
|7 }
t|
tt|
j
j}
tjj|
d	tjd
|
}tjj|jjd}t||	}|dd  d	j} |}|f|f }|r||f }|S )z&
        Self-attention block
        Nr+   c                    s    |    djjddS )
projectionr,   r   r+   )
contiguousviewr^   r\   	transpose)states
batch_sizer$   r)   r*   to_projection_shape   s    z>Pix2StructVisionAttention.forward.<locals>.to_projection_shaper   r   devicer3   Tr,   )dimr3   ptraining)!shapera   rb   rc   r    matmulri   zerosr^   ro   r3   re   rs   requires_gradrp   r.   r   r!   masked_fillfinfominmaxtensorr   
functionalsoftmaxr/   type_asrL   rg   rh   r`   rd   )r$   r6   attention_maskposition_biasoutput_attentions
seq_lengthrm   query_states
key_statesvalue_statesscoresposition_bias_maskedattn_weightsattn_outputoutputsr)   rk   r*   r8      sD   &

z!Pix2StructVisionAttention.forward)NNFr9   r)   r)   r'   r*   rW   }   s    rW   c                       *   e Zd Zdef fddZdd Z  ZS )Pix2StructVisionMlpr@   c                    j   t    tj|j|jdd| _tj|j|jdd| _tj|j|jdd| _t	|j
| _t|j | _d S rX   r   r   r   rC   r%   d_ffwi_0wi_1worJ   rK   rL   r   dense_act_fnactrM   r'   r)   r*   r         
zPix2StructVisionMlp.__init__c                 C   z   |  | |}| |}|| }| |}t| jjtjr6|j	| jjj	kr6| jjj	tj
kr6|| jjj	}| |}|S rB   r   r   r   rL   
isinstancer   r"   r    rV   r3   int8r.   r$   r6   hidden_geluhidden_linearr)   r)   r*   r8         


zPix2StructVisionMlp.forward)r:   r;   r<   r   r   r8   r=   r)   r)   r'   r*   r          r   c                       sd   e Zd Zdeddf fddZ		ddejdejdB d	edeejejf eej B fd
dZ	  Z
S )Pix2StructVisionLayerr@   rA   Nc                    sT   t    |j| _d| _t|| _t|| _t|j	|j
d| _t|j	|j
d| _d S )Nr   r&   )r   r   chunk_size_feed_forwardseq_len_dimrW   	attentionr   mlpr   r%   layer_norm_epspre_mlp_layer_normpre_attention_layer_normrM   r'   r)   r*   r      s   


zPix2StructVisionLayer.__init__Fr6   r   r   c           	      C   s`   |}|  |}| j|||d}|d }|dd  }|| }| |}| || }|f| }|S )N)r   r   r   r   )r   r   r   r   )	r$   r6   r   r   residualself_attention_outputsattention_outputr   layer_outputr)   r)   r*   r8      s   


zPix2StructVisionLayer.forward)NF)r:   r;   r<   r   r   r    rV   booltupler8   r=   r)   r)   r'   r*   r      s    r   c                       s^   e Zd Zdeddf fddZ				ddejd	ejdB d
edededee	B fddZ
  ZS )Pix2StructVisionEncoderr@   rA   Nc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r)   )r   ).0_r@   r)   r*   
<listcomp>"  s    z4Pix2StructVisionEncoder.__init__.<locals>.<listcomp>F)	r   r   r@   r   
ModuleListrangenum_hidden_layerslayerre   rM   r'   r   r*   r     s   
 
z Pix2StructVisionEncoder.__init__FTr6   r   r   output_hidden_statesreturn_dictc                 C   s   |rdnd }|r
dnd }t | jD ]\}}	|r||f }|	|||}
|
d }|r/||
d f }q|r7||f }|sEtdd |||fD S t|||dS )Nr)   r   r   c                 s       | ]	}|d ur|V  qd S rB   r)   r   vr)   r)   r*   	<genexpr>?  s    z2Pix2StructVisionEncoder.forward.<locals>.<genexpr>last_hidden_stater6   
attentions)	enumerater   r   r   )r$   r6   r   r   r   r   all_hidden_statesall_self_attentionsilayer_modulelayer_outputsr)   r)   r*   r8   %  s&   

zPix2StructVisionEncoder.forward)NFFT)r:   r;   r<   r   r   r    rV   r   r   r   r8   r=   r)   r)   r'   r*   r     s&    	r   c                   @   sB   e Zd ZU eed< dZdZedd Ze	
 dd Zdd	 Zd
S )Pix2StructPreTrainedModelr@   )imagetextFc                 C   s$   t t}t t}|||d}|S )N)decoder_input_ids	input_idsdecoder_attention_mask)r    r|   r   r   )r$   r   
input_maskdummy_inputsr)   r)   r*   r   N  s   

z&Pix2StructPreTrainedModel.dummy_inputsc                 C   s  | j j}t|trt|j|d  dS t|trt| j tr$| j j	j
n| j j
}t| j tr3| j j	jn| j j}tj|jjd||d  d t|jdrX|jjdurXt|jj tj|jjd||d  d t|jdry|jjduryt|jj tj|jjd||d  d t|jdr|jjdurt|jj dS dS dS t|tr%t| j tr| j j	j
n| j j
}t| j tr| j j	jn| j j
}t| j tr| j j	jn| j j}tj|jjd||| d  d tj|jjd||d  d tj|jjd||d  d tj|jjd||| d  d |jr#tj|jjd||d  d dS dS t|tjrft| j tr8| j j	j
n| j j
}tj|jd||d  d |jdurbt|jddsdt|j|j  dS dS dS t|t rt| j trx| j j	j
n| j j
}tj|j!jd||d  d dS t|tj"tj#frtj$|jd| j j%d |jdurt|j dS dS t|tr|jdurt&|j dS dS t|tjrtj|jd| j j%d |jdurt|jddst|j|j  dS dS dS dS )	zInitialize the weights      ?g        g      )r1   stdrZ   N_is_hf_initializedF)'r@   initializer_factorr   r   init	constant_r"    Pix2StructTextDenseGatedActDenser   text_configr%   r   normal_r   hasattrrZ   zeros_r   r   Pix2StructTextAttentionr[   	num_headsra   rb   rc   rd   has_relative_attention_biasrelative_attention_biasr   rF   padding_idxgetattrPix2StructTextModellm_headrC   Conv2dtrunc_normal_initializer_rangeones_)r$   modulefactorr%   r   r\   r^   r)   r)   r*   _init_weightsY  s   




   

 z'Pix2StructPreTrainedModel._init_weightsc                 C   sx   | j j}| j j}|d u rtd||j}|dd df  |ddd f< ||d< |d u r2td||dk| |S )Nzself.model.config.decoder_start_token_id has to be defined. In Pix2Struct it is usually set to the pad_token_id. See Pix2Struct docs for more information..r,   r   ).r   z1self.model.config.pad_token_id has to be defined.)r@   decoder_start_token_idpad_token_id
ValueError	new_zerosrt   clonemasked_fill_)r$   r   r   r   shifted_input_idsr)   r)   r*   _shift_right  s    z&Pix2StructPreTrainedModel._shift_rightN)r:   r;   r<   r   __annotations__input_modalities_can_compile_fullgraphpropertyr   r    no_gradr   r   r)   r)   r)   r*   r   G  s   
 


Lr   c                       s   e Zd ZU eed< dZdZdZdgZdef fddZ	dd	 Z
e	
	
	
	
	
ddejd
B dejd
B ded
B ded
B ded
B deeB fddZ  ZS )Pix2StructVisionModelr@   rN   )r   Tr   c                    sD   t  | || _t|| _t|| _t|j|j	d| _
|   d S Nr   )r   r   r@   r?   rR   r   encoderr   r%   r   	layernorm	post_initrM   r'   r)   r*   r     s   

zPix2StructVisionModel.__init__c                 C   s   | j jS rB   )rR   rE   r$   r)   r)   r*   get_input_embeddings  s   z*Pix2StructVisionModel.get_input_embeddingsNr   r   r   r   rA   c                 K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|du r&td|du r4|jdddk }| |}| j|||||d}|d }	| 	|	}	|sY|	f}
|
|dd  S t
|	|j|jdS )	a  
        flattened_patches (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_channels x patch_height x patch_width)`):
            Flattened and padded pixel values. These values can be obtained using [`AutoImageProcessor`]. See
            [`Pix2StructVisionImageProcessor.__call__`] for details. Check the [original
            paper](https://huggingface.co/papers/2210.03347) (figure 5) for more details.

        Example:

        ```python
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image
        >>> from transformers import AutoProcessor, Pix2StructVisionModel

        >>> image_processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
        >>> model = Pix2StructVisionModel.from_pretrained("google/pix2struct-textcaps-base")

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 2048, 768]
        ```
        Nz%You have to specify flattened_patchesr,   rp   r   )r   r   r   r   r   r   )r@   r   r   use_return_dictr   sumfloatrR   r   r   r   r6   r   )r$   rN   r   r   r   r   kwargsembedding_outputencoder_outputssequence_outputhead_outputsr)   r)   r*   r8     s4   (

zPix2StructVisionModel.forward)NNNNN)r:   r;   r<   r   r   main_input_namer   supports_gradient_checkpointing_no_split_modulesr   r   r   r    rV   r   r   r   r8   r=   r)   r)   r'   r*   r     s6   
 r   c                       r   )r   r@   c                    r   rX   r   rM   r'   r)   r*   r   !  r   z)Pix2StructTextDenseGatedActDense.__init__c                 C   r   rB   r   r   r)   r)   r*   r8   )  r   z(Pix2StructTextDenseGatedActDense.forwardr:   r;   r<   r   r   r8   r=   r)   r)   r'   r*   r      r   r   c                       r   )Pix2StructTextLayerFFr@   c                    s8   t    t|| _t|j|jd| _t	|j
| _d S r   )r   r   r   DenseReluDenser   r%   layer_norm_epsilon
layer_normr   rJ   rK   rL   rM   r'   r)   r*   r   >  s   

zPix2StructTextLayerFF.__init__c                 C   s&   |  |}| |}|| | }|S rB   )r  r  rL   )r$   r6   forwarded_statesr)   r)   r*   r8   F  s   

zPix2StructTextLayerFF.forwardr
  r)   r)   r'   r*   r  =  r   r  c                       s^   e Zd ZddededB f fddZedd
dZdddZ								dddZ	  Z
S )r   FNr@   	layer_idxc                    s   t    || _|j| _|j| _|j| _|j| _|j| _	|j
| _| j	| j | _|| _|d u r9td| jj d tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _| jrqt| j| j	| _d| _d S )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.FrY   )r   r   r   relative_attention_num_bucketsrelative_attention_max_distancer%   r[   r\   r   r^   rK   rL   r`   r  loggerwarning_oncer(   r:   r   rC   ra   rb   rc   rd   rF   r   re   r$   r@   r   r  r'   r)   r*   r   N  s*   

z Pix2StructTextAttention.__init__T       c                 C   s   d}|r|d }|| dk tj| 7 }t| } n
t| t|  } |d }| |k }|t|  | t||  ||   tj }t|t	||d }|t
|| |7 }|S )a  
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        r   r+   r   )r.   r    rO   absrz   
zeros_likelogr  math	full_likewhere)relative_positionbidirectionalnum_bucketsmax_distancerelative_buckets	max_exactis_smallrelative_position_if_larger)   r)   r*   _relative_position_bucketj  s*   z1Pix2StructTextAttention._relative_position_bucketc           
      C   s   |du r	| j jj}|du rtj|tj|ddddf }n|dddf |}tj|tj|ddddf }|| }| j|d| j| j	d}|  |}	|	
g dd}	|	S )z%Compute binned relative position biasN)r3   ro   F)r  r   r!  )r+   r   r   r   )r   r"   ro   r    arangerO   r.   r&  r  r  permute	unsqueeze)
r$   query_length
key_lengthro   cache_positioncontext_positionmemory_positionr  relative_position_bucketvaluesr)   r)   r*   compute_bias  s    
 
z$Pix2StructTextAttention.compute_biasc
                 C   s  |j dd \}
}|du}| |}||
d| j| jdd}|dur:t|tr:|j	| j
}|r6|j}n|j}n|}|r@|n|}|rW|rW|rW|j| j
 j}|j| j
 j}nE| |}| |}||
d| j| jdd}||
d| j| jdd}|dur|s|	nd}	|||| j
d|	i\}}|rd|j| j
< t||dd}|du r|j d }|dur|n|	d d }| jstjd| j||f|j|jd	}| jr| jrd|_n| j|||j|	d
}|dddd| dddf }|dur|ddddddd|j d f }|| }|}||7 }tjj|  dd!|}tjj"|| j"| jd}t||}|dd# }||
d| j$}| %|}||f}|rT||f }|S )z
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
        Nr+   r,   r   r,  Tr   rn   )ro   r,  r   rq   )&rt   ra   rh   r^   r\   ri   r   r   
is_updatedgetr  cross_attention_cacheself_attention_cachelayerskeysr0  rb   rc   updater    ru   r   rv   ro   r3   re   rs   rw   r1  r   r}   r~   r  r   rL   rg   r`   rd   )r$   r6   maskkey_value_statesr   past_key_valuesr*  	use_cacher   r,  rl   r   is_cross_attentionr   r3  curr_past_key_valuescurrent_statesr   r   r   r+  real_seq_lengthcausal_maskr   r   r   r   r)   r)   r*   r8     sn   





"
&

zPix2StructTextAttention.forwardFN)Tr  r  )NN)NNNNNFFN)r:   r;   r<   r   intr   staticmethodr&  r1  r8   r=   r)   r)   r'   r*   r   M  s    
0r   c                       s>   e Zd ZddedB f fddZ						d	ddZ  ZS )
 Pix2StructTextLayerSelfAttentionFNr  c                    s>   t    t|||d| _t|j|jd| _t	|j
| _d S )Nr   r  r   r   r   r   r   r   r%   r  r  r   rJ   rK   rL   r  r'   r)   r*   r     s   
z)Pix2StructTextLayerSelfAttention.__init__c              	   C   sJ   |  |}| j|||||||d}	|| |	d  }|f|	dd   }
|
S )N)r:  r   r<  r=  r   r,  r   r   r  r   rL   )r$   r6   r   r   r<  r=  r   r,  normed_hidden_statesr   r   r)   r)   r*   r8     s   

	z(Pix2StructTextLayerSelfAttention.forwardrC  )NNNFFNr:   r;   r<   rD  r   r8   r=   r)   r)   r'   r*   rF    s    rF  c                       s@   e Zd ZddedB f fddZ							d	ddZ  ZS )
!Pix2StructTextLayerCrossAttentionNr  c                    s>   t    t|d|d| _t|j|jd| _t	|j
| _d S )NFrG  r   rH  )r$   r@   r  r'   r)   r*   r   :  s   
z*Pix2StructTextLayerCrossAttention.__init__Fc
                 C   sN   |  |}
| j|
||||||||	d	}|| |d  }|f|dd   }|S )N)r:  r;  r   r<  r=  r*  r   r,  r   r   rI  )r$   r6   r;  r   r   r<  r=  r*  r   r,  rJ  r   r   r   r)   r)   r*   r8   @  s   
z)Pix2StructTextLayerCrossAttention.forwardrB   )NNNFNFNrK  r)   r)   r'   r*   rL  9  s    
rL  c                       sF   e Zd Zd	dedB f fddZ										d
ddZ  ZS )Pix2StructTextBlockFNr  c                    s6   t    t|||d| _t||d| _t|| _d S )NrG  )r  )r   r   rF  self_attentionrL  encoder_decoder_attentionr  r   r  r'   r)   r*   r   ^  s   
zPix2StructTextBlock.__init__Tc              
   C   sD  | j ||||||	|d}|d }|dd  }|jtjkr5t| r5t|jjd }tj|| |d}|d u}|rw| j	||||||d d ||	d}|d }|jtjkrot| rot|jjd }tj|| |d}||dd   }| 
|}|jtjkrt| rt|jjd }tj|| |d}|f}|| S )N)r   r   r<  r=  r   r,  r   r   i  )rz   r{   r,   )r;  r   r   r<  r*  r=  r   )rN  r3   r    r4   isinfanyry   r{   clamprO  r   )r$   r6   r   r   encoder_hidden_statesencoder_attention_maskencoder_decoder_position_biasr<  r=  r   r   r,  r   attention_outputsclamp_valuedo_cross_attentioncross_attention_outputsr   r)   r)   r*   r8   n  sJ   	


zPix2StructTextBlock.forwardrC  )
NNNNNNFFTNrK  r)   r)   r'   r*   rM  ]  s    rM  z3
    The standalone text decoder of Pix2Struct
    )custom_introc                       s   e Zd ZU eed< dZdgZddiZdZ fddZ	d	d
 Z
e												ddejdB dejdB dejdB dejdB dejdB dedB dedB dedB dedB dejdB dedB dejdB deejdf eB fddZ  ZS )r   r@   )r   rM  zlm_head.weightzembed_tokens.weightTc                    s   t    t j j| _t fddt j	D | _
t j jd| _t j| _tj j jdd| _|   d| _d S )Nc                    s"   g | ]}t  t|d k|dqS )r   rG  )rM  r   )r   r   r   r)   r*   r     s    z0Pix2StructTextModel.__init__.<locals>.<listcomp>r   FrY   )r   r   r   rF   
vocab_sizer%   embed_tokensr   r   
num_layersr   r   r  final_layer_normrJ   rK   rL   rC   r   r   re   rM   r'   r   r*   r     s   

zPix2StructTextModel.__init__c                 C   s
   || _ d S rB   )r\  r$   new_embeddingsr)   r)   r*   set_input_embeddings     
z(Pix2StructTextModel.set_input_embeddingsNr   r   rS  rT  inputs_embedsr<  r=  r   r   labelsr   r,  rA   .c           %      K   s  |dur|n| j j}|dur|n| j j}|	dur|	n| j j}	|dur$|n| j j}| jr7| jr7|r7td d}|durC|durCt	d|durT|
 }|d|d }n|dura|
 dd }nt	d|du rw| jdusrJ d| |}|\}}|r|du r| j jrtt| j dt| j d}nt| j d}d	}|dur|d	 }n|dur| }|du rtj||| |jd
}|du r|dur| | n|}tj|||jd
}| j jrt| j ||||d}n|ddddddf }|j|jd}d| t|jj }|dur$|
 \}}}||f}|du rtj||jd
}| |}nd}|	r+dnd}|r2dnd}|r9dnd}d}d}| |}t| jD ]G\}} |	rU||f }| ||||||||||d
}!|!d	 }|!d }|dury|!|rvdnd }|r||!d f }|dur||!d f }qI| |}| |}|  |}"|	r||f }d}#|
dur|
|"j}
t!j"ddd}$|$|"# d|"
d|
# d}#|st$dd |#|"||||fD S t%|#|"||||dS )aU  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Pix2StructText is a model with relative position
            embeddings so you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [Pix2StructText
            Training](./t5#training).

        Example:

        ```python
        >>> from transformers import AutoProcessor, Pix2StructTextModel

        >>> processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
        >>> model = Pix2StructTextModel.from_pretrained("google/pix2struct-textcaps-base")

        >>> inputs = processor(text="Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> loss = outputs.loss
        ```
        NzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timer,   zEYou have to specify either decoder_input_ids or decoder_inputs_embedsz<You have to initialize the model with valid token embeddingsr   r   )ro   )r@   rc  r   r,  r<  )r3   r   r)   )r<  r=  r   r,  r   r   r+      r   r1   )ignore_index	reductionc                 s   r   rB   r)   r   r)   r)   r*   r     s    	z.Pix2StructTextModel.forward.<locals>.<genexpr>)losslogitsr<  r6   r   cross_attentions)&r@   r=  r   r   r   re   rs   r  warningr   sizerh   r\  is_encoder_decoderr   r   get_seq_lengthr    r'  ro   r!   
is_decoderr
   r.   r3   ry   rz   invert_attention_maskrL   r   r   r^  r   r   CrossEntropyLossrg   r   r   )%r$   r   r   rS  rT  rc  r<  r=  r   r   rd  r   r,  r  input_shaperl   r   past_key_values_lengthmask_seq_lengthrB  encoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapeencoder_extended_attention_maskr   all_attentionsall_cross_attentionsr   rU  r6   r   r   r   ri  rh  loss_fctr)   r)   r*   r8     s   +












&zPix2StructTextModel.forward)NNNNNNNNNNNN)r:   r;   r<   r   r   r   r	  _tied_weights_keysr  r   ra  r   r    
LongTensorFloatTensorr   r   r   r   r8   r=   r)   r)   r'   r*   r     s`   
 	
r   zr
    A conditional generation model with a language modeling head. Can be used for sequence generation tasks.
    c                       s  e Zd ZU eed< dZdef fddZdd Zdd Zd	e	j
fd
dZdd Ze													ddejdB dejdB dejdB dejdB deeej  dB dedB dejdB dejdB dedB dedB dedB dedB dejdB d	eej eB fddZ  ZS )"Pix2StructForConditionalGenerationr@   rN   c                    s8   t  | t|j| _t|j| _|j| _| 	  d S rB   )
r   r   r   vision_configr   r   r   decoderis_vqar   rM   r'   r)   r*   r     s
   z+Pix2StructForConditionalGeneration.__init__c                 C   
   | j  S rB   )r  r   r   r)   r)   r*   r     rb  z7Pix2StructForConditionalGeneration.get_input_embeddingsc                 C      | j | d S rB   )r  ra  r_  r)   r)   r*   ra       z7Pix2StructForConditionalGeneration.set_input_embeddingsrA   c                 C   r  rB   )r  get_output_embeddingsr   r)   r)   r*   r    rb  z8Pix2StructForConditionalGeneration.get_output_embeddingsc                 C   r  rB   )r  set_output_embeddingsr_  r)   r)   r*   r    r  z8Pix2StructForConditionalGeneration.set_output_embeddingsNr   r   r   r  r<  rd  decoder_inputs_embedsr=  r   r   r   r,  c                 K   sB  |	dur|	n| j jj}	|dur|n| j j}|du r$| j|||
||d}n$|rHt|tsHt|d t|dkr9|d ndt|dkrD|d ndd}|d }|durt|du rt|du rt| |}|durc|n|	| j j
 }d|dddf< | j|||||||	|
||||d}|s|| S t|j|j|j|j|j|j|j|j|jd	S )	a  
        flattened_patches (`torch.FloatTensor` of shape `(batch_size, seq_length, hidden_size)`):
            Flattened pixel patches. the `hidden_size` is obtained by the following formula: `hidden_size` =
            `num_channels` * `patch_size` * `patch_size`

            The process of flattening the pixel patches is done by `Pix2StructProcessor`.
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Pix2StructText uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [Pix2StructText
            Training](./t5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss for the decoder.

        Example:

        Inference:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Pix2StructForConditionalGeneration

        >>> processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
        >>> model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-textcaps-base")

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> # autoregressive generation
        >>> generated_ids = model.generate(**inputs, max_new_tokens=50)
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> print(generated_text)
        A stop sign is on a street corner.

        >>> # conditional generation
        >>> text = "A picture of"
        >>> inputs = processor(text=text, images=image, return_tensors="pt", add_special_tokens=False)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=50)
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> print(generated_text)
        A picture of a stop sign with a red stop sign
        ```

        Training:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Pix2StructForConditionalGeneration

        >>> processor = AutoProcessor.from_pretrained("google/pix2struct-base")
        >>> model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-base")

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> text = "A stop sign is on the street corner."

        >>> inputs = processor(images=image, return_tensors="pt")
        >>> labels = processor(text=text, return_tensors="pt").input_ids

        >>> # forward pass
        >>> outputs = model(**inputs, labels=labels)
        >>> loss = outputs.loss
        >>> print(f"{loss.item():.5f}")
        5.94282
        ```N)rN   r   r   r   r   r   r   r+   r   )r   r   rc  r<  rS  rT  r=  r   r   rd  r   r,  )	rh  ri  r<  decoder_hidden_statesdecoder_attentionsrj  encoder_last_hidden_staterS  encoder_attentions)r@   r   r=  r   r   r   r   lenr   ner   r  r  r   rh  ri  r<  r6   r   rj  r   )r$   rN   r   r   r   r  r<  rd  r  r=  r   r   r   r,  r  r6   decoder_outputsr)   r)   r*   r8     sf   h
z*Pix2StructForConditionalGeneration.forward)NNNNNNNNNNNNN)r:   r;   r<   r   r   r  r   r   ra  r   Moduler  r  r   r    r~  r}  
BoolTensorr   r   rV   r   r   r8   r=   r)   r)   r'   r*   r    sf   
 	
r  )r   r  r   r   )@rU   r  r    r    r   r   activationsr   cache_utilsr   r   r   
generationr	   masking_utilsr
   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   utilsr   r   r   r   r   configuration_pix2structr   r   r   
get_loggerr:   r  r  r   apex.normalizationr>   infoImportError	Exceptionrk  r?   rW   r   r   r   r   r   r   r  r   rF  rL  rM  r   r  __all__r)   r)   r)   r*   <module>   sh   
$[))tc J#$S g D