o
    i                     @   s  d dl mZmZ d dlZd dlmZ d dlZd dlm	Z
 d dl	Zd dlmZmZmZ d dlmZmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d	d
lmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$ d	dl%m&Z&m'Z'm(Z(m)Z)m*Z* d	dl+m,Z,m-Z-m.Z.m/Z/ ddl0m1Z1 e/2e3Z4dZ5dZ6ej7Z7ej8j9G dd de,Z:dZ;dZ<G dd dej=Z>G dd dej=Z?G dd dej=Z@G dd dej=ZAG dd dej=ZBG dd  d ej=ZCG d!d" d"ej=ZDG d#d$ d$ej=ZEG d%d& d&ej=ZFG d'd( d(ej=ZGG d)d* d*ej=ZHG d+d, d,ej=ZIG d-d. d.ej=ZJG d/d0 d0ej=ZKG d1d2 d2ej=ZLG d3d4 d4e'ZMG d5d6 d6ej=ZNe-d7e;G d8d9 d9eMZOe(eOe5ee6 G d:d; d;ej=ZPe-d<e;G d=d> d>eMZQd?ZRe*eQe<Sd@eR  e)eQe:e6dA G dBdC dCej=ZTe-dDe;G dEdF dFeMZUe(eUe5ee6 G dGdH dHej=ZVe-dIe;G dJdK dKeMZWdLZXe*eWe<Sd@eX  e)eWe!e6dA G dMdN dNej=ZYe-dOe;G dPdQ dQeMZZe(eZe5e#e6 G dRdS dSej=Z[e-dTe;G dUdV dVeMZ\e*e\e<SdW e(e\e5e e6 G dXdY dYej=Z]e-dZe;G d[d\ d\eMZ^e(e^e5e$e6 G d]d^ d^ej=Z_e-d_e;G d`da daeMZ`e(e`e5e"e6 G dbdc dcej=Zae-dde;G dedf dfeMZbe(ebe5ee6 g dgZcdS )h    )CallableOptionalN)
FrozenDictfreezeunfreeze)combine_masksmake_causal_mask)partitioning)dot_product_attention_weights)flatten_dictunflatten_dict)lax   )
-FlaxBaseModelOutputWithPastAndCrossAttentionsFlaxBaseModelOutputWithPooling0FlaxBaseModelOutputWithPoolingAndCrossAttentions%FlaxCausalLMOutputWithCrossAttentionsFlaxMaskedLMOutputFlaxMultipleChoiceModelOutputFlaxNextSentencePredictorOutput FlaxQuestionAnsweringModelOutputFlaxSequenceClassifierOutputFlaxTokenClassifierOutput)ACT2FNFlaxPreTrainedModelappend_call_sample_docstring append_replace_return_docstringsoverwrite_call_docstring)ModelOutputadd_start_docstrings%add_start_docstrings_to_model_forwardlogging   )
BertConfigzgoogle-bert/bert-base-uncasedr#   c                   @   sZ   e Zd ZU dZdZejed< dZejed< dZ	e
eej  ed< dZe
eej  ed< dS )FlaxBertForPreTrainingOutputaI  
    Output type of [`BertForPreTraining`].

    Args:
        prediction_logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        seq_relationship_logits (`jnp.ndarray` of shape `(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
            before SoftMax).
        hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nprediction_logitsseq_relationship_logitshidden_states
attentions)__name__
__module____qualname____doc__r%   jnpndarray__annotations__r&   r'   r   tupler(    r1   r1   _/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/bert/modeling_flax_bert.pyr$   =   s   
 r$   a
  

    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading, saving and converting weights from PyTorch models)

    This model is also a
    [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
    a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
    behavior.

    Finally, this model supports inherent JAX features such as:

    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)

    Parameters:
        config ([`BertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
            `jax.numpy.bfloat16` (on TPUs).

            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
            specified all the computation will be performed with the given `dtype`.

            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
            parameters.**

            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
            [`~FlaxPreTrainedModel.to_bf16`].
        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
            `jax.numpy.bfloat16` (on TPUs).

            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
            specified all the computation will be performed with the given `dtype`.

            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
            parameters.**

            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
            [`~FlaxPreTrainedModel.to_bf16`].

a  
    Args:
        input_ids (`numpy.ndarray` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        head_mask (`numpy.ndarray` of shape `({0})`, `optional):
            Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

c                   @   sB   e Zd ZU dZeed< ejZejed< dd Z	dde
fdd	Zd
S )FlaxBertEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.configdtypec                 C   s   t j| jj| jjtj jj| jjd| j	d| _
t j| jj| jjtj jj| jjd| j	d| _t j| jj| jjtj jj| jjd| j	d| _t j| jj| j	d| _t j| jjd| _d S )N)stddev)embedding_initr5   epsilonr5   rate)nnEmbedr4   
vocab_sizehidden_sizejaxinitializersnormalinitializer_ranger5   word_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutselfr1   r1   r2   setup   s(   zFlaxBertEmbeddings.setupTdeterministicc           
      C   sX   |  |d}| |d}| |d}|| | }	| |	}	| j|	|d}	|	S )Ni4rQ   )rD   astyperF   rH   rI   rM   )
rO   	input_idstoken_type_idsposition_idsattention_maskrQ   inputs_embedsposition_embedsrH   r'   r1   r1   r2   __call__   s   
zFlaxBertEmbeddings.__call__NT)r)   r*   r+   r,   r#   r/   r-   float32r5   rP   boolr[   r1   r1   r1   r2   r3      s   
 r3   c                   @   s~   e Zd ZU eed< dZeed< ejZ	ej	ed< dd Z
dd Zd	d
 Zejdd Z				ddeej dedefddZdS )FlaxBertSelfAttentionr4   Fcausalr5   c                 C   s   | j j| j j | _| j j| j j dkrtdtj| j j| jtjj	
| j jd| _tj| j j| jtjj	
| j jd| _tj| j j| jtjj	
| j jd| _| jrettjd| j jfdddd| _d S d S )Nr   z`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads`                    : {self.config.num_attention_heads})r5   kernel_initr"   r^   r5   )r4   r?   num_attention_headshead_dim
ValueErrorr<   Denser5   r@   rA   rB   rC   querykeyvaluer`   r   r-   onesrE   causal_maskrN   r1   r1   r2   rP      s2   zFlaxBertSelfAttention.setupc                 C   s"   | |jd d | jj| jf S N   )reshapeshaper4   rc   rd   rO   r'   r1   r1   r2   _split_heads   s   "z"FlaxBertSelfAttention._split_headsc                 C   s   | |jd d | jjf S rl   )rn   ro   r4   r?   rp   r1   r1   r2   _merge_heads  s   z"FlaxBertSelfAttention._merge_headsc                 C   s   |  dd}| ddtj|j|j}| ddtj|j|j}| dddd }|rz|jj^ }	}
}}|j}dt|	 |ddf }t	|j||}t	|j||}||_||_|jd	 }|j| |_t
t|
|| k t|	d	||
f }t||}|||fS )
a\  
        This function takes projected key, value states from a single input token and concatenates the states to cached
        states from previous steps. This function is slightly adapted from the official Flax repository:
        https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
        cache
cached_keycached_valuecache_indexc                   S   s   t jdt jdS )Nr   rb   )r-   arrayint32r1   r1   r1   r2   <lambda>  s    z=FlaxBertSelfAttention._concatenate_to_cache.<locals>.<lambda>)r   r   r"   )has_variablevariabler-   zerosro   r5   ri   lenr   dynamic_update_slicebroadcast_toaranger0   r   )rO   rh   ri   rg   rX   is_initializedrt   ru   rv   
batch_dims
max_length	num_headsdepth_per_head	cur_indexindicesnum_updated_cache_vectorspad_maskr1   r1   r2   _concatenate_to_cache  s(   	


z+FlaxBertSelfAttention._concatenate_to_cacheNTkey_value_states
init_cacheoutput_attentionsc                 C   sx  |d u}|j d }	| |}
|r| |}| |}n
| |}| |}| |
}
| |}| |}| jr|
j d |j d }}| ddrj| jd d }| jd d j d }t	| j
dd|dfdd||f}n| j
d d d d d |d |f }t||	f|j dd   }|d ur| jrttj|dd|j }t||}n| jr|}n|d urtj|dd}| jr| dds|r| |||
|\}}}|d urt|dkt|j d| jt|j t| jj| j}nd }d }|s| jjdkr| d	}t|
|||| jjd
|| jd d	}|d urtd||}td||}||j d d d }|r7||f}|S |f}|S )Nr   r"   rs   rt   rv   )axisg        rM   T)biasdropout_rngdropout_ratebroadcast_dropoutrQ   r5   	precisionz...hqk,h->...hqkz...hqk,...khd->...qhdrm   ))ro   rg   rh   ri   rq   r`   rz   	variablesr   dynamic_slicerk   r-   r   expand_dimsr   r   selectfullrT   r5   finfominr4   attention_probs_dropout_probmake_rngr
   einsumrn   )rO   r'   rX   layer_head_maskr   r   rQ   r   is_cross_attention
batch_sizequery_states
key_statesvalue_statesquery_length
key_length
mask_shiftmax_decoder_lengthrk   attention_biasr   attn_weightsattn_outputoutputsr1   r1   r2   r[   &  sz   







"


zFlaxBertSelfAttention.__call__NFTF)r)   r*   r+   r#   r/   r`   r^   r-   r]   r5   rP   rq   rr   r<   compactr   r   r.   r[   r1   r1   r1   r2   r_      s(   
 
$r_   c                   @   >   e Zd ZU eed< ejZejed< dd Zd
de	fddZ
d	S )FlaxBertSelfOutputr4   r5   c                 C   sR   t j| jjtj j| jj| jd| _	t j
| jj| jd| _
t j| jjd| _d S )Nra   r5   r8   r:   )r<   rf   r4   r?   r@   rA   rB   rC   r5   denserI   rJ   rK   rL   rM   rN   r1   r1   r2   rP     s   zFlaxBertSelfOutput.setupTrQ   c                 C   *   |  |}| j||d}| || }|S NrS   r   rM   rI   )rO   r'   input_tensorrQ   r1   r1   r2   r[        
zFlaxBertSelfOutput.__call__Nr\   r)   r*   r+   r#   r/   r-   r]   r5   rP   r^   r[   r1   r1   r1   r2   r     
   
 	r   c                   @   sR   e Zd ZU eed< dZeed< ejZ	ej	ed< dd Z
				dd	efd
dZdS )FlaxBertAttentionr4   Fr`   r5   c                 C   s,   t | j| j| jd| _t| j| jd| _d S )Nr`   r5   rb   )r_   r4   r`   r5   rO   r   outputrN   r1   r1   r2   rP     s   zFlaxBertAttention.setupNTr   c              	   C   sL   | j |||||||d}|d }	| j|	||d}|f}
|r$|
|d f7 }
|
S )N)r   r   r   rQ   r   r   rS   r"   )rO   r   )rO   r'   rX   r   r   r   rQ   r   attn_outputsr   r   r1   r1   r2   r[     s   	zFlaxBertAttention.__call__r   )r)   r*   r+   r#   r/   r`   r^   r-   r]   r5   rP   r[   r1   r1   r1   r2   r     s   
 	r   c                   @   6   e Zd ZU eed< ejZejed< dd Zdd Z	dS )FlaxBertIntermediater4   r5   c                 C   s8   t j| jjtj j| jj| jd| _	t
| jj | _d S Nr   )r<   rf   r4   intermediate_sizer@   rA   rB   rC   r5   r   r   
hidden_act
activationrN   r1   r1   r2   rP     s   zFlaxBertIntermediate.setupc                 C   s   |  |}| |}|S N)r   r   rp   r1   r1   r2   r[     s   

zFlaxBertIntermediate.__call__N
r)   r*   r+   r#   r/   r-   r]   r5   rP   r[   r1   r1   r1   r2   r     s
   
 r   c                   @   r   )FlaxBertOutputr4   r5   c                 C   sR   t j| jjtj j| jj| jd| _	t j
| jjd| _t j| jj| jd| _d S )Nr   r:   r8   )r<   rf   r4   r?   r@   rA   rB   rC   r5   r   rK   rL   rM   rI   rJ   rN   r1   r1   r2   rP     s   zFlaxBertOutput.setupTrQ   c                 C   r   r   r   )rO   r'   attention_outputrQ   r1   r1   r2   r[     r   zFlaxBertOutput.__call__Nr\   r   r1   r1   r1   r2   r     r   r   c                   @   sd   e Zd ZU eed< ejZejed< dd Z					dde	ej
 d	e	ej
 d
ededef
ddZdS )FlaxBertLayerr4   r5   c                 C   s`   t | j| jj| jd| _t| j| jd| _t| j| jd| _| jj	r.t | jd| jd| _
d S d S )Nr   rb   F)r   r4   
is_decoderr5   	attentionr   intermediater   r   add_cross_attentioncrossattentionrN   r1   r1   r2   rP     s   zFlaxBertLayer.setupNFTencoder_hidden_statesencoder_attention_maskr   rQ   r   c	                 C   s   | j ||||||d}	|	d }
|d ur"| j|
|||||d}|d }
| |
}| j||
|d}|f}|rF||	d f7 }|d urF||d f7 }|S )N)r   r   rQ   r   r   )rX   r   r   rQ   r   rS   r"   )r   r   r   r   )rO   r'   rX   r   r   r   r   rQ   r   attention_outputsr   cross_attention_outputsr   r1   r1   r2   r[     s6   
zFlaxBertLayer.__call__)NNFTF)r)   r*   r+   r#   r/   r-   r]   r5   rP   r   r.   r^   r[   r1   r1   r1   r2   r     s(   
 	r   c                   @   |   e Zd ZU eed< ejZejed< dZe	ed< dd Z
							dd	eej d
eej de	de	de	de	de	fddZdS )FlaxBertLayerCollectionr4   r5   Fgradient_checkpointingc                    sT   j rttdd  fddtjjD _d S fddtjjD _d S )N)         )static_argnumsc                    s"   g | ]} j t|jd qS )namer5   )r4   strr5   .0iFlaxBertCheckpointLayerrO   r1   r2   
<listcomp>-  s    z1FlaxBertLayerCollection.setup.<locals>.<listcomp>c                    s"   g | ]}t  jt| jd qS r   )r   r4   r   r5   r   rN   r1   r2   r   2  s    )r   rematr   ranger4   num_hidden_layerslayersrN   r1   r   r2   rP   *  s   


zFlaxBertLayerCollection.setupNTr   r   r   rQ   r   output_hidden_statesreturn_dictc              
   C   s"  |rdnd }|	r
dnd }|r|d urdnd }|d ur5|j d t| jkr5tdt| j d|j d  dt| jD ]6\}}|	rE||f7 }||||d urP|| nd |||||}|d }|rp||d f7 }|d urp||d f7 }q:|	rx||f7 }||||f}|
stdd	 |D S t||||d
S )Nr1   r   z&The head_mask should be specified for z/ layers, but it is for                         .r"   rm   c                 s   s    | ]	}|d ur|V  qd S r   r1   )r   vr1   r1   r2   	<genexpr>l  s    z3FlaxBertLayerCollection.__call__.<locals>.<genexpr>)last_hidden_stater'   r(   cross_attentions)ro   r}   r   re   	enumerater0   r   )rO   r'   rX   	head_maskr   r   r   rQ   r   r   r   all_attentionsall_hidden_statesall_cross_attentionsr   layerlayer_outputsr   r1   r1   r2   r[   6  sP   

z FlaxBertLayerCollection.__call__NNFTFFTr)   r*   r+   r#   r/   r-   r]   r5   r   r^   rP   r   r.   r[   r1   r1   r1   r2   r   %  s6   
 	
r   c                   @   r   )FlaxBertEncoderr4   r5   Fr   c                 C   s   t | j| j| jd| _d S )Nr5   r   )r   r4   r5   r   r   rN   r1   r1   r2   rP   {  s
   zFlaxBertEncoder.setupNTr   r   r   rQ   r   r   r   c                 C   s   | j |||||||||	|
d
S )N)r   r   r   r   rQ   r   r   r   )r   )rO   r'   rX   r   r   r   r   rQ   r   r   r   r1   r1   r2   r[     s   zFlaxBertEncoder.__call__r   r   r1   r1   r1   r2   r   v  s6   
 	
r   c                   @   r   )FlaxBertPoolerr4   r5   c                 C   s*   t j| jjtj j| jj| jd| _	d S r   )
r<   rf   r4   r?   r@   rA   rB   rC   r5   r   rN   r1   r1   r2   rP     s
   zFlaxBertPooler.setupc                 C   s$   |d d df }|  |}t|S )Nr   )r   r<   tanh)rO   r'   cls_hidden_stater1   r1   r2   r[     s   

zFlaxBertPooler.__call__Nr   r1   r1   r1   r2   r     s
   
 r   c                   @   r   )FlaxBertPredictionHeadTransformr4   r5   c                 C   s>   t j| jj| jd| _t| jj | _t j	| jj
| jd| _	d S )Nrb   r8   )r<   rf   r4   r?   r5   r   r   r   r   rI   rJ   rN   r1   r1   r2   rP     s   z%FlaxBertPredictionHeadTransform.setupc                 C   s   |  |}| |}| |S r   )r   r   rI   rp   r1   r1   r2   r[     s   


z(FlaxBertPredictionHeadTransform.__call__Nr   r1   r1   r1   r2   r     s
   
 r   c                   @   sT   e Zd ZU eed< ejZejed< ej	j
jZedejf ed< dd Zd
dd	ZdS )FlaxBertLMPredictionHeadr4   r5   .	bias_initc                 C   sF   t | j| jd| _tj| jj| jdd| _| d| j	| jjf| _
d S )Nrb   F)r5   use_biasr   )r   r4   r5   	transformr<   rf   r>   decoderparamr   r   rN   r1   r1   r2   rP     s   zFlaxBertLMPredictionHead.setupNc                 C   sR   |  |}|d ur| jdd|jii|}n| |}t| j| j}||7 }|S )Nparamskernel)r  r  applyTr-   asarrayr   r5   )rO   r'   shared_embeddingr   r1   r1   r2   r[     s   

z!FlaxBertLMPredictionHead.__call__r   )r)   r*   r+   r#   r/   r-   r]   r5   r@   r<   rA   r|   r   r   npr.   rP   r[   r1   r1   r1   r2   r     s   
 r   c                   @   8   e Zd ZU eed< ejZejed< dd ZdddZ	dS )	FlaxBertOnlyMLMHeadr4   r5   c                 C   s   t | j| jd| _d S )Nrb   )r   r4   r5   predictionsrN   r1   r1   r2   rP        zFlaxBertOnlyMLMHead.setupNc                 C   s   | j ||d}|S Nr
  )r  )rO   r'   r
  r1   r1   r2   r[     s   zFlaxBertOnlyMLMHead.__call__r   r   r1   r1   r1   r2   r    s
   
 r  c                   @   s.   e Zd ZU ejZejed< dd Zdd ZdS )FlaxBertOnlyNSPHeadr5   c                 C   s   t jd| jd| _d S )Nrm   rb   )r<   rf   r5   seq_relationshiprN   r1   r1   r2   rP     r  zFlaxBertOnlyNSPHead.setupc                 C   s
   |  |S r   )r  )rO   pooled_outputr1   r1   r2   r[     s   
zFlaxBertOnlyNSPHead.__call__N)	r)   r*   r+   r-   r]   r5   r/   rP   r[   r1   r1   r1   r2   r    s   
 r  c                   @   r  )	FlaxBertPreTrainingHeadsr4   r5   c                 C   s(   t | j| jd| _tjd| jd| _d S )Nrb   rm   )r   r4   r5   r  r<   rf   r  rN   r1   r1   r2   rP     s   zFlaxBertPreTrainingHeads.setupNc                 C   s    | j ||d}| |}||fS r  )r  r  )rO   r'   r  r
  prediction_scoresseq_relationship_scorer1   r1   r2   r[     s   
z!FlaxBertPreTrainingHeads.__call__r   r   r1   r1   r1   r2   r    s
   
 r  c                       s   e Zd ZU dZeZdZdZej	e
d< ddejddfd	ed
ededejdedef fddZdd Zd#dejjd
ededefddZdd Zeed													d$dee dejjdedee dee dee d ee fd!d"Z  ZS )%FlaxBertPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    bertNmodule_class)r"   r"   r   TFr4   input_shapeseedr5   _do_initr   c           	         s4   | j d|||d|}t j||||||d d S )Nr4   r5   r   )r  r  r5   r  r1   )r  super__init__)	rO   r4   r  r  r5   r  r   kwargsmodule	__class__r1   r2   r     s   
z FlaxBertPreTrainedModel.__init__c                 C   s   | j | j| jdd| _d S )NTr  )r  r4   r5   _modulerN   r1   r1   r2   enable_gradient_checkpointing  s
   z5FlaxBertPreTrainedModel.enable_gradient_checkpointingrngr  returnc                 C   s"  t j|dd}t |}t t t |jd |}t |}t | j	j
| j	jf}tj|\}	}
|	|
d}| j	jrXt || j	jf }|}| jj||||||||dd	}n| jj||||||dd}|d }|d urtt|}tt|}| jD ]}|| ||< q|t | _tt|S |S )NrR   rb   r   )r  rM   F)r   r  )r-   r|   
zeros_liker   r   
atleast_2dro   	ones_likerj   r4   r   rc   r@   randomsplitr   r?   r"  initr   r   _missing_keyssetr   r   )rO   r'  r  r  rU   rV   rW   rX   r   
params_rngr   rngsr   r   module_init_outputsrandom_paramsmissing_keyr1   r1   r2   init_weights  sB   



z$FlaxBertPreTrainedModel.init_weightsc                 C   sl   t j||fdd}t j|dd}t t t |jd |j}| jjt	j
d|||ddd}t|d S )	aW  
        Args:
            batch_size (`int`):
                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
            max_length (`int`):
                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
                cache.
        rR   rb   r   r   FT)r   r   rs   )r-   rj   r+  r   r   r*  ro   r"  r.  r@   r,  PRNGKeyr   )rO   r   r   rU   rX   rW   init_variablesr1   r1   r2   r   F  s   
 z"FlaxBertPreTrainedModel.init_cachebatch_size, sequence_lengthr   trainr   r   r   past_key_valuesc                 C   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u r't|}|d u r;ttt|j	d |j	}|d u rDt
|}|d u rSt| j j| j jf}i }|	d ur]|	|d< d|pb| ji}| j jr|rr||d< dg}nd}| jj|tj|ddtj|ddtj|ddtj|ddtj|dd|||
 |||||d}|d ur|r|\}}t|d |d	< |S |d ur|s|\}}|d d
 t|d f |d
d   }|S | jj|tj|ddtj|ddtj|ddtj|ddtj|dd|
 ||||d}|S )Nr   rM   r  rs   FrR   rb   )rV   rW   r   r   r   rQ   r   r   r   r2  mutabler;  r"   )rV   rW   r   rQ   r   r   r   r2  )r4   r   r   r   r-   r)  r   r   r*  ro   r+  rj   r   rc   r  r   r"  r  rw   r   )rO   rU   rX   rV   rW   r   r   r   r  r   r:  r   r   r   r;  r2  inputsr<  r   r1   r1   r2   r[   Y  sv   
 
&z FlaxBertPreTrainedModel.__call__r   )NNNNNNNNFNNNN) r)   r*   r+   r,   r#   config_classbase_model_prefixr  r<   Moduler/   r-   r]   r0   intr5   r^   r   r&  r@   r,  r7  r   r6  r   r    BERT_INPUTS_DOCSTRINGformatr   dictr[   __classcell__r1   r1   r#  r2   r    sn   
  +	
r  c                   @   s   e Zd ZU eed< ejZejed< dZe	ed< dZ
e	ed< dd Z															dd
eej deej deej deej deej de	de	de	de	de	fddZd	S )FlaxBertModuler4   r5   Tadd_pooling_layerFr   c                 C   s>   t | j| jd| _t| j| j| jd| _t| j| jd| _d S )Nrb   r   )	r3   r4   r5   
embeddingsr   r   encoderr   poolerrN   r1   r1   r2   rP     s   zFlaxBertModule.setupNrV   rW   r   r   r   r   rQ   r   r   r   c                 C   s   |d u r	t |}|d u rt t t |jd |j}| j|||||	d}| j||||	||||
||d
}|d }| jrB| 	|nd }|s]|d u rS|f|dd   S ||f|dd   S t
|||j|j|jdS )Nr   rS   )r   rQ   r   r   r   r   r   r   r   r"   )r   pooler_outputr'   r(   r   )r-   r)  r   r   r*  ro   rH  rI  rG  rJ  r   r'   r(   r   )rO   rU   rX   rV   rW   r   r   r   r   rQ   r   r   r   r'   r   pooledr1   r1   r2   r[     s@   
 
zFlaxBertModule.__call__)
NNNNNFTFFT)r)   r*   r+   r#   r/   r-   r]   r5   rG  r^   r   rP   r   r.   r[   r1   r1   r1   r2   rF    sJ   
 	
rF  z^The bare Bert Model transformer outputting raw hidden-states without any specific head on top.c                   @      e Zd ZeZdS )FlaxBertModelN)r)   r*   r+   rF  r  r1   r1   r1   r2   rN        rN  c                	   @   ^   e Zd ZU eed< ejZejed< dZe	ed< dd Z
				dde	d	e	d
e	de	fddZdS )FlaxBertForPreTrainingModuler4   r5   Fr   c                 C   s,   t | j| j| jd| _t| j| jd| _d S )Nr  r4   r5   )rF  r4   r5   r   r  r  clsrN   r1   r1   r2   rP     s   z"FlaxBertForPreTrainingModule.setupTrQ   r   r   r   c
                 C   s   | j |||||||||	d	}
| jjr| j jd d d d }nd }|
d }|
d }| j|||d\}}|	s?||f|
d	d   S t|||
j|
jd
S )NrQ   r   r   r   r  rH  rD   	embeddingr   r"   r  rm   )r%   r&   r'   r(   )r  r4   tie_word_embeddingsr   rS  r$   r'   r(   )rO   rU   rX   rV   rW   r   rQ   r   r   r   r   r
  r'   r  r  r  r1   r1   r2   r[     s6   
z%FlaxBertForPreTrainingModule.__call__NTFFTr)   r*   r+   r#   r/   r-   r]   r5   r   r^   rP   r[   r1   r1   r1   r2   rQ    $   
 	
rQ  z
    Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
    sentence prediction (classification)` head.
    c                   @   rM  )FlaxBertForPreTrainingN)r)   r*   r+   rQ  r  r1   r1   r1   r2   rZ  J      rZ  a  
    Returns:

    Example:

    ```python
    >>> from transformers import AutoTokenizer, FlaxBertForPreTraining

    >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
    >>> model = FlaxBertForPreTraining.from_pretrained("google-bert/bert-base-uncased")

    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
    >>> outputs = model(**inputs)

    >>> prediction_logits = outputs.prediction_logits
    >>> seq_relationship_logits = outputs.seq_relationship_logits
    ```
r9  )output_typer>  c                	   @   rP  )FlaxBertForMaskedLMModuler4   r5   Fr   c                 C   .   t | jd| j| jd| _t| j| jd| _d S NF)r4   rG  r5   r   rR  rF  r4   r5   r   r  r  rS  rN   r1   r1   r2   rP   v     zFlaxBertForMaskedLMModule.setupTrQ   r   r   r   c
                 C   s   | j |||||||||	d	}
|
d }| jjr#| j jd d d d }nd }| j||d}|	s7|f|
dd   S t||
j|
jd	S )
NrT  r   r  rH  rD   rU  r  r"   logitsr'   r(   )r  r4   rV  r   rS  r   r'   r(   )rO   rU   rX   rV   rW   r   rQ   r   r   r   r   r'   r
  rc  r1   r1   r2   r[     s.   z"FlaxBertForMaskedLMModule.__call__NrW  rX  r1   r1   r1   r2   r]  q  $   
 	
r]  z2Bert Model with a `language modeling` head on top.c                   @   rM  )FlaxBertForMaskedLMN)r)   r*   r+   r]  r  r1   r1   r1   r2   re    s    re  c                	   @   rP  )'FlaxBertForNextSentencePredictionModuler4   r5   Fr   c                 C   s(   t | j| j| jd| _t| jd| _d S )Nr  rb   )rF  r4   r5   r   r  r  rS  rN   r1   r1   r2   rP     s   z-FlaxBertForNextSentencePredictionModule.setupTrQ   r   r   r   c
                 C   sj   |	d ur|	n| j j}	| j|||||||||	d	}
|
d }| |}|	s,|f|
dd   S t||
j|
jdS )NrT  r"   rm   rb  )r4   r   r  rS  r   r'   r(   )rO   rU   rX   rV   rW   r   rQ   r   r   r   r   r  seq_relationship_scoresr1   r1   r2   r[     s*   
z0FlaxBertForNextSentencePredictionModule.__call__NrW  rX  r1   r1   r1   r2   rf    rY  rf  zJBert Model with a `next sentence prediction (classification)` head on top.c                   @   rM  )!FlaxBertForNextSentencePredictionN)r)   r*   r+   rf  r  r1   r1   r1   r2   rh    rO  rh  a  
    Returns:

    Example:

    ```python
    >>> from transformers import AutoTokenizer, FlaxBertForNextSentencePrediction

    >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
    >>> model = FlaxBertForNextSentencePrediction.from_pretrained("google-bert/bert-base-uncased")

    >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
    >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
    >>> encoding = tokenizer(prompt, next_sentence, return_tensors="jax")

    >>> outputs = model(**encoding)
    >>> logits = outputs.logits
    >>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
    ```
c                	   @   rP  )'FlaxBertForSequenceClassificationModuler4   r5   Fr   c                 C   sZ   t | j| j| jd| _| jjd ur| jjn| jj}tj|d| _	tj
| jj| jd| _d S )Nr  r:   rb   rF  r4   r5   r   r  classifier_dropoutrL   r<   rK   rM   rf   
num_labels
classifierrO   rk  r1   r1   r2   rP     s   z-FlaxBertForSequenceClassificationModule.setupTrQ   r   r   r   c
                 C   d   | j |||||||||	d	}
|
d }| j||d}| |}|	s)|f|
dd   S t||
j|
jdS )NrT  r"   rS   rm   rb  )r  rM   rm  r   r'   r(   )rO   rU   rX   rV   rW   r   rQ   r   r   r   r   r  rc  r1   r1   r2   r[   %  *   
z0FlaxBertForSequenceClassificationModule.__call__NrW  rX  r1   r1   r1   r2   ri    s$   
 	
ri  z
    Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    c                   @   rM  )!FlaxBertForSequenceClassificationN)r)   r*   r+   ri  r  r1   r1   r1   r2   rq  L  r[  rq  c                	   @   rP  )FlaxBertForMultipleChoiceModuler4   r5   Fr   c                 C   s>   t | j| j| jd| _tj| jjd| _tj	d| jd| _
d S )Nr  r:   r"   rb   )rF  r4   r5   r   r  r<   rK   rL   rM   rf   rm  rN   r1   r1   r2   rP   d  s   z%FlaxBertForMultipleChoiceModule.setupTrQ   r   r   r   c
                 C   s   |j d }
|d ur|d|j d nd }|d ur!|d|j d nd }|d ur0|d|j d nd }|d ur?|d|j d nd }| j|||||||||	d	}|d }| j||d}| |}|d|
}|	sp|f|dd   S t||j|jdS )Nr"   r   rT  rS   rm   rb  )ro   rn   r  rM   rm  r   r'   r(   )rO   rU   rX   rV   rW   r   rQ   r   r   r   num_choicesr   r  rc  reshaped_logitsr1   r1   r2   r[   m  s6   

z(FlaxBertForMultipleChoiceModule.__call__NrW  rX  r1   r1   r1   r2   rr  _  rd  rr  z
    Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    c                   @   rM  )FlaxBertForMultipleChoiceN)r)   r*   r+   rr  r  r1   r1   r1   r2   ru    r[  ru  z(batch_size, num_choices, sequence_lengthc                	   @   rP  )$FlaxBertForTokenClassificationModuler4   r5   Fr   c                 C   s\   t | j| jd| jd| _| jjd ur| jjn| jj}tj|d| _	tj
| jj| jd| _d S )NFr4   r5   rG  r   r:   rb   rj  rn  r1   r1   r2   rP     s   z*FlaxBertForTokenClassificationModule.setupTrQ   r   r   r   c
                 C   ro  )NrT  r   rS   r"   rb  )r  rM   rm  r   r'   r(   )rO   rU   rX   rV   rW   r   rQ   r   r   r   r   r'   rc  r1   r1   r2   r[     rp  z-FlaxBertForTokenClassificationModule.__call__NrW  rX  r1   r1   r1   r2   rv    s$   
 	
rv  z
    Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    c                   @   rM  )FlaxBertForTokenClassificationN)r)   r*   r+   rv  r  r1   r1   r1   r2   rx    r[  rx  c                	   @   rP  )"FlaxBertForQuestionAnsweringModuler4   r5   Fr   c                 C   s2   t | j| jd| jd| _tj| jj| jd| _d S )NFrw  rb   )	rF  r4   r5   r   r  r<   rf   rl  
qa_outputsrN   r1   r1   r2   rP     s   z(FlaxBertForQuestionAnsweringModule.setupTrQ   r   r   r   c
                 C   s   | j |||||||||	d	}
|
d }| |}tj|| jjdd\}}|d}|d}|	s9||f|
dd   S t|||
j|
j	dS )NrT  r   r   r   r"   )start_logits
end_logitsr'   r(   )
r  rz  r-   r-  r4   rl  squeezer   r'   r(   )rO   rU   rX   rV   rW   r   rQ   r   r   r   r   r'   rc  r{  r|  r1   r1   r2   r[     s0   


z+FlaxBertForQuestionAnsweringModule.__call__NrW  rX  r1   r1   r1   r2   ry    rd  ry  z
    Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                   @   rM  )FlaxBertForQuestionAnsweringN)r)   r*   r+   ry  r  r1   r1   r1   r2   r~  3  r[  r~  c                   @   s   e Zd ZU eed< ejZejed< dZe	ed< dd Z
									dd	eej d
eej deej deej de	de	de	de	de	fddZdS )FlaxBertForCausalLMModuler4   r5   Fr   c                 C   r^  r_  r`  rN   r1   r1   r2   rP   K  ra  zFlaxBertForCausalLMModule.setupNTrV   r   r   r   r   rQ   r   r   r   c                 C   s   | j |||||||||	|
||d}|d }| jjr&| j jd d d d }nd }| j||d}|s:|f|dd   S t||j|j|jd	S )
N)r   r   r   rQ   r   r   r   r   r  rH  rD   rU  r  r"   )rc  r'   r(   r   )	r  r4   rV  r   rS  r   r'   r(   r   )rO   rU   rX   rW   rV   r   r   r   r   rQ   r   r   r   r   r'   r
  rc  r1   r1   r2   r[   T  s6   z"FlaxBertForCausalLMModule.__call__)	NNNNFTFFTr   r1   r1   r1   r2   r  F  sB   
 	
r  z
    Bert Model with a language modeling head on top (a linear layer on top of the hidden-states output) e.g for
    autoregressive tasks.
    c                   @   s.   e Zd ZeZddeej fddZdd Z	dS )FlaxBertForCausalLMNrX   c           	      C   s   |j \}}| ||}tj||fdd}|d ur(|jddd }t||d}nttj|ddd d d f ||f}|||dS )NrR   rb   r   r   r"   )r   r   )r;  rX   rW   )	ro   r   r-   rj   cumsumr   r~   r   r   )	rO   rU   r   rX   r   
seq_lengthr;  extended_attention_maskrW   r1   r1   r2   prepare_inputs_for_generation  s   
&z1FlaxBertForCausalLM.prepare_inputs_for_generationc                 C   s.   |j |d< |d d d dd f d |d< |S )Nr;  rW   r   r"   )r;  )rO   model_outputsmodel_kwargsr1   r1   r2   update_inputs_for_generation  s   
 z0FlaxBertForCausalLM.update_inputs_for_generationr   )
r)   r*   r+   r  r  r   r@   Arrayr  r  r1   r1   r1   r2   r    s    r  )
r  re  ru  rh  rZ  r~  rq  rx  rN  r  )dtypingr   r   flax
flax.linenlinenr<   r@   	jax.numpynumpyr-   r  flax.core.frozen_dictr   r   r   r   r   r	   nn_partitioningflax.linen.attentionr
   flax.traverse_utilr   r   r   modeling_flax_outputsr   r   r   r   r   r   r   r   r   r   modeling_flax_utilsr   r   r   r   r   utilsr   r   r    r!   configuration_bertr#   
get_loggerr)   logger_CHECKPOINT_FOR_DOC_CONFIG_FOR_DOCr   struct	dataclassr$   BERT_START_DOCSTRINGrB  r@  r3   r_   r   r   r   r   r   r   r   r   r   r   r  r  r  r  rF  rN  rQ  rZ  #FLAX_BERT_FOR_PRETRAINING_DOCSTRINGrC  r]  re  rf  rh  &FLAX_BERT_FOR_NEXT_SENT_PRED_DOCSTRINGri  rq  rr  ru  rv  rx  ry  r~  r  r  __all__r1   r1   r1   r2   <module>   s  0
0'+ ,*9Q'
 DG=:5==
;9A