o
    i                     @   s  d dl mZmZ d dlmZ d dlZd dlmZ	 d dlZ
d dlmZmZmZ d dlmZmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d	d
lmZmZmZmZmZmZm Z m!Z!m"Z" d	dl#m$Z$m%Z%m&Z&m'Z' d	dl(m)Z)m*Z*m+Z+ ddl,m-Z- e+.e/Z0dZ1dZ2ej3Z3dd Z4dZ5dZ6G dd dej7Z8G dd dej7Z9G dd dej7Z:G dd dej7Z;G dd dej7Z<G dd  d ej7Z=G d!d" d"ej7Z>G d#d$ d$ej7Z?G d%d& d&ej7Z@G d'd( d(ej7ZAG d)d* d*ej7ZBG d+d, d,ej7ZCG d-d. d.e%ZDG d/d0 d0ej7ZEe)d1e5G d2d3 d3eDZFe&eFe1ee2 G d4d5 d5ej7ZGe)d6e5G d7d8 d8eDZHe&eHe1ee2d9d: G d;d< d<ej7ZIe)d=e5G d>d? d?eDZJe&eJe1e!e2 G d@dA dAej7ZKe)dBe5G dCdD dDeDZLe'eLe6MdE e&eLe1ee2 G dFdG dGej7ZNe)dHe5G dIdJ dJeDZOe&eOe1e"e2 G dKdL dLej7ZPe)dMe5G dNdO dOeDZQe&eQe1e e2 G dPdQ dQej7ZRe)dRe5G dSdT dTeDZSe&eSe1ee2 g dUZTdS )V    )CallableOptionalN)
FrozenDictfreezeunfreeze)combine_masksmake_causal_mask)partitioning)dot_product_attention_weights)flatten_dictunflatten_dict)lax   )	-FlaxBaseModelOutputWithPastAndCrossAttentionsFlaxBaseModelOutputWithPooling0FlaxBaseModelOutputWithPoolingAndCrossAttentions%FlaxCausalLMOutputWithCrossAttentionsFlaxMaskedLMOutputFlaxMultipleChoiceModelOutput FlaxQuestionAnsweringModelOutputFlaxSequenceClassifierOutputFlaxTokenClassifierOutput)ACT2FNFlaxPreTrainedModelappend_call_sample_docstringoverwrite_call_docstring)add_start_docstrings%add_start_docstrings_to_model_forwardlogging   )RobertaConfigzFacebookAI/roberta-baser    c                 C   sx   | |k d}|jdkr)|d|jd f}tj|dd d| }|| j}ntj|dd d| }| d| S )a!  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        input_ids: jnp.ndarray
        padding_idx: int

    Returns: jnp.ndarray
    i4   r   axis)astypendimreshapeshapejnpcumsum)	input_idspadding_idxmaskincremental_indices r0   e/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/roberta/modeling_flax_roberta.py"create_position_ids_from_input_ids4   s   
r2   a   

    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading, saving and converting weights from PyTorch models)

    This model is also a
    [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
    a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
    behavior.

    Finally, this model supports inherent JAX features such as:

    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)

    Parameters:
        config ([`RobertaConfig`]): Model configuration class with all the parameters of the
            model. Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`numpy.ndarray` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        head_mask (`numpy.ndarray` of shape `({0})`, `optional):
            Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
c                   @   sB   e Zd ZU dZeed< ejZejed< dd Z	dde
fdd	Zd
S )FlaxRobertaEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.configdtypec                 C   s   t j| jj| jjtj jj| jjd| j	d| _
t j| jj| jjtj jj| jjd| j	d| _t j| jj| jjtj jj| jjd| j	d| _t j| jj| j	d| _t j| jjd| _d S )N)stddev)embedding_initr5   epsilonr5   rate)nnEmbedr4   
vocab_sizehidden_sizejaxinitializersnormalinitializer_ranger5   word_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutselfr0   r0   r1   setup   s(   zFlaxRobertaEmbeddings.setupTdeterministicc           
      C   sX   |  |d}| |d}| |d}|| | }	| |	}	| j|	|d}	|	S )Nr!   rQ   )rD   r&   rF   rH   rI   rM   )
rO   r,   token_type_idsposition_idsattention_maskrQ   inputs_embedsposition_embedsrH   hidden_statesr0   r0   r1   __call__   s   
zFlaxRobertaEmbeddings.__call__NT)__name__
__module____qualname____doc__r    __annotations__r*   float32r5   rP   boolrY   r0   r0   r0   r1   r3      s   
 r3   c                   @   s~   e Zd ZU eed< dZeed< ejZ	ej	ed< dd Z
dd Zd	d
 Zejdd Z				ddeej dedefddZdS )FlaxRobertaSelfAttentionr4   Fcausalr5   c                 C   s   | j j| j j | _| j j| j j dkrtdtj| j j| jtjj	
| j jd| _tj| j j| jtjj	
| j jd| _tj| j j| jtjj	
| j jd| _| jrettjd| j jfdddd| _d S d S )Nr   z`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads`                    : {self.config.num_attention_heads}r5   kernel_initr   ra   r5   )r4   r?   num_attention_headshead_dim
ValueErrorr<   Denser5   r@   rA   rB   rC   querykeyvaluerc   r   r*   onesrE   causal_maskrN   r0   r0   r1   rP      s2   zFlaxRobertaSelfAttention.setupc                 C   s"   | |jd d | jj| jf S Nr"   )r(   r)   r4   rg   rh   rO   rX   r0   r0   r1   _split_heads   s   "z%FlaxRobertaSelfAttention._split_headsc                 C   s   | |jd d | jjf S rp   )r(   r)   r4   r?   rq   r0   r0   r1   _merge_heads   s   z%FlaxRobertaSelfAttention._merge_headsc                 C   s   |  dd}| ddtj|j|j}| ddtj|j|j}| dddd }|rz|jj^ }	}
}}|j}dt|	 |ddf }t	|j||}t	|j||}||_||_|jd	 }|j| |_t
t|
|| k t|	d	||
f }t||}|||fS )
a\  
        This function takes projected key, value states from a single input token and concatenates the states to cached
        states from previous steps. This function is slightly adapted from the official Flax repository:
        https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
        cache
cached_keycached_valuecache_indexc                   S   s   t jdt jdS )Nr   rf   )r*   arrayint32r0   r0   r0   r1   <lambda>   s    z@FlaxRobertaSelfAttention._concatenate_to_cache.<locals>.<lambda>)r   r   r   )has_variablevariabler*   zerosr)   r5   rm   lenr   dynamic_update_slicebroadcast_toarangetupler   )rO   rl   rm   rk   rU   is_initializedru   rv   rw   
batch_dims
max_length	num_headsdepth_per_head	cur_indexindicesnum_updated_cache_vectorspad_maskr0   r0   r1   _concatenate_to_cache   s(   	


z.FlaxRobertaSelfAttention._concatenate_to_cacheNTkey_value_states
init_cacheoutput_attentionsc                 C   sx  |d u}|j d }	| |}
|r| |}| |}n
| |}| |}| |
}
| |}| |}| jr|
j d |j d }}| ddrj| jd d }| jd d j d }t	| j
dd|dfdd||f}n| j
d d d d d |d |f }t||	f|j dd   }|d ur| jrttj|dd|j }t||}n| jr|}n|d urtj|dd}| jr| dds|r| |||
|\}}}|d urt|dkt|j d| jt|j t| jj| j}nd }d }|s| jjdkr| d	}t|
|||| jjd
|| jd d	}|d urtd||}td||}||j d d d }|r7||f}|S |f}|S )Nr   r   rt   ru   rw   )r$   g        rM   T)biasdropout_rngdropout_ratebroadcast_dropoutrQ   r5   	precisionz...hqk,h->...hqkz...hqk,...khd->...qhdr"   )r#   )r)   rk   rl   rm   rr   rc   r{   	variablesr   dynamic_slicero   r*   r   expand_dimsr   r   selectfullr&   r5   finfominr4   attention_probs_dropout_probmake_rngr
   einsumr(   )rO   rX   rU   layer_head_maskr   r   rQ   r   is_cross_attention
batch_sizequery_states
key_statesvalue_statesquery_length
key_length
mask_shiftmax_decoder_lengthro   attention_biasr   attn_weightsattn_outputoutputsr0   r0   r1   rY      sz   







"


z!FlaxRobertaSelfAttention.__call__NFTF)r[   r\   r]   r    r_   rc   ra   r*   r`   r5   rP   rr   rs   r<   compactr   r   ndarrayrY   r0   r0   r0   r1   rb      s(   
 
$rb   c                   @   >   e Zd ZU eed< ejZejed< dd Zd
de	fddZ
d	S )FlaxRobertaSelfOutputr4   r5   c                 C   sR   t j| jjtj j| jj| jd| _	t j
| jj| jd| _
t j| jjd| _d S )Nre   r5   r8   r:   )r<   rj   r4   r?   r@   rA   rB   rC   r5   denserI   rJ   rK   rL   rM   rN   r0   r0   r1   rP   f  s   zFlaxRobertaSelfOutput.setupTrQ   c                 C   *   |  |}| j||d}| || }|S NrR   r   rM   rI   )rO   rX   input_tensorrQ   r0   r0   r1   rY   o     
zFlaxRobertaSelfOutput.__call__NrZ   r[   r\   r]   r    r_   r*   r`   r5   rP   ra   rY   r0   r0   r0   r1   r   b  
   
 	r   c                   @   sR   e Zd ZU eed< dZeed< ejZ	ej	ed< dd Z
				dd	efd
dZdS )FlaxRobertaAttentionr4   Frc   r5   c                 C   s,   t | j| j| jd| _t| j| jd| _d S )Nrc   r5   rf   )rb   r4   rc   r5   rO   r   outputrN   r0   r0   r1   rP   |  s   zFlaxRobertaAttention.setupNTr   c              	   C   sL   | j |||||||d}|d }	| j|	||d}|f}
|r$|
|d f7 }
|
S )N)r   r   r   rQ   r   r   rR   r   )rO   r   )rO   rX   rU   r   r   r   rQ   r   attn_outputsr   r   r0   r0   r1   rY     s   	zFlaxRobertaAttention.__call__r   )r[   r\   r]   r    r_   rc   ra   r*   r`   r5   rP   rY   r0   r0   r0   r1   r   w  s   
 	r   c                   @   6   e Zd ZU eed< ejZejed< dd Zdd Z	dS )FlaxRobertaIntermediater4   r5   c                 C   s8   t j| jjtj j| jj| jd| _	t
| jj | _d S Nr   )r<   rj   r4   intermediate_sizer@   rA   rB   rC   r5   r   r   
hidden_act
activationrN   r0   r0   r1   rP     s   zFlaxRobertaIntermediate.setupc                 C   s   |  |}| |}|S N)r   r   rq   r0   r0   r1   rY     s   

z FlaxRobertaIntermediate.__call__N
r[   r\   r]   r    r_   r*   r`   r5   rP   rY   r0   r0   r0   r1   r     s
   
 r   c                   @   r   )FlaxRobertaOutputr4   r5   c                 C   sR   t j| jjtj j| jj| jd| _	t j
| jjd| _t j| jj| jd| _d S )Nr   r:   r8   )r<   rj   r4   r?   r@   rA   rB   rC   r5   r   rK   rL   rM   rI   rJ   rN   r0   r0   r1   rP     s   zFlaxRobertaOutput.setupTrQ   c                 C   r   r   r   )rO   rX   attention_outputrQ   r0   r0   r1   rY     r   zFlaxRobertaOutput.__call__NrZ   r   r0   r0   r0   r1   r     r   r   c                   @   sd   e Zd ZU eed< ejZejed< dd Z					dde	ej
 d	e	ej
 d
ededef
ddZdS )FlaxRobertaLayerr4   r5   c                 C   s`   t | j| jj| jd| _t| j| jd| _t| j| jd| _| jj	r.t | jd| jd| _
d S d S )Nr   rf   F)r   r4   
is_decoderr5   	attentionr   intermediater   r   add_cross_attentioncrossattentionrN   r0   r0   r1   rP     s   zFlaxRobertaLayer.setupNFTencoder_hidden_statesencoder_attention_maskr   rQ   r   c	                 C   s   | j ||||||d}	|	d }
|d ur"| j|
|||||d}|d }
| |
}| j||
|d}|f}|rF||	d f7 }|d urF||d f7 }|S )N)r   r   rQ   r   r   )rU   r   r   rQ   r   rR   r   )r   r   r   r   )rO   rX   rU   r   r   r   r   rQ   r   attention_outputsr   cross_attention_outputsr   r0   r0   r1   rY     s6   
zFlaxRobertaLayer.__call__)NNFTF)r[   r\   r]   r    r_   r*   r`   r5   rP   r   r   ra   rY   r0   r0   r0   r1   r     s(   
 	r   c                   @   |   e Zd ZU eed< ejZejed< dZe	ed< dd Z
							dd	eej d
eej de	de	de	de	de	fddZdS )FlaxRobertaLayerCollectionr4   r5   Fgradient_checkpointingc                    sT   j rttdd  fddtjjD _d S fddtjjD _d S )N)         )static_argnumsc                    s"   g | ]} j t|jd qS )namer5   )r4   strr5   .0iFlaxRobertaCheckpointLayerrO   r0   r1   
<listcomp>      z4FlaxRobertaLayerCollection.setup.<locals>.<listcomp>c                    s"   g | ]}t  jt| jd qS r   )r   r4   r   r5   r   rN   r0   r1   r     r   )r   rematr   ranger4   num_hidden_layerslayersrN   r0   r   r1   rP   	  s   


z FlaxRobertaLayerCollection.setupNTr   r   r   rQ   r   output_hidden_statesreturn_dictc              
   C   s"  |rdnd }|	r
dnd }|r|d urdnd }|d ur5|j d t| jkr5tdt| j d|j d  dt| jD ]6\}}|	rE||f7 }||||d urP|| nd |||||}|d }|rp||d f7 }|d urp||d f7 }q:|	rx||f7 }||||f}|
stdd	 |D S t||||d
S )Nr0   r   z&The head_mask should be specified for z/ layers, but it is for                         .r   r"   c                 s   s    | ]	}|d ur|V  qd S r   r0   )r   vr0   r0   r1   	<genexpr>L  s    z6FlaxRobertaLayerCollection.__call__.<locals>.<genexpr>)last_hidden_staterX   
attentionscross_attentions)r)   r~   r   ri   	enumerater   r   )rO   rX   rU   	head_maskr   r   r   rQ   r   r   r   all_attentionsall_hidden_statesall_cross_attentionsr   layerlayer_outputsr   r0   r0   r1   rY     sP   

z#FlaxRobertaLayerCollection.__call__NNFTFFTr[   r\   r]   r    r_   r*   r`   r5   r   ra   rP   r   r   rY   r0   r0   r0   r1   r     s6   
 	
r   c                   @   r   )FlaxRobertaEncoderr4   r5   Fr   c                 C   s   t | j| j| jd| _d S )Nr5   r   )r   r4   r5   r   r   rN   r0   r0   r1   rP   \  s
   zFlaxRobertaEncoder.setupNTr   r   r   rQ   r   r   r   c                 C   s   | j |||||||||	|
d
S )N)r   r   r   r   rQ   r   r   r   )r   )rO   rX   rU   r   r   r   r   rQ   r   r   r   r0   r0   r1   rY   c  s   zFlaxRobertaEncoder.__call__r   r   r0   r0   r0   r1   r   W  s6   
 	
r   c                   @   r   )FlaxRobertaPoolerr4   r5   c                 C   s*   t j| jjtj j| jj| jd| _	d S r   )
r<   rj   r4   r?   r@   rA   rB   rC   r5   r   rN   r0   r0   r1   rP     s
   zFlaxRobertaPooler.setupc                 C   s$   |d d df }|  |}t|S )Nr   )r   r<   tanh)rO   rX   cls_hidden_stater0   r0   r1   rY     s   

zFlaxRobertaPooler.__call__Nr   r0   r0   r0   r1   r     s
   
 r   c                   @   sT   e Zd ZU eed< ejZejed< ej	j
jZedejf ed< dd Zd
dd	ZdS )FlaxRobertaLMHeadr4   r5   .	bias_initc                 C   s   t j| jj| jtj j| jjd| _	t j
| jj| jd| _t j| jj| jdtj j| jjd| _| d| j| jjf| _d S )Nrd   r8   F)r5   use_biasre   r   )r<   rj   r4   r?   r5   r@   rA   rB   rC   r   rI   rJ   
layer_normr>   decoderparamr  r   rN   r0   r0   r1   rP     s   zFlaxRobertaLMHead.setupNc                 C   sh   |  |}td |}| |}|d ur!| jdd|jii|}n| |}t| j| j	}||7 }|S )Ngeluparamskernel)
r   r   r  r  applyTr*   asarrayr   r5   )rO   rX   shared_embeddingr   r0   r0   r1   rY     s   


zFlaxRobertaLMHead.__call__r   )r[   r\   r]   r    r_   r*   r`   r5   r@   r<   rA   r}   r  r   npr   rP   rY   r0   r0   r0   r1   r     s   
 r   c                   @   s8   e Zd ZU eed< ejZejed< dd Zd	ddZ	dS )
FlaxRobertaClassificationHeadr4   r5   c                 C   sz   t j| jj| jtj j| jjd| _	| jj
d ur| jj
n| jj}t j|d| _t j| jj| jtj j| jjd| _d S )Nrd   r:   )r<   rj   r4   r?   r5   r@   rA   rB   rC   r   classifier_dropoutrL   rK   rM   
num_labelsout_projrO   r  r0   r0   r1   rP     s   z#FlaxRobertaClassificationHead.setupTc                 C   sT   |d d dd d f }| j ||d}| |}t|}| j ||d}| |}|S )Nr   rR   )rM   r   r<   r   r  )rO   rX   rQ   r0   r0   r1   rY     s   


z&FlaxRobertaClassificationHead.__call__NrZ   r   r0   r0   r0   r1   r    s
   
 r  c                       s   e Zd ZU dZeZdZdZej	e
d< ddejddfd	ed
ededejdedef fddZdd Zd#dejjd
ededefddZdd Zeed													d$dee dejjdedee dee dee d ee fd!d"Z  ZS )%FlaxRobertaPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    robertaNmodule_class)r   r   r   TFr4   input_shapeseedr5   _do_initr   c           	         s4   | j d|||d|}t j||||||d d S )Nr4   r5   r   )r  r  r5   r  r0   )r  super__init__)	rO   r4   r  r  r5   r  r   kwargsmodule	__class__r0   r1   r    s   
z#FlaxRobertaPreTrainedModel.__init__c                 C   s   | j | j| jdd| _d S )NTr  )r  r4   r5   _modulerN   r0   r0   r1   enable_gradient_checkpointing  s
   z8FlaxRobertaPreTrainedModel.enable_gradient_checkpointingrngr  returnc                 C   s  t j|dd}t |}t|| jj}t |}t | jj| jjf}t	j
|\}	}
|	|
d}| jjrPt || jjf }|}| jj||||||||dd	}n| jj||||||dd}|d }|d urtt|}tt|}| jD ]}|| ||< qtt | _tt|S |S )Nr!   rf   )r  rM   F)r   r  )r*   r}   	ones_liker2   r4   pad_token_idrn   r   rg   r@   randomsplitr   r?   r  initr   r   _missing_keyssetr   r   )rO   r"  r  r  r,   rS   rT   rU   r   
params_rngr   rngsr   r   module_init_outputsrandom_paramsmissing_keyr0   r0   r1   init_weights  sB   



z'FlaxRobertaPreTrainedModel.init_weightsc                 C   sl   t j||fdd}t j|dd}t t t |jd |j}| jjt	j
d|||ddd}t|d S )	aW  
        Args:
            batch_size (`int`):
                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
            max_length (`int`):
                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
                cache.
        r!   rf   r#   r   FT)r   r   rt   )r*   rn   r$  r   r   
atleast_2dr)   r  r(  r@   r&  PRNGKeyr   )rO   r   r   r,   rU   rT   init_variablesr0   r0   r1   r     s   
 z%FlaxRobertaPreTrainedModel.init_cachezbatch_size, sequence_lengthr   trainr   r   r   past_key_valuesc                 C   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u r't|}|d u r2t|| j j}|d u r;t|}|d u rJt	| j j
| j jf}i }|	d urT|	|d< d|pY| ji}| j jr|ri||d< dg}nd}| jj|tj|ddtj|ddtj|ddtj|ddtj|dd|||
 |||||d}|d ur|r|\}}t|d |d< |S |d ur|s|\}}|d d	 t|d f |d	d   }|S | jj|tj|ddtj|ddtj|ddtj|ddtj|dd|
 ||||d
}|S )NrM   r  rt   Fr!   rf   )rS   rT   r   r   r   rQ   r   r   r   r,  mutabler5  r   )rS   rT   r   rQ   r   r   r   r,  )r4   r   r   r   r*   
zeros_liker2   r%  r$  rn   r   rg   r  r   r  r	  rx   r   )rO   r,   rU   rS   rT   r   r   r   r  r   r4  r   r   r   r5  r,  inputsr6  r   r0   r0   r1   rY   1  sv   

&z#FlaxRobertaPreTrainedModel.__call__r   )NNNNNNNNFNNNN) r[   r\   r]   r^   r    config_classbase_model_prefixr  r<   Moduler_   r*   r`   r   intr5   ra   r  r!  r@   r&  r2  r   r0  r   r   ROBERTA_INPUTS_DOCSTRINGformatr   dictrY   __classcell__r0   r0   r  r1   r    sn   
  +	
r  c                   @   s   e Zd ZU eed< ejZejed< dZe	ed< dZ
e	ed< dd Z															dd
eej deej deej deej deej de	de	de	de	de	fddZd	S )FlaxRobertaModuler4   r5   Tadd_pooling_layerFr   c                 C   s>   t | j| jd| _t| j| j| jd| _t| j| jd| _d S )Nrf   r   )	r3   r4   r5   
embeddingsr   r   encoderr   poolerrN   r0   r0   r1   rP     s   zFlaxRobertaModule.setupNrS   rT   r   r   r   r   rQ   r   r   r   c                 C   s   |d u r	t |}|d u rt t t |jd |j}| j|||||	d}| j||||	||||
||d
}|d }| jrB| 	|nd }|s]|d u rS|f|dd   S ||f|dd   S t
|||j|j|jdS )Nr#   rR   )r   rQ   r   r   r   r   r   r   r   r   )r   pooler_outputrX   r   r   )r*   r7  r   r   r1  r)   rC  rD  rB  rE  r   rX   r   r   )rO   r,   rU   rS   rT   r   r   r   r   rQ   r   r   r   rX   r   pooledr0   r0   r1   rY     s@   
 
zFlaxRobertaModule.__call__)
NNNNNFTFFT)r[   r\   r]   r    r_   r*   r`   r5   rB  ra   r   rP   r   r   rY   r0   r0   r0   r1   rA    sJ   
 	
rA  zaThe bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.c                   @      e Zd ZeZdS )FlaxRobertaModelN)r[   r\   r]   rA  r  r0   r0   r0   r1   rI    s    rI  c                	   @   ^   e Zd ZU eed< ejZejed< dZe	ed< dd Z
				dde	d	e	d
e	de	fddZdS )FlaxRobertaForMaskedLMModuler4   r5   Fr   c                 C   .   t | jd| j| jd| _t| j| jd| _d S NF)r4   rB  r5   r   r4   r5   rA  r4   r5   r   r  r   lm_headrN   r0   r0   r1   rP        z"FlaxRobertaForMaskedLMModule.setupTrQ   r   r   r   c
                 C   s   | j |||||||||	d	}
|
d }| jjr#| j jd d d d }nd }| j||d}|	s7|f|
dd   S t||
j|
jd	S )
NrQ   r   r   r   r   r  rC  rD   	embeddingr  r   logitsrX   r   )r  r4   tie_word_embeddingsr   rP  r   rX   r   )rO   r,   rU   rS   rT   r   rQ   r   r   r   r   rX   r  rV  r0   r0   r1   rY     s.   z%FlaxRobertaForMaskedLMModule.__call__NTFFTr[   r\   r]   r    r_   r*   r`   r5   r   ra   rP   rY   r0   r0   r0   r1   rK    $   
 	
rK  z5RoBERTa Model with a `language modeling` head on top.c                   @   rH  )FlaxRobertaForMaskedLMN)r[   r\   r]   rK  r  r0   r0   r0   r1   r[     s    r[  z<mask>)r.   c                	   @   rJ  )*FlaxRobertaForSequenceClassificationModuler4   r5   Fr   c                 C   s.   t | j| jd| jd| _t| j| jd| _d S )NFr4   r5   rB  r   rN  )rA  r4   r5   r   r  r  
classifierrN   r0   r0   r1   rP   3  s   z0FlaxRobertaForSequenceClassificationModule.setupTrQ   r   r   r   c
                 C   sZ   | j |||||||||	d	}
|
d }| j||d}|	s$|f|
dd   S t||
j|
jdS NrR  r   rR   r   rU  )r  r^  r   rX   r   )rO   r,   rU   rS   rT   r   rQ   r   r   r   r   sequence_outputrV  r0   r0   r1   rY   <  s(   z3FlaxRobertaForSequenceClassificationModule.__call__NrX  rY  r0   r0   r0   r1   r\  .  rZ  r\  z
    Roberta Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                   @   rH  )$FlaxRobertaForSequenceClassificationN)r[   r\   r]   r\  r  r0   r0   r0   r1   ra  b      ra  c                	   @   rJ  )"FlaxRobertaForMultipleChoiceModuler4   r5   Fr   c                 C   s>   t | j| j| jd| _tj| jjd| _tj	d| jd| _
d S )Nr  r:   r   rf   )rA  r4   r5   r   r  r<   rK   rL   rM   rj   r^  rN   r0   r0   r1   rP   {  s   z(FlaxRobertaForMultipleChoiceModule.setupTrQ   r   r   r   c
                 C   s   |j d }
|d ur|d|j d nd }|d ur!|d|j d nd }|d ur0|d|j d nd }|d ur?|d|j d nd }| j|||||||||	d	}|d }| j||d}| |}|d|
}|	sp|f|dd   S t||j|jdS )Nr   r#   rR  rR   r"   rU  )r)   r(   r  rM   r^  r   rX   r   )rO   r,   rU   rS   rT   r   rQ   r   r   r   num_choicesr   pooled_outputrV  reshaped_logitsr0   r0   r1   rY     s6   

z+FlaxRobertaForMultipleChoiceModule.__call__NrX  rY  r0   r0   r0   r1   rc  v  rZ  rc  z
    Roberta Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    c                   @   rH  )FlaxRobertaForMultipleChoiceN)r[   r\   r]   rc  r  r0   r0   r0   r1   rg    rb  rg  z(batch_size, num_choices, sequence_lengthc                	   @   rJ  )'FlaxRobertaForTokenClassificationModuler4   r5   Fr   c                 C   s\   t | j| jd| jd| _| jjd ur| jjn| jj}tj|d| _	tj
| jj| jd| _d S )NFr]  r:   rf   )rA  r4   r5   r   r  r  rL   r<   rK   rM   rj   r  r^  r  r0   r0   r1   rP     s   z-FlaxRobertaForTokenClassificationModule.setupTrQ   r   r   r   c
                 C   sd   | j |||||||||	d	}
|
d }| j||d}| |}|	s)|f|
dd   S t||
j|
jdS r_  )r  rM   r^  r   rX   r   )rO   r,   rU   rS   rT   r   rQ   r   r   r   r   rX   rV  r0   r0   r1   rY     s*   
z0FlaxRobertaForTokenClassificationModule.__call__NrX  rY  r0   r0   r0   r1   rh    s$   
 	
rh  z
    Roberta Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    c                   @   rH  )!FlaxRobertaForTokenClassificationN)r[   r\   r]   rh  r  r0   r0   r0   r1   ri    rb  ri  c                	   @   rJ  )%FlaxRobertaForQuestionAnsweringModuler4   r5   Fr   c                 C   s2   t | j| jd| jd| _tj| jj| jd| _d S )NFr]  rf   )	rA  r4   r5   r   r  r<   rj   r  
qa_outputsrN   r0   r0   r1   rP     s   z+FlaxRobertaForQuestionAnsweringModule.setupTrQ   r   r   r   c
                 C   s   | j |||||||||	d	}
|
d }| |}tj|| jjdd\}}|d}|d}|	s9||f|
dd   S t|||
j|
j	dS )NrR  r   r#   r$   r   )start_logits
end_logitsrX   r   )
r  rk  r*   r'  r4   r  squeezer   rX   r   )rO   r,   rU   rS   rT   r   rQ   r   r   r   r   rX   rV  rl  rm  r0   r0   r1   rY   '  s0   


z.FlaxRobertaForQuestionAnsweringModule.__call__NrX  rY  r0   r0   r0   r1   rj    rZ  rj  z
    Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                   @   rH  )FlaxRobertaForQuestionAnsweringN)r[   r\   r]   rj  r  r0   r0   r0   r1   ro  R  rb  ro  c                   @   s   e Zd ZU eed< ejZejed< dZe	ed< dd Z
									dd	eej d
eej deej deej de	de	de	de	de	fddZdS )FlaxRobertaForCausalLMModuler4   r5   Fr   c                 C   rL  rM  rO  rN   r0   r0   r1   rP   j  rQ  z"FlaxRobertaForCausalLMModule.setupNTrS   r   r   r   r   rQ   r   r   r   c                 C   s   | j |||||||||	|
||d}|d }| jjr&| j jd d d d }nd }| j||d}|s:|f|dd   S t||j|j|jd	S )
N)r   r   r   rQ   r   r   r   r   r  rC  rD   rS  rT  r   )rV  rX   r   r   )	r  r4   rW  r   rP  r   rX   r   r   )rO   r,   rU   rT   rS   r   r   r   r   rQ   r   r   r   r   rX   r  rV  r0   r0   r1   rY   s  s6   z%FlaxRobertaForCausalLMModule.__call__)	NNNNFTFFTr   r0   r0   r0   r1   rp  e  sB   
 	
rp  z
    Roberta Model with a language modeling head on top (a linear layer on top of the hidden-states output) e.g for
    autoregressive tasks.
    c                   @   s.   e Zd ZeZddeej fddZdd Z	dS )FlaxRobertaForCausalLMNrU   c           	      C   s   |j \}}| ||}tj||fdd}|d ur(|jddd }t||d}nttj|ddd d d f ||f}|||dS )Nr!   rf   r#   r$   r   )r   r   )r5  rU   rT   )	r)   r   r*   rn   r+   r   r   r   r   )	rO   r,   r   rU   r   
seq_lengthr5  extended_attention_maskrT   r0   r0   r1   prepare_inputs_for_generation  s   
&z4FlaxRobertaForCausalLM.prepare_inputs_for_generationc                 C   s.   |j |d< |d d d dd f d |d< |S )Nr5  rT   r#   r   )r5  )rO   model_outputsmodel_kwargsr0   r0   r1   update_inputs_for_generation  s   
 z3FlaxRobertaForCausalLM.update_inputs_for_generationr   )
r[   r\   r]   rp  r  r   r@   Arrayrt  rw  r0   r0   r0   r1   rq    s    rq  )rq  r[  rg  ro  ra  ri  rI  r  )Utypingr   r   
flax.linenlinenr<   r@   	jax.numpynumpyr*   r  flax.core.frozen_dictr   r   r   r   r   r	   nn_partitioningflax.linen.attentionr
   flax.traverse_utilr   r   r   modeling_flax_outputsr   r   r   r   r   r   r   r   r   modeling_flax_utilsr   r   r   r   utilsr   r   r   configuration_robertar    
get_loggerr[   logger_CHECKPOINT_FOR_DOC_CONFIG_FOR_DOCr   r2   ROBERTA_START_DOCSTRINGr=  r;  r3   rb   r   r   r   r   r   r   r   r   r   r  r  rA  rI  rK  r[  r\  ra  rc  rg  r>  rh  ri  rj  ro  rp  rq  __all__r0   r0   r0   r1   <module>   s   ,
', -+:S(#  BG:	4	=
	;	9A