o
    i                     @   s  d dl Z d dlmZmZ d dlmZ d dlZd dlm	Z
 d dl	Zd dlmZmZmZ d dlmZmZ d dlmZ ddlmZmZmZmZmZmZ ddlmZmZmZmZ dd	l m!Z!m"Z"m#Z# d
dl$m%Z% e#&e'Z(dZ)dZ*dZ+dZ,dd Z-dd Z.G dd dej/Z0G dd dej/Z1G dd dej/Z2G dd dej/Z3G dd dej/Z4G dd dej/Z5G d d! d!ej/Z6G d"d# d#eZ7G d$d% d%ej/Z8e!d&e+G d'd( d(e7Z9ee9e)de* G d)d* d*ej/Z:e!d+e+G d,d- d-e7Z;ee;e)ee* G d.d/ d/ej/Z<e!d0e+G d1d2 d2e7Z=ee=e)ee* G d3d4 d4ej/Z>e!d5e+G d6d7 d7e7Z?ee?e,@d8 ee?e)ee* G d9d: d:ej/ZAe!d;e+G d<d= d=e7ZBeeBe)ee* G d>d? d?ej/ZCe!d@e+G dAdB dBe7ZDeeDe)ee* g dCZEdS )D    N)CallableOptional)
FrozenDictfreezeunfreeze)flatten_dictunflatten_dict)lax   )FlaxBaseModelOutputFlaxMaskedLMOutputFlaxMultipleChoiceModelOutput FlaxQuestionAnsweringModelOutputFlaxSequenceClassifierOutputFlaxTokenClassifierOutput)ACT2FNFlaxPreTrainedModelappend_call_sample_docstringoverwrite_call_docstring)add_start_docstrings%add_start_docstrings_to_model_forwardlogging   )DistilBertConfigzdistilbert-base-uncasedr   a  

    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading, saving and converting weights from PyTorch models)

    This model is also a
    [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
    a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
    behavior.

    Finally, this model supports inherent JAX features such as:

    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)

    Parameters:
        config ([`DistilBertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`numpy.ndarray` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
c                 C   s*   dt dd|d  t |  }| | S )Nr   i'     )nppowerfloat32)posid_modelangle_rates r"   k/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/distilbert/modeling_flax_distilbert.py
get_angles`   s   "r$   c                 C   s   t t| d d tjf t|tjd d f |}t|d d dd df |d d dd df< t|d d dd df |d d dd df< |tjdf }t|S )Nr   r   r   .)r$   r   arangenewaxissincosjnparray)positionr    
angle_radspos_encodingr"   r"   r#   positional_encodinge   s
   4..
r.   c                   @   sB   e Zd ZU dZeed< ejZejed< dd Z	dde
fdd	Zd
S )FlaxEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.configdtypec                 C   s   t j| jj| jjtj jj| jjdd| _	| jj
s/t j| jj| jjtj jj| jjdd| _n
t| jj| jj| _t jd| jd| _t j| jjd| _d S )Nstddev)embedding_init-q=epsilonr1   rate)nnEmbedr0   
vocab_sizedimjaxinitializersnormalinitializer_rangeword_embeddingssinusoidal_pos_embdsmax_position_embeddingsposition_embeddingsr.   r-   	LayerNormr1   Dropoutdropoutselfr"   r"   r#   setupz   s   
zFlaxEmbeddings.setupTdeterministicc           	      C   s   |j \}}| |d}| jjs+t|d}tj|||fd}| |d}n| j	d d d |d d f }||j
}|| }| |}| j||d}|S )Ni4)shaperL   )rN   rB   astyper0   rC   r)   r%   broadcast_torE   r-   r1   rF   rH   )	rJ   	input_idsrL   
batch_size
seq_lengthinputs_embedsposition_idsposition_embedshidden_statesr"   r"   r#   __call__   s   

zFlaxEmbeddings.__call__NT)__name__
__module____qualname____doc__r   __annotations__r)   r   r1   rK   boolrY   r"   r"   r"   r#   r/   t   s   
 r/   c                   @   F   e Zd ZU eed< ejZejed< dd Z		dde	de	fd	d
Z
dS )FlaxMultiHeadSelfAttentionr0   r1   c                 C   s   | j j| _| j j| _tj| j jd| _| j| j dks'td| j d| j tj| j| j	t
jjj| j jdd| _tj| j| j	t
jjj| j jdd| _tj| j| j	t
jjj| j jdd| _tj| j| j	t
jjj| j jdd| _d S )Nr8   r   Hidden size " not dividable by number of heads r2   r1   kernel_init)r0   n_headsr=   r:   rG   attention_dropoutrH   
ValueErrorDenser1   r>   r?   r@   rA   q_link_linv_linout_linrI   r"   r"   r#   rK      s2   

z FlaxMultiHeadSelfAttention.setupTFrL   output_attentionsc              	      s  |j \ }}|j d }	jj  dd|	f}
 fdd} fdd}||}||}||}|t }t	||
dddd}t||
}||j}|d	d
|   }tj|dd}j||d}t	||}||}|}|r||fS |fS )Nr   c                    s   |   djddddS )zseparate headsr   r   r   r
   )reshaperg   	transposexbsdim_per_headrJ   r"   r#   rN      s   z2FlaxMultiHeadSelfAttention.__call__.<locals>.shapec                    s    |  dddd dj S )zgroup headsr   r   r   r
   rp   )rr   rq   rg   rs   ru   r"   r#   unshape   s    z4FlaxMultiHeadSelfAttention.__call__.<locals>.unshaper   r
   r   gꌠ9Y>)Fg      ?rp   axisrO   )rN   r=   rg   rk   rl   rm   mathsqrtr)   matmulrr   rq   rP   r1   r:   softmaxrH   rn   )rJ   querykeyvaluemaskrL   ro   q_lenr=   k_len
mask_reshprN   rx   qkvscoresweightscontextr"   ru   r#   rY      s,   	

z#FlaxMultiHeadSelfAttention.__call__N)TFr[   r\   r]   r   r_   r)   r   r1   rK   r`   rY   r"   r"   r"   r#   rb      s   
 #rb   c                   @   s>   e Zd ZU eed< ejZejed< dd Zd
de	fddZ
d	S )FlaxFFNr0   r1   c                 C   s   t j| jjd| _| jj| _d| _t j| jj| jt	j j
j| jjdd| _t j| jj| jt	j j
j| jjdd| _t| jj | _d S )Nr8   r   r2   re   )r:   rG   r0   rH   chunk_size_feed_forwardseq_len_dimrj   
hidden_dimr1   r>   r?   r@   rA   lin1r=   lin2r   
activationrI   r"   r"   r#   rK      s   
zFlaxFFN.setupTrL   c                 C   s0   |  |}| |}| |}| j||d}|S )NrO   )r   r   r   rH   )rJ   rX   rL   r"   r"   r#   rY   	  s
   


zFlaxFFN.__call__NrZ   r   r"   r"   r"   r#   r      s
   
 r   c                   @   ra   )FlaxTransformerBlockr0   r1   c                 C   s|   | j j| j j dksJ d| j j d| j j t| j | jd| _tjd| jd| _t	| j | jd| _
tjd| jd| _d S )Nr   rc   rd   r1   r5   r6   )r0   r=   rg   rb   r1   	attentionr:   rF   sa_layer_normr   ffnoutput_layer_normrI   r"   r"   r#   rK     s   zFlaxTransformerBlock.setupFTro   rL   c           	      C   s~   | j ||||||d}|r|\}}nt|tu sJ |d }| || }| j||d}| || }|f}|r=|f| }|S )N)r   r   r   r   ro   rL   r   rO   )r   typetupler   r   r   )	rJ   rX   	attn_maskro   rL   	sa_output
sa_weights
ffn_outputoutputr"   r"   r#   rY      s&   

zFlaxTransformerBlock.__call__N)FTr   r"   r"   r"   r#   r     s   
 r   c                	   @   R   e Zd ZU eed< ejZejed< dd Z				dde	de	d	e	d
e	fddZ
dS )FlaxTransformerr0   r1   c                    s     fddt  jjD  _d S )Nc                    s"   g | ]}t  jt| jd qS ))namer1   )r   r0   strr1   ).0r   rI   r"   r#   
<listcomp>E  s    z)FlaxTransformer.setup.<locals>.<listcomp>)ranger0   n_layerslayersrI   r"   rI   r#   rK   D  s   

zFlaxTransformer.setupFTro   output_hidden_statesrL   return_dictc                 C   s   |rdnd }|r
dnd }| j D ]1}	|r||f }|	||||d}
|
d }|r8t|
dks.J |
d }||f }qt|
dks@J q|rH||f }|sVtdd |||fD S t|||d	S )
Nr"   )rX   r   ro   rL   rp   r   r   r   c                 s   s    | ]	}|d ur|V  qd S Nr"   )r   r   r"   r"   r#   	<genexpr>m  s    z+FlaxTransformer.__call__.<locals>.<genexpr>)last_hidden_staterX   
attentions)r   lenr   r   )rJ   rX   attention_maskro   r   rL   r   all_hidden_statesall_attentionslayer_modulelayer_outputsr   r"   r"   r#   rY   I  s0   	


zFlaxTransformer.__call__NFFTFr   r"   r"   r"   r#   r   @  "   
 	r   c                	   @   r   )FlaxTransformerEncoderr0   r1   c                 C   s   t | j| jd| _d S Nr   )r   r0   r1   layerrI   r"   r"   r#   rK   w  s   zFlaxTransformerEncoder.setupFTro   r   rL   r   c                 C   s   | j ||||||dS )N)rX   r   ro   r   rL   r   )r   )rJ   rX   r   ro   r   rL   r   r"   r"   r#   rY   z  s   	zFlaxTransformerEncoder.__call__Nr   r   r"   r"   r"   r#   r   s  s"   
 r   c                   @   sR   e Zd ZU eed< ejZejed< ej	j
jZedejf ed< dd Zdd Zd	S )
FlaxDistilBertLMDecoderr0   r1   .	bias_initc                 C   s   |  d| j| jjf| _d S )Nbias)paramr   r0   r<   r   rI   r"   r"   r#   rK     s   zFlaxDistilBertLMDecoder.setupc                 C   sV   t || j}t || j}t|||jd fdfdf}t | j| j}|| }|S )Nr   )r   )r"   r"   )r)   asarrayr1   r	   dot_generalndimr   )rJ   inputskernelyr   r"   r"   r#   rY     s   z FlaxDistilBertLMDecoder.__call__N)r[   r\   r]   r   r_   r)   r   r1   r>   r:   r?   zerosr   r   r   ndarrayrK   rY   r"   r"   r"   r#   r     s   
 r   c                       s   e Zd ZU dZeZdZdZej	e
d< ddejdfded	ed
edejdef
 fddZddejjd	ededefddZeed								ddee dejjdedee dee dee fddZ  ZS )FlaxDistilBertPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    
distilbertNmodule_class)r   r   r   Tr0   input_shapeseedr1   _do_initc                    s2   | j d||d|}t j||||||d d S )Nr0   r1   )r   r   r1   r   r"   )r   super__init__)rJ   r0   r   r   r1   r   kwargsmodule	__class__r"   r#   r     s   	z&FlaxDistilBertPreTrainedModel.__init__rngparamsreturnc                 C   s   t j|dd}t |}tj|\}}||d}| jj|||ddd }	|d urKtt	|	}	tt	|}| j
D ]}
|	|
 ||
< q8t | _
tt|S |	S )NrM   r   )r   rH   F)r   r   )r)   r   	ones_liker>   randomsplitr   initr   r   _missing_keyssetr   r   )rJ   r   r   r   rR   r   
params_rngdropout_rngrngsrandom_paramsmissing_keyr"   r"   r#   init_weights  s   


z*FlaxDistilBertPreTrainedModel.init_weightszbatch_size, sequence_lengthFr   trainro   r   r   c
              
   C   s   |d ur|n| j j}|d ur|n| j j}|	d ur|	n| j j}	|d u r't|}i }
|d ur1||
d< | jjd|p9| jitj	|ddtj	|dd| |||	|
dS )NrH   r   rM   r   )r   )
r0   ro   r   r   r)   r   r   applyr   r*   )rJ   rR   r   	head_maskr   r   r   ro   r   r   r   r"   r"   r#   rY     s&   
z&FlaxDistilBertPreTrainedModel.__call__r   )NNNNFNNN)r[   r\   r]   r^   r   config_classbase_model_prefixr   r:   Moduler_   r)   r   r   intr1   r`   r   r>   r   PRNGKeyr   r   r   DISTILBERT_INPUTS_DOCSTRINGformatr   dictrY   __classcell__r"   r"   r   r#   r     sV   
  	
r   c                	   @   R   e Zd ZU eed< ejZejed< dd Z				dde	de	d	e	d
e	fddZ
dS )FlaxDistilBertModuler0   r1   c                 C   s(   t | j| jd| _t| j| jd| _d S r   )r/   r0   r1   
embeddingsr   transformerrI   r"   r"   r#   rK     s   zFlaxDistilBertModule.setupTFrL   ro   r   r   c                 C   s`   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}| j||d}| j||||||dS )NrO   )rX   r   rL   ro   r   r   )r0   ro   r   r   r   r   )rJ   rR   r   rL   ro   r   r   input_embedsr"   r"   r#   rY     s   	zFlaxDistilBertModule.__call__NTFFTr   r"   r"   r"   r#   r     s"   
 r   zdThe bare DistilBert Model transformer outputting raw hidden-states without any specific head on top.c                   @      e Zd ZeZdS )FlaxDistilBertModelN)r[   r\   r]   r   r   r"   r"   r"   r#   r     s    r   c                	   @   r   )FlaxDistilBertForMaskedLMModuler0   r1   c                 C   s   t | j| jd| _tj| jj| jtjjj	| jj
dd| _tjd| jd| _| jjr5t| j| jd| _d S tj| jj| jtjjj	| jj
dd| _d S )Nr   r2   re   r5   r6   )r   r0   r1   r   r:   rj   r=   r>   r?   r@   rA   vocab_transformrF   vocab_layer_normtie_word_embeddingsr   vocab_projectorr<   rI   r"   r"   r#   rK      s"   z%FlaxDistilBertForMaskedLMModule.setupTFrL   ro   r   r   c                 C   s   |d ur|n| j j}| j||||||d}|d }| |}	t| j j |	}	| |	}	| j jrC| jjd d d d }
| 	|	|
j
}	n| 	|	}	|sU|	f|dd   }|S t|	|j|jdS )	N)rR   r   ro   r   rL   r   r   r   r   rB   	embeddingr   logitsrX   r   )r0   use_return_dictr   r   r   r   r   r   	variablesr   Tr   rX   r   )rJ   rR   r   rL   ro   r   r   dlbrt_outputrX   prediction_logitsshared_embeddingr   r"   r"   r#   rY   4  s2   	


z(FlaxDistilBertForMaskedLMModule.__call__Nr   r   r"   r"   r"   r#   r     s"   
 r   z8DistilBert Model with a `language modeling` head on top.c                   @   r   )FlaxDistilBertForMaskedLMN)r[   r\   r]   r   r   r"   r"   r"   r#   r  ]  s    r  c                	   @   r   )-FlaxDistilBertForSequenceClassificationModuler0   r1   c                 C   sf   t | j| jd| _tj| jj| jtjjj	| jj
dd| _tj| jjd| _tj| jj| jd| _d S )Nr   r2   re   r8   r   )r   r0   r1   r   r:   rj   r=   r>   r?   r@   rA   pre_classifierrG   seq_classif_dropoutrH   
num_labels
classifierrI   r"   r"   r#   rK   i  s   z3FlaxDistilBertForSequenceClassificationModule.setupTFrL   ro   r   r   c                 C   s   |d ur|n| j j}| j||||||d}|d }|d d df }	| |	}	td |	}	| j|	|d}	| |	}
|sC|
f|dd   S t|
|j|j	dS )NrL   ro   r   r   r   relurO   r   r   )
r0   r   r   r  r   rH   r
  r   rX   r   )rJ   rR   r   rL   ro   r   r   distilbert_outputhidden_statepooled_outputr   r"   r"   r#   rY   v  s,   	

z6FlaxDistilBertForSequenceClassificationModule.__call__Nr   r   r"   r"   r"   r#   r  e  "   
 r  z
    DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                   @   r   )'FlaxDistilBertForSequenceClassificationN)r[   r\   r]   r  r   r"   r"   r"   r#   r        r  c                	   @   r   )%FlaxDistilBertForMultipleChoiceModuler0   r1   c                 C   sb   t | j| jd| _tj| jj| jtjjj	| jj
dd| _tj| jjd| _tjd| jd| _d S )Nr   r2   re   r8   r   r   )r   r0   r1   r   r:   rj   r=   r>   r?   r@   rA   r  rG   r  rH   r
  rI   r"   r"   r#   rK     s   z+FlaxDistilBertForMultipleChoiceModule.setupTFrL   ro   r   r   c                 C   s   |d ur|n| j j}|jd }|d ur|d|jd nd }|d ur+|d|jd nd }| j||||||d}|d }	|	d d df }
| |
}
td |
}
| j|
|d}
| |
}|d|}|sl|f|dd   S t	||j
|jdS )	Nr   rp   r  r   r  rO   r   r   )r0   r   rN   rq   r   r  r   rH   r
  r   rX   r   )rJ   rR   r   rL   ro   r   r   num_choicesoutputsr  r  r   reshaped_logitsr"   r"   r#   rY     s4   	
	

z.FlaxDistilBertForMultipleChoiceModule.__call__Nr   r   r"   r"   r"   r#   r    r  r  z
    DistilBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and
    a softmax) e.g. for RocStories/SWAG tasks.
    c                   @   r   )FlaxDistilBertForMultipleChoiceN)r[   r\   r]   r  r   r"   r"   r"   r#   r    r  r  z(batch_size, num_choices, sequence_lengthc                	   @   r   )*FlaxDistilBertForTokenClassificationModuler0   r1   c                 C   s>   t | j| jd| _tj| jjd| _tj| jj| jd| _	d S )Nr   r8   r   )
r   r0   r1   r   r:   rG   rH   rj   r	  r
  rI   r"   r"   r#   rK     s   z0FlaxDistilBertForTokenClassificationModule.setupTFrL   ro   r   r   c           
      C   sr   |d ur|n| j j}| j||||||d}|d }| j||d}| |}	|s0|	f|dd   S t|	|j|jdS )Nr  r   rO   r   r   )r0   r   r   rH   r
  r   rX   r   )
rJ   rR   r   rL   ro   r   r   r  rX   r   r"   r"   r#   rY     s&   		
z3FlaxDistilBertForTokenClassificationModule.__call__Nr   r   r"   r"   r"   r#   r    r   r  z
    DistilBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
    for Named-Entity-Recognition (NER) tasks.
    c                   @   r   )$FlaxDistilBertForTokenClassificationN)r[   r\   r]   r  r   r"   r"   r"   r#   r  *  r  r  c                	   @   r   )(FlaxDistilBertForQuestionAnsweringModuler0   r1   c                 C   sN   t | j| jd| _tj| jj| jd| _| jjdksJ tj| jj	d| _
d S )Nr   r   r   r8   )r   r0   r1   r   r:   rj   r	  
qa_outputsrG   
qa_dropoutrH   rI   r"   r"   r#   rK   A  s   z.FlaxDistilBertForQuestionAnsweringModule.setupTFrL   ro   r   r   c                 C   s   |d ur|n| j j}| j||||||d}|d }| j||d}| |}	tj|	| j jdd\}
}|
d}
|d}|sG|
|f|dd   S t	|
||j
|jdS )Nr  r   rO   rp   ry   r   )start_logits
end_logitsrX   r   )r0   r   r   rH   r  r)   r   r	  squeezer   rX   r   )rJ   rR   r   rL   ro   r   r   r  rX   r   r  r  r"   r"   r#   rY   G  s.   		


z1FlaxDistilBertForQuestionAnsweringModule.__call__Nr   r   r"   r"   r"   r#   r  =  s"   
 
r  z
    DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
    linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                   @   r   )"FlaxDistilBertForQuestionAnsweringN)r[   r\   r]   r  r   r"   r"   r"   r#   r   o  r  r   )r  r  r   r  r  r   r   )Fr{   typingr   r   
flax.linenlinenr:   r>   	jax.numpynumpyr)   r   flax.core.frozen_dictr   r   r   flax.traverse_utilr   r   r	   modeling_flax_outputsr   r   r   r   r   r   modeling_flax_utilsr   r   r   r   utilsr   r   r   configuration_distilbertr   
get_loggerr[   logger_CHECKPOINT_FOR_DOC_CONFIG_FOR_DOCFLAX_DISTILBERT_START_DOCSTRINGr   r$   r.   r   r/   rb   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r   r  r  r  r   __all__r"   r"   r"   r#   <module>   s    
-S/3Q"A5<
+2