o
    ic                     @   s  d dl mZ d dlmZ d dlZd dlmZ d dl	m
Z
mZmZ d dlmZ d dlmZmZ ddlmZmZmZ ddlmZmZmZmZ dd	lmZmZ d
dlmZ dZ dZ!G dd dej"Z#G dd dej"Z$G dd dej"Z%G dd dej"Z&G dd dej"Z'G dd dej"Z(G dd dej"Z)G dd dej"Z*G dd dej"Z+G d d! d!ej"Z,G d"d# d#ej"Z-G d$d% d%eZ.G d&d' d'ej"Z/ed(e G d)d* d*e.Z0d+Z1ee0e1 ee0eed, G d-d. d.ej"Z2ed/e G d0d1 d1e.Z3d2Z4ee3e4 ee3eed, g d3Z5dS )4    )OptionalN)
FrozenDictfreezeunfreeze)dot_product_attention_weights)flatten_dictunflatten_dict   )FlaxBaseModelOutputFlaxBaseModelOutputWithPoolingFlaxSequenceClassifierOutput)ACT2FNFlaxPreTrainedModel append_replace_return_docstringsoverwrite_call_docstring)add_start_docstrings%add_start_docstrings_to_model_forward   )	ViTConfiga  

    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading, saving and converting weights from PyTorch models)

    This model is also a
    [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
    a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
    behavior.

    Finally, this model supports inherent JAX features such as:

    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)

    Parameters:
        config ([`ViTConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
            `jax.numpy.bfloat16` (on TPUs).

            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
            specified all the computation will be performed with the given `dtype`.

            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
            parameters.**

            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
            [`~FlaxPreTrainedModel.to_bf16`].
a  
    Args:
        pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`]
            for details.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
c                   @   6   e Zd ZU eed< ejZejed< dd Zdd Z	dS )FlaxViTPatchEmbeddingsconfigdtypec                 C   sp   | j j}| j j}|| ||  }|| _| j j| _tj| j j||f||fd| jt	jj
| j jd ddd| _d S )NVALID   fan_intruncated_normal)kernel_sizestridespaddingr   kernel_init)r   
image_size
patch_sizenum_patchesnum_channelsnnConvhidden_sizer   jaxinitializersvariance_scalinginitializer_range
projection)selfr!   r"   r#    r.   ]/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/vit/modeling_flax_vit.pysetup\   s   
zFlaxViTPatchEmbeddings.setupc                 C   sF   |j d }|| jkrtd| |}|j \}}}}t||d|fS )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.)shaper$   
ValueErrorr,   jnpreshape)r-   pixel_valuesr$   
embeddings
batch_size_channelsr.   r.   r/   __call__m   s   


zFlaxViTPatchEmbeddings.__call__N
__name__
__module____qualname__r   __annotations__r4   float32r   r0   r;   r.   r.   r.   r/   r   X   s
   
 r   c                   @   s<   e Zd ZU dZeed< ejZejed< dd Z	d
ddZ
d	S )FlaxViTEmbeddingsz7Construct the CLS token, position and patch embeddings.r   r   c                 C   s   |  dtjj| jjd dddd| jjf| _t	| j| j
d| _| jj}|  dtjj| jjd ddd|d | jjf| _tj| jjd| _d S )	N	cls_tokenr   r   r   r   r   position_embeddingsrate)paramr(   r%   r)   r*   r   r+   r'   rC   r   r   patch_embeddingsr#   rE   Dropouthidden_dropout_probdropout)r-   r#   r.   r.   r/   r0   ~   s   zFlaxViTEmbeddings.setupTc                 C   sZ   |j d }| |}t| j|d| jjf}tj||fdd}|| j }| j	||d}|S )Nr   r   )axisdeterministic)
r2   rI   r4   broadcast_torC   r   r'   concatenaterE   rL   )r-   r6   rO   r8   r7   
cls_tokensr.   r.   r/   r;      s   


zFlaxViTEmbeddings.__call__NT)r=   r>   r?   __doc__r   r@   r4   rA   r   r0   r;   r.   r.   r.   r/   rB   x   s   
 rB   c                   @   B   e Zd ZU eed< ejZejed< dd Zdde	de	fd	d
Z
dS )FlaxViTSelfAttentionr   r   c                 C   s   | j j| j j dkrtdtj| j j| jtjjj	| j j
d ddd| j jd| _tj| j j| jtjjj	| j j
d ddd| j jd| _tj| j j| jtjjj	| j j
d ddd| j jd| _d S )Nr   z`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads`: {self.config.num_attention_heads}r   r   r   )modedistribution)r   r    use_bias)r   r'   num_attention_headsr3   r%   Denser   r(   r)   r*   r+   qkv_biasquerykeyvaluer-   r.   r.   r/   r0      s8   zFlaxViTSelfAttention.setupTFrO   output_attentionsc              
   C   s   | j j| j j }| ||jd d | j j|f }| ||jd d | j j|f }| ||jd d | j j|f }d }|sP| j jdkrP| 	d}t
|||| j jd|| jd d}	td|	|}
|
|
jd d d }
|rz|
|	f}|S |
f}|S )Nr   g        rL   T)dropout_rngdropout_ratebroadcast_dropoutrO   r   	precisionz...hqk,...khd->...qhd)r1   )r   r'   rZ   r]   r5   r2   r_   r^   attention_probs_dropout_probmake_rngr   r   r4   einsum)r-   hidden_statesrO   ra   head_dimquery_statesvalue_states
key_statesrb   attn_weightsattn_outputoutputsr.   r.   r/   r;      s:   



zFlaxViTSelfAttention.__call__NTFr=   r>   r?   r   r@   r4   rA   r   r0   boolr;   r.   r.   r.   r/   rV      s
   
  rV   c                   @   >   e Zd ZU eed< ejZejed< dd Zd
de	fddZ
d	S )FlaxViTSelfOutputr   r   c                 C   D   t j| jjtj j| jjd dd| jd| _	t j
| jjd| _d S Nr   r   r   r    r   rF   r%   r[   r   r'   r(   r)   r*   r+   r   denserJ   rK   rL   r`   r.   r.   r/   r0         zFlaxViTSelfOutput.setupTrO   c                 C   s   |  |}| j||d}|S NrN   rz   rL   )r-   ri   input_tensorrO   r.   r.   r/   r;      s   
zFlaxViTSelfOutput.__call__NrS   rr   r.   r.   r.   r/   ru      
   
 
ru   c                   @   s>   e Zd ZU eed< ejZejed< dd Zdde	fdd	Z
d
S )FlaxViTAttentionr   r   c                 C   s(   t | j| jd| _t| j| jd| _d S NrD   )rV   r   r   	attentionru   outputr`   r.   r.   r/   r0      s   zFlaxViTAttention.setupTFra   c                 C   sD   | j |||d}|d }| j|||d}|f}|r ||d f7 }|S NrO   ra   r   rN   r   )r   r   )r-   ri   rO   ra   attn_outputsro   rp   r.   r.   r/   r;      s   zFlaxViTAttention.__call__Nrq   rr   r.   r.   r.   r/   r      s
   
 r   c                   @   r   )FlaxViTIntermediater   r   c                 C   @   t j| jjtj j| jjd dd| jd| _	t
| jj | _d S Nr   r   r   rx   )r%   r[   r   intermediate_sizer(   r)   r*   r+   r   rz   r   
hidden_act
activationr`   r.   r.   r/   r0        zFlaxViTIntermediate.setupc                 C   s   |  |}| |}|S Nrz   r   )r-   ri   r.   r.   r/   r;     s   

zFlaxViTIntermediate.__call__Nr<   r.   r.   r.   r/   r   	  
   
 
r   c                   @   rt   )FlaxViTOutputr   r   c                 C   rv   rw   ry   r`   r.   r.   r/   r0   !  r{   zFlaxViTOutput.setupTrO   c                 C   s$   |  |}| j||d}|| }|S r|   r}   )r-   ri   attention_outputrO   r.   r.   r/   r;   +  s   
zFlaxViTOutput.__call__NrS   rr   r.   r.   r.   r/   r     r   r   c                   @   rU   )FlaxViTLayerr   r   c                 C   sf   t | j| jd| _t| j| jd| _t| j| jd| _tj	| jj
| jd| _tj	| jj
| jd| _d S NrD   )epsilonr   )r   r   r   r   r   intermediater   r   r%   	LayerNormlayer_norm_epslayernorm_beforelayernorm_afterr`   r.   r.   r/   r0   6  s
   zFlaxViTLayer.setupTFrO   ra   c                 C   sf   | j | |||d}|d }|| }| |}| |}| j|||d}|f}|r1||d f7 }|S r   )r   r   r   r   r   )r-   ri   rO   ra   attention_outputsr   layer_outputrp   r.   r.   r/   r;   =  s   

zFlaxViTLayer.__call__Nrq   rr   r.   r.   r.   r/   r   2  s
   
 r   c                	   @   R   e Zd ZU eed< ejZejed< dd Z				dde	de	d	e	d
e	fddZ
dS )FlaxViTLayerCollectionr   r   c                    s     fddt  jjD  _d S )Nc                    s"   g | ]}t  jt| jd qS ))namer   )r   r   strr   ).0ir`   r.   r/   
<listcomp>[  s    z0FlaxViTLayerCollection.setup.<locals>.<listcomp>)ranger   num_hidden_layerslayersr`   r.   r`   r/   r0   Z  s   

zFlaxViTLayerCollection.setupTFrO   ra   output_hidden_statesreturn_dictc                 C   s   |rdnd }|r
dnd }t | jD ]\}}	|r||f7 }|	|||d}
|
d }|r0||
d f7 }q|r8||f7 }|f}|sFtdd |D S t|||dS )Nr.   r   r   r   c                 s   s    | ]	}|d ur|V  qd S r   r.   )r   vr.   r.   r/   	<genexpr>z  s    z2FlaxViTLayerCollection.__call__.<locals>.<genexpr>)last_hidden_stateri   
attentions)	enumerater   tupler
   )r-   ri   rO   ra   r   r   all_attentionsall_hidden_statesr   layerlayer_outputsrp   r.   r.   r/   r;   _  s$   

zFlaxViTLayerCollection.__call__NTFFTrr   r.   r.   r.   r/   r   V  s"   
 r   c                	   @   r   )FlaxViTEncoderr   r   c                 C   s   t | j| jd| _d S r   )r   r   r   r   r`   r.   r.   r/   r0     s   zFlaxViTEncoder.setupTFrO   ra   r   r   c                 C   s   | j |||||dS )NrO   ra   r   r   )r   )r-   ri   rO   ra   r   r   r.   r.   r/   r;     s   zFlaxViTEncoder.__call__Nr   rr   r.   r.   r.   r/   r     s"   
 r   c                   @   r   )FlaxViTPoolerr   r   c                 C   r   r   )r%   r[   r   pooler_output_sizer(   r)   r*   r+   r   rz   r   
pooler_actr   r`   r.   r.   r/   r0     r   zFlaxViTPooler.setupc                 C   s$   |d d df }|  |}| |S )Nr   r   )r-   ri   cls_hidden_stater.   r.   r/   r;     s   

zFlaxViTPooler.__call__Nr<   r.   r.   r.   r/   r     r   r   c                       s   e Zd ZU dZeZdZdZdZe	j
ed< ddejdfded	ed
ejdef fddZddejjdededefddZeed						ddee dejjdedee dee dee fddZ  ZS )FlaxViTPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    vitr6   Nmodule_classr   Tr   seedr   _do_initc                    sL   | j d||d|}|d u rd|j|j|jf}t j||||||d d S )N)r   r   r   )input_shaper   r   r   r.   )r   r!   r$   super__init__)r-   r   r   r   r   r   kwargsmodule	__class__r.   r/   r     s   	zFlaxViTPreTrainedModel.__init__rngr   paramsreturnc           
      C   s   t j|| jd}tj|\}}||d}| jj||ddd }|d urFtt	|}tt	|}| j
D ]}	||	 ||	< q3t | _
tt|S |S )NrD   )r   rL   F)r   r   )r4   zerosr   r(   randomsplitr   initr   r   _missing_keyssetr   r   )
r-   r   r   r   r6   
params_rngrb   rngsrandom_paramsmissing_keyr.   r.   r/   init_weights  s   

z#FlaxViTPreTrainedModel.init_weightszbatch_size, sequence_lengthFrb   trainra   r   r   c           	   	   C   s   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}t|d}i }|d ur.||d< | jjd|p6| jitj	|tj
d| ||||dS )N)r   r   r	   r   rL   r   rD   )r   )r   ra   r   r   r4   	transposer   applyr   arrayrA   )	r-   r6   r   rb   r   ra   r   r   r   r.   r.   r/   r;     s"   zFlaxViTPreTrainedModel.__call__r   )NNFNNN)r=   r>   r?   rT   r   config_classbase_model_prefixmain_input_namer   r%   Moduler@   r4   rA   intr   rs   r   r(   r   PRNGKeyr   r   r   r   VIT_INPUTS_DOCSTRINGformatr   dictr;   __classcell__r.   r.   r   r/   r     sP   
  r   c                	   @   s^   e Zd ZU eed< ejZejed< dZe	ed< dd Z
				dde	d	e	d
e	de	fddZdS )FlaxViTModuler   r   Tadd_pooling_layerc                 C   s`   t | j| jd| _t| j| jd| _tj| jj| jd| _	| j
r+t| j| jd| _d S d | _d S r   )rB   r   r   r7   r   encoderr%   r   r   	layernormr   r   poolerr`   r.   r.   r/   r0      s   &zFlaxViTModule.setupFrO   ra   r   r   c           	      C   s   | j ||d}| j|||||d}|d }| |}| jr"| |nd }|s=|d u r3|f|dd   S ||f|dd   S t|||j|jdS )NrN   r   r   r   )r   pooler_outputri   r   )r7   r   r   r   r   r   ri   r   )	r-   r6   rO   ra   r   r   ri   rp   pooledr.   r.   r/   r;     s*   
zFlaxViTModule.__call__Nr   )r=   r>   r?   r   r@   r4   rA   r   r   rs   r0   r;   r.   r.   r.   r/   r     s$   
 	r   z]The bare ViT Model transformer outputting raw hidden-states without any specific head on top.c                   @      e Zd ZeZdS )FlaxViTModelN)r=   r>   r?   r   r   r.   r.   r.   r/   r   )  s    r   a  
    Returns:

    Examples:

    ```python
    >>> from transformers import AutoImageProcessor, FlaxViTModel
    >>> from PIL import Image
    >>> import requests

    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    >>> image = Image.open(requests.get(url, stream=True).raw)

    >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
    >>> model = FlaxViTModel.from_pretrained("google/vit-base-patch16-224-in21k")

    >>> inputs = image_processor(images=image, return_tensors="np")
    >>> outputs = model(**inputs)
    >>> last_hidden_states = outputs.last_hidden_state
    ```
)output_typer   c                   @   sH   e Zd ZU eed< ejZejed< dd Z					d
de	fdd	Z
dS )#FlaxViTForImageClassificationModuler   r   c                 C   sF   t | j| jdd| _tj| jj| jtjj	| jj
d ddd| _d S )NF)r   r   r   r   r   r   )r   r    )r   r   r   r   r%   r[   
num_labelsr(   r)   r*   r+   
classifierr`   r.   r.   r/   r0   O  s   z)FlaxViTForImageClassificationModule.setupNTrO   c           
      C   sx   |d ur|n| j j}| j|||||d}|d }| |d d dd d f }|s3|f|dd   }	|	S t||j|jdS )Nr   r   r   )logitsri   r   )r   use_return_dictr   r   r   ri   r   )
r-   r6   rO   ra   r   r   rp   ri   r   r   r.   r.   r/   r;   Y  s$   z,FlaxViTForImageClassificationModule.__call__)NTNNNrr   r.   r.   r.   r/   r   K  s   
 r   z
    ViT Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.
    c                   @   r   )FlaxViTForImageClassificationN)r=   r>   r?   r   r   r.   r.   r.   r/   r   y  s    r   ag  
    Returns:

    Example:

    ```python
    >>> from transformers import AutoImageProcessor, FlaxViTForImageClassification
    >>> from PIL import Image
    >>> import jax
    >>> import requests

    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    >>> image = Image.open(requests.get(url, stream=True).raw)

    >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
    >>> model = FlaxViTForImageClassification.from_pretrained("google/vit-base-patch16-224")

    >>> inputs = image_processor(images=image, return_tensors="np")
    >>> outputs = model(**inputs)
    >>> logits = outputs.logits

    >>> # model predicts one of the 1000 ImageNet classes
    >>> predicted_class_idx = jax.numpy.argmax(logits, axis=-1)
    >>> print("Predicted class:", model.config.id2label[predicted_class_idx.item()])
    ```
)r   r   r   )6typingr   
flax.linenlinenr%   r(   	jax.numpynumpyr4   flax.core.frozen_dictr   r   r   flax.linen.attentionr   flax.traverse_utilr   r   modeling_flax_outputsr
   r   r   modeling_flax_utilsr   r   r   r   utilsr   r   configuration_vitr   VIT_START_DOCSTRINGr   r   r   rB   rV   ru   r   r   r   r   r   r   r   r   r   r   FLAX_VISION_MODEL_DOCSTRINGr   r   FLAX_VISION_CLASSIF_DOCSTRING__all__r.   r.   r.   r/   <module>   s\   # !G$+N.
.
