o
    wiU                     @   s~  d dl mZmZmZ d dlZd dlZd dlZd dlmZ ddl	m
Z
mZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZmZmZmZm Z  ddl!m"Z" erlddl#m$Z$ ddiZ%dZ&e'e(Z)G dd deZ*G dd de"eZ+G dd dej,Z-G dd deZ.G dd de Z/G dd deZ0G d d! d!eZ1G d"d# d#eZ2g d$Z3dS )%    )TYPE_CHECKINGAnyOptionalN)nn   )CacheDynamicCache)PretrainedConfig)create_causal_mask)FlashAttentionKwargs)BaseModelOutputWithPast)Unpack)
AddedTokenPreTrainedTokenizer)logging   )LlamaForCausalLMLlamaForSequenceClassificationLlamaForTokenClassificationLlamaMLP
LlamaModel)LlamaTokenizer)	TextInput
vocab_fileztokenizer.modelu   ▁c                       s   e Zd ZdZdZdgZddddddddZdgdgfd	d
gd	gfd	gd	gfdZ																				d  fdd	Z  Z	S )!GemmaConfiga  
    This is the configuration class to store the configuration of a [`GemmaModel`]. It is used to instantiate an Gemma
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Gemma-7B.
    e.g. [google/gemma-7b](https://huggingface.co/google/gemma-7b)
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        vocab_size (`int`, *optional*, defaults to 256000):
            Vocabulary size of the Gemma model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`GemmaModel`]
        hidden_size (`int`, *optional*, defaults to 3072):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 24576):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 28):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 16):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 256):
            The attention head dimension.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The legacy activation function. It is overwritten by the `hidden_activation`.
        hidden_activation (`str` or `function`, *optional*):
            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
        max_position_embeddings (`int`, *optional*, defaults to 8192):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
    ```python
    >>> from transformers import GemmaModel, GemmaConfig
    >>> # Initializing a Gemma gemma-7b style configuration
    >>> configuration = GemmaConfig()
    >>> # Initializing a model from the gemma-7b style configuration
    >>> model = GemmaModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```gemmapast_key_valuescolwiserowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnorm      `           gelu_pytorch_tanhN    {Gz?ư>Tr      r        @F        c                    s   || _ |
| _|| _|| _|| _|| _|| _|| _|| _|	| _	|| _
|| _|| _|| _|| _|| _t jd||||d| d S )N)pad_token_idbos_token_ideos_token_idtie_word_embeddings )
vocab_sizemax_position_embeddingshidden_sizeintermediate_sizenum_hidden_layersnum_attention_headshead_dimnum_key_value_heads
hidden_acthidden_activationinitializer_rangerms_norm_eps	use_cache
rope_thetaattention_biasattention_dropoutsuper__init__)selfr8   r:   r;   r<   r=   r?   r>   r@   rA   r9   rB   rC   rD   r3   r5   r4   r6   rE   rF   rG   kwargs	__class__r7   d/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/gemma/modular_gemma.pyrI      s0   
zGemmaConfig.__init__)r&   r'   r(   r)   r*   r*   r+   r,   Nr-   r.   r/   Tr   r0   r   Tr1   Fr2   )
__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planrI   __classcell__r7   r7   rL   rN   r   4   sJ    C


r   c                	   @   s   e Zd ZdZ										dd	eeeef  fd
dZdd Z	dd Z
dddee fddZdd Z		ddee dededefddZdd ZdS ) GemmaTokenizera
  
    Construct a Gemma tokenizer. Based on byte-level Byte-Pair-Encoding. The default padding token is unset as there is
    no padding token in the original model.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<bos>"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<eos>"`):
            The end of sequence token.
        pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<pad>"`):
            A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
            attention mechanisms or loss computation.
        sp_model_kwargs (`dict[str, Any]`, `Optional`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

        add_bos_token (`bool`, *optional*, defaults to `True`):
            Whether or not to add an `bos_token` at the start of sequences.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether or not to add an `eos_token` at the end of sequences.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
            extra spaces.
        use_default_system_prompt (`bool`, *optional*, defaults to `False`):
            Whether or not the default system prompt for Gemma should be used.
        spaces_between_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not to add spaces between special tokens.
    <unk><bos><eos><pad>NTFsp_model_kwargsc                 K   s   |d u ri n|| _ t|trt|dddn|}t|tr#t|dddn|}t|tr1t|dddn|}t|tr?t|dddn|}|| _|| _|| _|
| _tj	di | j | _
| j
| tj| f||||||||	|
|d
| d S )NFT)
normalizedspecial)
	bos_token	eos_token	unk_token	pad_tokenadd_bos_tokenadd_eos_tokenr]   clean_up_tokenization_spacesuse_default_system_promptspaces_between_special_tokensr7   )r]   
isinstancestrr   r   rd   re   rg   spmSentencePieceProcessorsp_modelLoadr   rI   )rJ   r   rb   r`   ra   rc   r]   rd   re   rf   rg   rh   rK   r7   r7   rN   rI      s6   
zGemmaTokenizer.__init__c                 C      t dNzNot needed for GemmaAttributeErrorrJ   r7   r7   rN   get_spm_processor     z GemmaTokenizer.get_spm_processorc                 C   ro   rp   rq   rs   r7   r7   rN   unk_token_length  ru   zGemmaTokenizer.unk_token_lengthtextr   returnc                 K   s   t j| |fi |S )ze
        Args:
            text: TextInput
        Simply calls PreTrainedTokenizer's method
        )r   tokenizerJ   rw   rK   r7   r7   rN   ry     s   zGemmaTokenizer.tokenizec                 K   s   | j j|tdS )z
        Args:
            text: TextInput
        Returns a tokenized string. The Gemma tokenizer never adds a prefix space.
        )out_type)rm   encoderj   rz   r7   r7   rN   	_tokenize#  s   zGemmaTokenizer._tokenize	token_idsskip_special_tokensrh   c                 K   s   g }g }|D ]+}|r|| j v rq|| jv r,|r || j| || j| j g }q|| q|r=|| j| |rEd|}nd|}|tdS )N  )	all_special_ids_added_tokens_decoderappendrm   decodecontentjoinreplaceSPIECE_UNDERLINE)rJ   r~   r   rh   rK   	sub_textscurrent_sub_textidsr7   r7   rN   _decode+  s"   

zGemmaTokenizer._decodec                 C   sT   g }d}|D ]}|| j v r|| j|| 7 }g }q|| q|| j|7 }|S )z:Converts a sequence of tokens (string) in a single string.r   )_added_tokens_encoderrm   r   r   )rJ   tokenscurrent_sub_tokens
out_stringtokenr7   r7   rN   convert_tokens_to_stringH  s   
z'GemmaTokenizer.convert_tokens_to_string)
rY   rZ   r[   r\   NTFFFF)FF)rO   rP   rQ   rR   r   dictrj   r   rI   rt   rv   listry   r}   intboolr   r   r7   r7   r7   rN   rX      s>    1
+
rX   c                       s@   e Zd Zddedef fddZdd Zdd	 Zd
d Z  Z	S )GemmaRMSNormr/   dimepsc                    s&   t    || _tt|| _d S )N)rH   rI   r   r   	Parametertorchzerosweight)rJ   r   r   rL   r7   rN   rI   X  s   
zGemmaRMSNorm.__init__c                 C   s$   |t |djddd| j  S )Nr   T)keepdim)r   rsqrtpowmeanr   )rJ   xr7   r7   rN   _norm]  s   $zGemmaRMSNorm._normc                 C   s*   |  | }|d| j   }||S )Ng      ?)r   floatr   type_as)rJ   r   outputr7   r7   rN   forward`  s   
zGemmaRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tupler   shaper   rs   r7   r7   rN   
extra_reprg  s   zGemmaRMSNorm.extra_repr)r/   )
rO   rP   rQ   r   r   rI   r   r   r   rW   r7   r7   rL   rN   r   W  s
    r   c                          e Zd Z fddZ  ZS )GemmaMLPc                    sP   t    tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _d S )NF)bias)	rH   rI   r   Linearr:   r;   	gate_projup_proj	down_proj)rJ   configrL   r7   rN   rI   l  s   
zGemmaMLP.__init__)rO   rP   rQ   rI   rW   r7   r7   rL   rN   r   k      r   c                   @   s   e Zd Z									ddeej deej deej dee deej dee	 dee	 d	ee	 d
eej de
e defddZdS )
GemmaModelNr   r"   position_idsr   r    rD   output_attentionsoutput_hidden_statescache_positionrK   rx   c
                 K   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u |d uA r*td| jr9| jr9|r9td d}|d u rB| 	|}|rK|d u rKt
 }|	d u rg|d urW| nd}tj|||jd  |jd}	|d u rp|	d}t| j |||	||d}|}| ||}tj| j jd |jd	}|| }|rd
nd }|rd
nd }| jd | j j D ]&}|r||f7 }||f||||||	|d|
}|d }|r||d f7 }q| |}|r||f7 }t||r|nd ||dS )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   r0   )device)r   input_embedsr"   r   r   r   g      ?)dtyper7   )r"   r   past_key_valuer   rD   r   position_embeddings)last_hidden_stater   r!   
attentions)r   r   r   rD   
ValueErrorgradient_checkpointingtrainingloggerwarning_oncer#   r   get_seq_lengthr   aranger   r   	unsqueezer
   
rotary_embtensorr:   r   r$   r<   r%   r   )rJ   r   r"   r   r   r    rD   r   r   r   rK   past_seen_tokenscausal_maskr!   r   
normalizerall_hidden_statesall_self_attnsdecoder_layerlayer_outputsr7   r7   rN   r   t  s   



	


zGemmaModel.forward)	NNNNNNNNN)rO   rP   rQ   r   r   
LongTensorTensorr   FloatTensorr   r   r   r   r   r7   r7   r7   rN   r   s  sB    	
r   c                       r   )GemmaForCausalLMc                     s   t  jdi | S )a|  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, GemmaForCausalLM

        >>> model = GemmaForCausalLM.from_pretrained("google/gemma-7b")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b")

        >>> prompt = "What is your favorite condiment?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is your favorite condiment?"
        ```Nr7   )rH   r   )super_kwargsrL   r7   rN   r     s   zGemmaForCausalLM.forward)rO   rP   rQ   r   rW   r7   r7   rL   rN   r     r   r   c                   @      e Zd ZdS )GemmaForSequenceClassificationNrO   rP   rQ   r7   r7   r7   rN   r         r   c                   @   r   )GemmaForTokenClassificationNr   r7   r7   r7   rN   r     r   r   )r   rX   r   r   r   r   GemmaPreTrainedModel)4typingr   r   r   sentencepiecerk   r   torch.utils.checkpointr   cache_utilsr   r   configuration_utilsr	   masking_utilsr
   modeling_flash_attention_utilsr   modeling_outputsr   processing_utilsr   tokenization_utilsr   r   utilsr   llama.modeling_llamar   r   r   r   r   llama.tokenization_llamar   tokenization_utils_baser   VOCAB_FILES_NAMESr   
get_loggerrO   r   r   rX   Moduler   r   r   r   r   r   __all__r7   r7   r7   rN   <module>   s>   
  h