# Modular definition of the Gemma configuration, tokenizer, and model classes.
from typing import TYPE_CHECKING, Any, Optional

import sentencepiece as spm
import torch
from torch import nn

from ...cache_utils import Cache, DynamicCache
from ...configuration_utils import PretrainedConfig
from ...masking_utils import create_causal_mask
from ...modeling_outputs import BaseModelOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...processing_utils import Unpack
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import TransformersKwargs, logging
from ..llama.modeling_llama import (
    LlamaForCausalLM,
    LlamaForSequenceClassification,
    LlamaForTokenClassification,
    LlamaMLP,
    LlamaModel,
    LlamaPreTrainedModel,
    LlamaRotaryEmbedding,
)
from ..llama.tokenization_llama import LlamaTokenizer


if TYPE_CHECKING:
    from ...tokenization_utils_base import TextInput

VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}

SPIECE_UNDERLINE = "▁"

logger = logging.get_logger(__name__)


class GemmaConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`GemmaModel`]. It is used to instantiate a Gemma
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Gemma-7B.
    e.g. [google/gemma-7b](https://huggingface.co/google/gemma-7b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 256000):
            Vocabulary size of the Gemma model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`GemmaModel`]
        hidden_size (`int`, *optional*, defaults to 3072):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 24576):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 28):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 16):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 256):
            The attention head dimension.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The legacy activation function. It is overwritten by the `hidden_activation`.
        hidden_activation (`str` or `function`, *optional*):
            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
        max_position_embeddings (`int`, *optional*, defaults to 8192):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether or not to tie the input and output word embeddings.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
    ```python
    >>> from transformers import GemmaModel, GemmaConfig
    >>> # Initializing a Gemma gemma-7b style configuration
    >>> configuration = GemmaConfig()
    >>> # Initializing a model from the gemma-7b style configuration
    >>> model = GemmaModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "gemma"
    keys_to_ignore_at_inference = ["past_key_values"]
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=256000,
        hidden_size=3072,
        intermediate_size=24576,
        num_hidden_layers=28,
        num_attention_heads=16,
        num_key_value_heads=16,
        head_dim=256,
        hidden_act="gelu_pytorch_tanh",
        hidden_activation=None,
        max_position_embeddings=8192,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=0,
        eos_token_id=1,
        bos_token_id=2,
        tie_word_embeddings=True,
        rope_theta=10000.0,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.hidden_activation = hidden_activation
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


class GemmaTokenizer(LlamaTokenizer, PreTrainedTokenizer):
    """
    Construct a Gemma tokenizer. Based on byte-level Byte-Pair-Encoding. The default padding token is unset as there is
    no padding token in the original model.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<bos>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<eos>"`):
            The end of sequence token.
        pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<pad>"`):
            A special token used to make arrays of tokens the same size for batching purposes. Will then be ignored by
            attention mechanisms or loss computation.
        sp_model_kwargs (`dict[str, Any]`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

        add_bos_token (`bool`, *optional*, defaults to `True`):
            Whether or not to add a `bos_token` at the start of sequences.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether or not to add an `eos_token` at the end of sequences.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Whether or not to clean up spaces after decoding; cleanup consists of removing potential artifacts like
            extra spaces.
        use_default_system_prompt (`bool`, *optional*, defaults to `False`):
            Whether or not the default system prompt for Gemma should be used.
        spaces_between_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not to add spaces between special tokens.
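
    Example (illustrative sketch; it assumes a local SentencePiece `tokenizer.model` file such as the
    one shipped with Gemma checkpoints):

    ```python
    >>> from transformers import GemmaTokenizer

    >>> tokenizer = GemmaTokenizer("tokenizer.model")  # doctest: +SKIP
    >>> tokenizer.tokenize("Hello world")  # doctest: +SKIP
    ```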
    """

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<bos>",
        eos_token="<eos>",
        pad_token="<pad>",
        sp_model_kwargs: Optional[dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        clean_up_tokenization_spaces=False,
        use_default_system_prompt=False,
        spaces_between_special_tokens=False,
        **kwargs,
    ):
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token
        pad_token = AddedToken(pad_token, normalized=False, special=True) if isinstance(pad_token, str) else pad_token

        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.use_default_system_prompt = use_default_system_prompt
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)

        PreTrainedTokenizer.__init__(
            self,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            sp_model_kwargs=sp_model_kwargs,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            use_default_system_prompt=use_default_system_prompt,
            spaces_between_special_tokens=spaces_between_special_tokens,
            **kwargs,
        )

    def get_spm_processor(self):
        raise AttributeError("Not needed for Gemma")

    def unk_token_length(self):
        raise AttributeError("Not needed for Gemma")

    def tokenize(self, text: "TextInput", **kwargs) -> list[str]:
        """
        Args:
            text: TextInput
        Simply calls PreTrainedTokenizer's method
        """
        return PreTrainedTokenizer.tokenize(self, text, **kwargs)

    def _tokenize(self, text, **kwargs):
        """
        Args:
            text: TextInput
        Returns a tokenized string. The Gemma tokenizer never adds a prefix space.
        """
        return self.sp_model.encode(text, out_type=str)

    def _decode(
        self,
        token_ids: list[int],
        skip_special_tokens: bool = False,
        spaces_between_special_tokens: bool = False,
        **kwargs,
    ) -> str:
        sub_texts = []
        current_sub_text = []
        for ids in token_ids:
            if skip_special_tokens and ids in self.all_special_ids:
                continue
            if ids in self._added_tokens_decoder:
                if current_sub_text:
                    sub_texts.append(self.sp_model.decode(current_sub_text))
                sub_texts.append(self._added_tokens_decoder[ids].content)
                current_sub_text = []
            else:
                current_sub_text.append(ids)
        if current_sub_text:
            sub_texts.append(self.sp_model.decode(current_sub_text))

        if spaces_between_special_tokens:
            sub_texts = " ".join(sub_texts)
        else:
            sub_texts = "".join(sub_texts)

        return sub_texts.replace(SPIECE_UNDERLINE, " ")

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        current_sub_tokens = []
        out_string = ""
        for token in tokens:
            # make sure that special tokens are not decoded using the sentencepiece model
            if token in self._added_tokens_encoder:
                out_string += self.sp_model.decode(current_sub_tokens) + token
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string


class GemmaRMSNorm(nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        # Gemma stores a zero-initialised weight and applies (1 + weight) as the scale in forward.
        self.weight = nn.Parameter(torch.zeros(dim))

    def _norm(self, x):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        output = self._norm(x.float())
        # The (1 + weight) scaling is applied in float32 and only then cast back to the input dtype.
        output = output * (1.0 + self.weight.float())
        return output.type_as(x)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.eps}"


class GemmaMLP(LlamaMLP):
    def __init__(self, config):
        super().__init__(config)
        # Gemma's MLP projections never carry a bias term.
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)


class GemmaRotaryEmbedding(LlamaRotaryEmbedding):
    pass


class GemmaPreTrainedModel(LlamaPreTrainedModel):
    def _init_weights(self, module):
        PreTrainedModel._init_weights(self, module)
        # GemmaRMSNorm uses (1 + weight), so the weight itself is initialised to zero.
        if "RMSNorm" in module.__class__.__name__:
            module.weight.data.zero_()


class GemmaModel(LlamaModel):
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPast:
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = create_causal_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
            position_ids=position_ids,
        )

        hidden_states = inputs_embeds

        # Rotary position embeddings are computed once and shared across all decoder layers.
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        # Gemma scales the embeddings by sqrt(hidden_size); the scale is materialised as a tensor
        # in the activation dtype to match the reference implementation.
        normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype)
        hidden_states = hidden_states * normalizer

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            hidden_states = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                position_ids=position_ids,
                past_key_values=past_key_values,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **kwargs,
            )

        hidden_states = self.norm(hidden_states)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
        )


class GemmaForCausalLM(LlamaForCausalLM):
    # Docstring-only override: the signature and body are filled in from LlamaForCausalLM
    # when the modeling file is generated from this modular file.
    def forward(**super_kwargs):
        r"""
        Example:

        ```python
        >>> from transformers import AutoTokenizer, GemmaForCausalLM

        >>> model = GemmaForCausalLM.from_pretrained("google/gemma-7b")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b")

        >>> prompt = "What is your favorite condiment?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is your favorite condiment?"
        ```"""
        return super().forward(**super_kwargs)


class GemmaForSequenceClassification(LlamaForSequenceClassification):
    pass


class GemmaForTokenClassification(LlamaForTokenClassification):
    pass


__all__ = [
    "GemmaConfig",
    "GemmaTokenizer",
    "GemmaModel",
    "GemmaForCausalLM",
    "GemmaForSequenceClassification",
    "GemmaForTokenClassification",
    "GemmaPreTrainedModel",
]
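

# ---------------------------------------------------------------------------
# Minimal, self-contained usage sketch (illustrative only). It exercises the
# pieces defined above that run without downloading any checkpoint: the GQA
# head-count relationship documented in `GemmaConfig` and the zero-initialised
# `GemmaRMSNorm`. The tiny sizes below are arbitrary assumptions chosen for the
# demonstration, not Gemma defaults.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # 16 query heads sharing 4 key/value heads -> grouped-query attention (GQA).
    cfg = GemmaConfig(num_attention_heads=16, num_key_value_heads=4)
    assert cfg.num_attention_heads % cfg.num_key_value_heads == 0

    # A freshly constructed GemmaRMSNorm has weight == 0, so it only divides by the RMS of the
    # input: output = x / sqrt(mean(x**2) + eps) * (1 + weight).
    norm = GemmaRMSNorm(dim=8)
    x = torch.randn(2, 4, 8)
    print(norm(x).shape)  # torch.Size([2, 4, 8])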