o
    i(                     @   s:   d Z ddlmZ ddlmZ eeZG dd deZdS )zJAIS configuration    )PretrainedConfig)loggingc                       st   e Zd ZdZdZdgZdddddZ			
																							d fdd	Zdd Z  Z	S )
JAISConfigaw  
    This is the configuration class to store the configuration of a
    [`JAISModel`]. It is used to instantiate a JAIS model according to the
    specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used
    to control the model outputs. Read the documentation from
    [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 50257):
            Vocabulary size of the JAIS model. Defines the number of different
            tokens that can be represented by the
            `inputs_ids` passed when calling [`JAISModel`].
        n_positions (`int`, *optional*, defaults to 1024):
            The maximum sequence length that this model might ever be used
            with. Typically set this to something large just in case
            (e.g., 512 or 1024 or 2048).
        n_embd (`int`, *optional*, defaults to 768):
            Dimensionality of the embeddings and hidden states.
        n_layer (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        n_head (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the
            Transformer encoder.
        n_inner (`int`, *optional*, defaults to None):
            Dimensionality of the inner feed-forward layers. `None` will set
            it to 4 times n_embd
        activation_function (`str`, *optional*, defaults to `"gelu"`):
            Activation function, to be selected in the list
            `["relu", "silu", "gelu", "tanh", "gelu_new", "swiglu"]`.
        resid_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in
            the embeddings, encoder, and pooler.
        embd_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the embeddings.
        attn_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
            The epsilon to use in the layer normalization layers.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        scale_attn_weights (`bool`, *optional*, defaults to `True`):
            Scale attention weights by dividing by sqrt(hidden_size)..
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models).
        scale_attn_by_inverse_layer_idx (`bool`, *optional*, default `True`):
            Whether to additionally scale attention weights
            by `1 / layer_idx + 1`.
        reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
            Whether to scale keys (K) prior to computing attention
            (dot-product)
            and upcast attention dot-product/softmax to float() when training
            with mixed precision.
        position_embedding_type (`str`, *optional*, defaults to `"learned"`):
            Positional embedding can be either `"alibi"` or `"learned"`.
        mup_width_scale (`float`, *optional*, defaults to 1.0):
            muP parameter to scale learning rate and initializers. Calculated
            as (`d_model,0 / d_model`), where
            `d_model` is the model's width and `d_model,0` is the proxy
            model's width.
        mup_embeddings_scale (`float`, *optional*, defaults to 1.0):
            muP parameter to scale token and position embeddings.
        mup_output_alpha (`float`, *optional*, defaults to 1.0):
            muP parameter to scale output logits
            (`output_logits_scale = mup_output_alpha * mup_width_scale`).
        mup_scale_qk_dot_by_d (`bool`, *optional*, defaults to `False`):
            Scale attention weights by dividing by hidden_size instead of
            sqrt(hidden_size). Need to set scale_attn_weights to `True` as
            well.
        alibi_scaling (`dict`, *optional*):
            Dictionary containing the scaling configuration for ALiBi
            embeddings. Currently only supports linear
            scaling strategy. Can specify either the scaling `factor` (must be
            a float greater than 1) for fixed scaling
            or `train_seq_len` for dynamic scaling on input samples with
            sequence length > `train_seq_len`. The expected
            formats are `{"type": strategy name, "factor": scaling factor}` or
            `{"type": strategy name,
            "train_seq_len": training sequence length}`.
        architectures (`list`, *optional*, defaults to ['JAISLMHeadModel']):
            architecture names for Jais.

    Example:

    ```python
    >>> from transformers import JAISConfig, JAISModel

    >>> # Initializing a JAIS configuration
    >>> configuration = JAISConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = JAISModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```jaispast_key_valuesn_embdn_positionsn_headn_layer)hidden_sizemax_position_embeddingsnum_attention_headsnum_hidden_layersQ           Ngelu_new皙?h㈵>{Gz?TP  Flearned      ?c                    s   || _ || _|| _|| _|| _|| _|| _|| _|	| _|
| _	|| _
|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|   |d u rSdg}t jd|||d| d S )NJAISLMHeadModel)bos_token_ideos_token_idarchitectures )
vocab_sizer   r   r
   r	   n_inneractivation_functionresid_pdrop
embd_pdrop
attn_pdroplayer_norm_epsiloninitializer_rangescale_attn_weights	use_cachescale_attn_by_inverse_layer_idxreorder_and_upcast_attnr   r   position_embedding_typemup_width_scalemup_embeddings_scalemup_output_alphamup_scale_qk_dot_by_dalibi_scaling_alibi_scaling_validationsuper__init__)selfr   r   r   r
   r	   r    r!   r"   r#   r$   r%   r&   r'   r(   r   r   r)   r*   r+   r,   r-   r.   r/   r0   r   kwargs	__class__r   Z/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/transformers_utils/configs/jais.pyr3      sD   
zJAISConfig.__init__c                 C   s   | j du rdS t| j trt| j dkrtd| j  | j dd}| j dd}| j dd}|du s9|dkr@td| |durIt|trQ|durX|d	krXtd
| |durat|tri|durp|dkrrtd| dS dS )z=
        Validate the `alibi_scaling` configuration.
        N   zm`alibi_scaling` must be a dictionary with two fields, `type` and `factor` or `type` and `train_seq_len`, got typefactortrain_seq_lenlinearz3`alibi_scaling`'s type field must be 'linear', got r   z:`alibi_scaling`'s factor field must be a float > 1.0, got    zD`alibi_scaling`'s `train_seq_len` field must be an integer > 1, got )r0   
isinstancedictlen
ValueErrorgetfloatint)r4   alibi_scaling_typealibi_scaling_factoralibi_dynamic_scalingr   r   r8   r1      sH   
z$JAISConfig._alibi_scaling_validation)r   r   r   r   r   Nr   r   r   r   r   r   TTr   r   FFr   r   r   r   FNN)
__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferenceattribute_mapr3   r1   __classcell__r   r   r6   r8   r      sH    e	Cr   N)	rL    transformers.configuration_utilsr   transformers.utilsr   
get_loggerrI   loggerr   r   r   r   r8   <module>   s
   
