"""Dbrx configuration."""

from typing import Any, Optional

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)

DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP = {}


class DbrxAttentionConfig(PretrainedConfig):
    """Configuration class for Dbrx Attention.

    [`DbrxAttention`] class. It is used to instantiate attention layers
    according to the specified arguments, defining the layers architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        attn_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout probability for the attention layers.
        clip_qkv (`float`, *optional*, defaults to None):
            If not `None`, clip the queries, keys, and values in the attention layer to this value.
        kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads.
        rope_theta (float): The base frequency for rope.
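
    Example (an illustrative instantiation; the values below are arbitrary and
    chosen only to demonstrate the knobs, not taken from a released
    checkpoint):
    ```python
    >>> config = DbrxAttentionConfig(attn_pdrop=0.0, clip_qkv=8.0, kv_n_heads=8)
    >>> config.rope_theta
    10000.0
    ```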
    """

    def __init__(
        self,
        attn_pdrop: float = 0.0,
        clip_qkv: Optional[float] = None,
        kv_n_heads: int = 1,
        rope_theta: float = 10000.0,
        **kwargs: Any,
    ):
        super().__init__(**kwargs)
        self.attn_pdrop = attn_pdrop
        self.clip_qkv = clip_qkv
        self.kv_n_heads = kv_n_heads
        self.rope_theta = rope_theta

        for k in ["model_type"]:
            if k in kwargs:
                kwargs.pop(k)
        if len(kwargs) != 0:
            raise ValueError(f"Found unknown {kwargs=}")

    @classmethod
    def from_pretrained(
        cls, pretrained_model_name_or_path: str, **kwargs: Any
    ) -> "PretrainedConfig":
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs
        )

        # When loading from a full Dbrx checkpoint, pull out the nested
        # attention sub-config instead of the top-level model config.
        if config_dict.get("model_type") == "dbrx":
            config_dict = config_dict["attn_config"]

        if (
            "model_type" in config_dict
            and hasattr(cls, "model_type")
            and config_dict["model_type"] != cls.model_type
        ):
            logger.warning(
                "You are using a model of type %s to instantiate a model of type "
                "%s. This is not supported for all configurations of models and "
                "can yield errors.",
                config_dict["model_type"],
                cls.model_type,
            )

        return cls.from_dict(config_dict, **kwargs)


class DbrxFFNConfig(PretrainedConfig):
    """Configuration class for Dbrx FFN.

    [`DbrxFFN`] class. It is used to instantiate feedforward layers according to
    the specified arguments, defining the layers architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        ffn_act_fn (dict, optional): A dict specifying activation function for the FFN.
            The dict should have a key 'name' with the value being the name of
            the activation function along with any additional keyword arguments.
        ffn_hidden_size (int, optional): The hidden size of the feedforward network.
        moe_num_experts (int, optional): The number of experts in the mixture of experts layer.
        moe_top_k (int, optional): The number of experts to use in the mixture of experts layer.
        moe_jitter_eps (float, optional): The jitter epsilon for the mixture of experts layer.
        moe_loss_weight (float, optional): The loss weight for the mixture of experts layer.
        moe_normalize_expert_weights (float, optional): The normalization factor for the expert weights.
        uniform_expert_assignment (bool, optional): Whether to use uniform expert assignment.
            This should only be used for benchmarking purposes.
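
    Example (an illustrative instantiation; values are arbitrary and only
    demonstrate the MoE knobs, including the default `silu` activation):
    ```python
    >>> config = DbrxFFNConfig(ffn_hidden_size=10752, moe_num_experts=16, moe_top_k=4)
    >>> config.ffn_act_fn
    {'name': 'silu'}
    ```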
    """

    def __init__(
        self,
        ffn_act_fn: Optional[dict] = None,
        ffn_hidden_size: int = 3584,
        moe_num_experts: int = 4,
        moe_top_k: int = 1,
        moe_jitter_eps: Optional[float] = None,
        moe_loss_weight: float = 0.01,
        moe_normalize_expert_weights: Optional[float] = 1,
        uniform_expert_assignment: bool = False,
        **kwargs: Any,
    ):
        super().__init__()
        if ffn_act_fn is None:
            ffn_act_fn = {"name": "silu"}
        self.ffn_act_fn = ffn_act_fn
        self.ffn_hidden_size = ffn_hidden_size
        self.moe_num_experts = moe_num_experts
        self.moe_top_k = moe_top_k
        self.moe_jitter_eps = moe_jitter_eps
        self.moe_loss_weight = moe_loss_weight
        self.moe_normalize_expert_weights = moe_normalize_expert_weights
        self.uniform_expert_assignment = uniform_expert_assignment

        for k in ["model_type"]:
            if k in kwargs:
                kwargs.pop(k)
        if len(kwargs) != 0:
            raise ValueError(f"Found unknown {kwargs=}")

    @classmethod
    def from_pretrained(
        cls, pretrained_model_name_or_path: str, **kwargs: Any
    ) -> "PretrainedConfig":
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs
        )

        # When loading from a full Dbrx checkpoint, pull out the nested FFN
        # sub-config instead of the top-level model config.
        if config_dict.get("model_type") == "dbrx":
            config_dict = config_dict["ffn_config"]

        if (
            "model_type" in config_dict
            and hasattr(cls, "model_type")
            and config_dict["model_type"] != cls.model_type
        ):
            logger.warning(
                "You are using a model of type %s to instantiate a model of type "
                "%s. This is not supported for all configurations of models and "
                "can yield errors.",
                config_dict["model_type"],
                cls.model_type,
            )

        return cls.from_dict(config_dict, **kwargs)


class DbrxConfig(PretrainedConfig):
    """Configuration class for Dbrx.

    [`DbrxModel`]. It is used to instantiate a Dbrx model according to the
    specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        d_model (`int`, *optional*, defaults to 2048):
            Dimensionality of the embeddings and hidden states.
        n_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        n_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        max_seq_len (`int`, *optional*, defaults to 2048):
            The maximum sequence length of the model.
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the Dbrx model. Defines the maximum number of different tokens that can be represented by
            the `input_ids` passed when calling [`DbrxModel`].
        resid_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout probability applied to the attention output before combining with residual.
        emb_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout probability for the embedding layer.
        attn_config (`dict`, *optional*):
            A dictionary used to configure the model's attention module.
        ffn_config (`dict`, *optional*):
            A dictionary used to configure the model's FFN module.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabling this will also
            allow the model to output the auxiliary loss.
        router_aux_loss_coef (`float`, *optional*, defaults to 0.05):
            The aux loss factor for the total loss.


    Example:
    ```python
    >>> from transformers import DbrxConfig, DbrxModel

    >>> # Initializing a Dbrx configuration
    >>> configuration = DbrxConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = DbrxModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "dbrx"
    attribute_map = {
        "num_attention_heads": "n_heads",
        "hidden_size": "d_model",
        "num_hidden_layers": "n_layers",
        "max_position_embeddings": "max_seq_len",
    }

    def __init__(
        self,
        d_model: int = 2048,
        n_heads: int = 16,
        n_layers: int = 24,
        max_seq_len: int = 2048,
        vocab_size: int = 32000,
        resid_pdrop: float = 0.0,
        emb_pdrop: float = 0.0,
        attn_config: Optional[DbrxAttentionConfig] = None,
        ffn_config: Optional[DbrxFFNConfig] = None,
        use_cache: bool = True,
        initializer_range: float = 0.02,
        output_router_logits: bool = False,
        router_aux_loss_coef: float = 0.05,
        **kwargs: Any,
    ):
        # Sub-configs may arrive as plain dicts (e.g. parsed from JSON);
        # promote them to their typed config classes.
        if attn_config is None:
            self.attn_config = DbrxAttentionConfig()
        elif isinstance(attn_config, dict):
            self.attn_config = DbrxAttentionConfig(**attn_config)
        else:
            self.attn_config = attn_config

        if ffn_config is None:
            self.ffn_config = DbrxFFNConfig()
        elif isinstance(ffn_config, dict):
            self.ffn_config = DbrxFFNConfig(**ffn_config)
        else:
            self.ffn_config = ffn_config

        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.resid_pdrop = resid_pdrop
        self.emb_pdrop = emb_pdrop
        self.use_cache = use_cache
        self.initializer_range = initializer_range
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef

        tie_word_embeddings = kwargs.pop("tie_word_embeddings", False)
        if tie_word_embeddings:
            raise ValueError("tie_word_embeddings is not supported for Dbrx models.")

        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)