o
    ¾e¦iŒ%  ã                   @   s*   d dl mZmZ G dd„ deƒZdgZdS )é   )ÚPreTrainedConfigÚlayer_type_validationc                ,       s   e Zd ZdZdZdgZddddddddœZdgdgfd	d
gd	gfd	gd	gfdœZdZdddddddddddddg d¢ddddddddfd e	d!e	d"e	d#e	d$e	d%e	d&e	d'e
d(e	d)ed*ed+ed,e	dB d-e	d.ed/ed0e	d1ed2edB d3e	d4ee
 dB f*‡ fd5d6„Z‡  ZS )7Ú	CwmConfigaÙ  
    Configuration for Code World Model (CWM).
    This is an inherited Llama3-compatible configuration with layer-interleaved
    sliding-window attention. Configures a `CwmModel`. Designed to yield a configuration mirroring the model in the
    [facebook/cwm](https://huggingface.co/facebook/cwm) architecture by default. Other models include:
    - [facebook/cwm-sft](https://huggingface.co/facebook/cwm-sft)
    - [facebook/cwm-pretrain](https://huggingface.co/facebook/cwm-pretrain)

    Args:
        vocab_size (`int`, *optional*, defaults to 128256):
            Vocabulary size of the CWM model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`CwmModel`]
        hidden_size (`int`, *optional*, defaults to 6144):
            Dimension of the hidden representations
        intermediate_size (`int`, *optional*, defaults to 21504):
            Dimension of the MLP representations
        num_hidden_layers (`int`, *optional*, defaults to 64):
            Number of hidden layers in the Transformer decoder
        num_attention_heads (`int`, *optional*, defaults to 48):
            Number of attention heads for each attention layer in the Transformer decoder
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention (GQA).
            If it is not specified, will default to `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 128):
            The attention head dimension.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 131072):
            The maximum sequence length that this model might ever be used with. CWM's attention allows sequence
            lengths up to 131072 tokens.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        eos_token_id (`int` or `list[int]`, *optional*, defaults to `[128001, 128008, 128009]`):
            The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
        bos_token_id (`int`, *optional*, defaults to 128000):
            The id of the *beginning-of-sequence* token.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        pretraining_tp (`int`, *optional*, defaults to 1):
            Tensor parallelism degree used during pretraining. See [this
            document](https://huggingface.co/docs/transformers/parallelism) and [this
            issue](https://github.com/pytorch/pytorch/issues/76232).
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.
        sliding_window (`int`, *optional*, defaults to 8192):
            Sliding window attention window size.
        layer_types (`List[str]`, *optional*):
            List of layer types for each layer. Each element should be either "full_attention" or "sliding_attention".
            If not specified, will default to alternating pattern based on the provided window pattern.
    ÚcwmÚpast_key_valuesÚcolwiseÚrowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_projÚ	input_idsÚinputs_embedsÚhidden_statesÚattention_mask)Úembed_tokensÚlayersÚnormç    €„.Ai õ i   i T  é@   é0   é   é€   Úsilui   g{®Gáz”?gñhãˆµøä>TN)iô iô i	ô i ô Fg        é   é    Ú
vocab_sizeÚhidden_sizeÚintermediate_sizeÚnum_hidden_layersÚnum_attention_headsÚnum_key_value_headsÚhead_dimÚ
hidden_actÚmax_position_embeddingsÚinitializer_rangeÚrms_norm_epsÚ	use_cacheÚpad_token_idÚbos_token_idÚtie_word_embeddingsÚattention_dropoutÚpretraining_tpÚmlp_biasÚrope_parametersÚsliding_windowÚlayer_typesc                    s  |d u rdddddddœ}|d u rd‰ ‡ fd	d
„t |ƒD ƒ}nt||ƒ |r*t|ƒnd | _t|ƒ| _|| _|	| _|| _|| _	|| _
|| _|d u rJ|}|| _|| _|
| _|| _|| _|| _|| _|| _|d urh|n| j| j | _|| _|| _|| _|| _|| _tƒ jdi |¤Ž d S )Nr   g      0@g      @g      ð?r   Úllama3)Ú
rope_thetaÚfactorÚhigh_freq_factorÚlow_freq_factorÚ original_max_position_embeddingsÚ	rope_typeé   c                    s    g | ]}|ˆ  d krdnd‘qS )é    Úfull_attentionÚsliding_attention© )Ú.0Úi©Úwindow_patternr8   úg/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/cwm/configuration_cwm.pyÚ
<listcomp>•   s    ÿÿz&CwmConfig.__init__.<locals>.<listcomp>r8   )Úranger   Úintr+   Úlistr,   r   r    r   r   r   r   r   r   r!   r"   r(   r#   r'   r)   r   r*   r&   r$   r%   Úeos_token_idÚsuperÚ__init__)Úselfr   r   r   r   r   r   r   r   r    r!   r"   r#   r$   rB   r%   r&   r'   r(   r)   r*   r+   r,   Úkwargs©Ú	__class__r;   r=   rD   m   sN   ú	
þ

zCwmConfig.__init__)Ú__name__Ú
__module__Ú__qualname__Ú__doc__Ú
model_typeÚkeys_to_ignore_at_inferenceÚbase_model_tp_planÚbase_model_pp_planÚdefault_thetar@   ÚstrÚfloatÚboolÚdictrA   rD   Ú__classcell__r8   r8   rG   r=   r      s¤    @ù


ýèþýüûúùø	÷
öõôóòðïîíìëé
èr   N)Úconfiguration_utilsr   r   r   Ú__all__r8   r8   r8   r=   Ú<module>   s    
#