o
    ei5                     @   s   d dl mZ ddlmZ ddlmZmZ ddlmZ ddl	m
Z
mZmZmZ ddlmZ G d	d
 d
eZG dd deZG dd de
ZG dd deZG dd deZG dd deZg dZdS )    N   )RopeParameters)auto_docstringcan_return_tuple   )LlamaConfig)LlamaDecoderLayerLlamaForCausalLM
LlamaModelLlamaPreTrainedModel)NemotronMLPc                ,       s  e Zd ZdZdZdddddddZ						
															d+dedB dedB dedB dedB dedB dedB dedB dedB dedB dedB de	dB d edB d!edB d"edB d#e	dB d$e	dB d%edB d&e	dB d'edB d(e
eee
f B dB f( fd)d*Z  ZS ),Jais2Configa
  
    This is the configuration class to store the configuration of a [`Jais2Model`]. It is used to instantiate a Jais2
    model according to the specified arguments, defining the model architecture.
    [inceptionai/Jais-2-8B-Chat](https://huggingface.co/inceptionai/Jais-2-8B-Chat).

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 150272):
            Vocabulary size of the Jais2 model.
        hidden_size (`int`, *optional*, defaults to 3328):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 26624):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 26):
            Number of attention heads for each attention layer.
        num_key_value_heads (`int`, *optional*):
            Number of key_value heads for Grouped Query Attention.
        hidden_act (`str`, *optional*, defaults to `"relu2"`):
            The non-linear activation function in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 8192):
            The maximum sequence length.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether to return last key/values attentions.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 0):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 150024):
            End of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings.
        attention_bias (`bool`, *optional*, defaults to `True`):
            Whether to use a bias in the query, key, value and output projection layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        mlp_bias (`bool`, *optional*, defaults to `True`):
            Whether to use a bias in up_proj, down_proj and gate_proj layers.
        head_dim (`int`, *optional*):
            The attention head dimension.
        rope_parameters (`dict`, *optional*):
            The RoPE parameters.
    jais2colwiserowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj K     h         Nrelu2    {Gz?h㈵>Tr   J F        
vocab_sizehidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_heads
hidden_actmax_position_embeddingsinitializer_rangelayer_norm_eps	use_cachepad_token_idbos_token_ideos_token_idtie_word_embeddingsattention_biasattention_dropoutmlp_biashead_dimrope_parametersc                    s   t  jdi d|d|d|d|d|d|d|d|d	|	d
|d|d|d|d|d|d|d|d|d|| |
| _| `| `d S )Nr   r   r   r   r    r!   r"   r#   r$   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/    )super__init__r%   rms_norm_epspretraining_tp)selfr   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   kwargs	__class__r0   e/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/jais2/modular_jais2.pyr2   ]   sV   	
zJais2Config.__init__)r   r   r   r   r   Nr   r   r   r   TNr   r   FTr   TNN)__name__
__module____qualname____doc__
model_typebase_model_tp_planintstrfloatboolr   dictr2   __classcell__r0   r0   r7   r9   r      s    3	
r   c                   @      e Zd ZdS )Jais2MLPNr:   r;   r<   r0   r0   r0   r9   rG          rG   c                       s&   e Zd Zdedef fddZ  ZS )Jais2DecoderLayerconfig	layer_idxc                    s:   t  || tj|j|jd| _tj|j|jd| _d S N)eps)r1   r2   nn	LayerNormr   r%   input_layernormpost_attention_layernorm)r5   rK   rL   r7   r0   r9   r2      s   zJais2DecoderLayer.__init__)r:   r;   r<   r   r@   r2   rE   r0   r0   r7   r9   rJ      s    rJ   c                   @   rF   )Jais2PreTrainedModelNrH   r0   r0   r0   r9   rS      rI   rS   c                       s"   e Zd Zdef fddZ  ZS )
Jais2ModelrK   c                    s$   t  | tj|j|jd| _d S rM   )r1   r2   rO   rP   r   r%   norm)r5   rK   r7   r0   r9   r2      s   zJais2Model.__init__)r:   r;   r<   r   r2   rE   r0   r0   r7   r9   rT      s    rT   c                       s$   e Zd Zee fddZ  ZS )Jais2ForCausalLMc                    s   t  jdi |S )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Jais2ForCausalLM

        >>> model = Jais2ForCausalLM.from_pretrained("inceptionai/Jais-2-8B-Chat")
        >>> tokenizer = AutoTokenizer.from_pretrained("inceptionai/Jais-2-8B-Chat")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```Nr0   )r1   forward)r5   super_kwargsr7   r0   r9   rW      s   zJais2ForCausalLM.forward)r:   r;   r<   r   r   rW   rE   r0   r0   r7   r9   rV      s    rV   )r   rT   rV   rS   )torch.nnrO   modeling_rope_utilsr   utilsr   r   llama.configuration_llamar   llama.modeling_llamar   r	   r
   r   nemotron.modeling_nemotronr   r   rG   rJ   rS   rT   rV   __all__r0   r0   r0   r9   <module>   s   r