o
    ¾e¦iƒ"  ã                   @   sH   d dl mZ ddlmZmZ G dd„ deƒZG dd„ deƒZddgZdS )	é   )ÚPreTrainedConfigé   )ÚCONFIG_MAPPINGÚ
AutoConfigc                       s<   e Zd ZdZdZ									
			d‡ fdd„	Z‡  ZS )ÚGlmAsrEncoderConfiga`  
    This is the configuration class to store the configuration of a [`GlmAsrEncoder`]. It is used to instantiate a
    glmasr audio encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the audio encoder of the glmasr
    architecture.

    e.g. [zai-org/GLM-ASR-Nano-2512](https://huggingface.co/zai-org/GLM-ASR-Nano-2512)

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 1280):
            Dimensionality of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 5120):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 20):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler.
        max_position_embeddings (`int`, *optional*, defaults to 1500):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        num_mel_bins (`int`, *optional*, defaults to 128):
            Number of mel features used per input features. Should correspond to the value used in the
            `GlmAsrProcessor` class.

    ```python
    >>> from transformers import GlmAsrEncoderConfig, GlmAsrEncoder

    >>> # Initializing a GlmAsrEncoderConfig
    >>> configuration = GlmAsrEncoderConfig()

    >>> # Initializing a GlmAsrEncoder (with random weights)
    >>> model = GlmAsrEncoder(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```Úglmasr_encoderé   é   é    é   NÚgeluéÜ  ç{®Gáz”?ç        é€   c                    sz   || _ || _|| _|| _|d u r|}|| _|| _|| _|| | _|| _|	| _	|
| _
|| _| dd¡ tƒ jdi |¤Ž d S )NÚpartial_rotary_factorg      à?© )Úhidden_sizeÚintermediate_sizeÚnum_hidden_layersÚnum_attention_headsÚnum_key_value_headsÚ
hidden_actÚinitializer_rangeÚhead_dimÚmax_position_embeddingsÚrope_parametersÚattention_dropoutÚnum_mel_binsÚ
setdefaultÚsuperÚ__init__)Úselfr   r   r   r   r   r   r   r   r   r   r   Úkwargs©Ú	__class__r   úm/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/glmasr/configuration_glmasr.pyr!   O   s    
zGlmAsrEncoderConfig.__init__)r   r	   r
   r   Nr   r   r   Nr   r   )Ú__name__Ú
__module__Ú__qualname__Ú__doc__Ú
model_typer!   Ú__classcell__r   r   r$   r&   r      s    9ôr   c                       s^   e Zd ZdZdZeedœZdddddd	d
ddg d¢dddœdœZ				d‡ fdd„	Z‡  Z	S )ÚGlmAsrConfiga  
    This is the configuration class to store the configuration of a [`GlmAsrForConditionalGeneration`]. It is used to instantiate an
    glmasr model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the glmasr-Mini-3B.

    e.g. [zai-org/GLM-ASR-Nano-2512](https://huggingface.co/zai-org/GLM-ASR-Nano-2512)

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        audio_config (`Union[AutoConfig, dict]`, *optional*):
            The config object or dictionary of the audio encoder.
        text_config (`Union[AutoConfig, dict]`, *optional*):
            The config object or dictionary of the text model.
        audio_token_id (`int`, *optional*, defaults to 59260):
            The audio token index to encode the audio prompt.
        projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The activation function (function or string) in the multi-modal projector.

    ```python
    >>> from transformers import GlmAsrForConditionalGeneration, GlmAsrConfig

    >>> # Initializing a glmasr configuration
    >>> configuration = GlmAsrConfig()

    >>> # Initializing a GLM-ASR-Nano-2512 model with random weights
    >>> model = GlmAsrForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```Úglmasr)Útext_configÚaudio_configi€ç  i   i   é   é   é   i    gñhãˆµøä>T)inç  iuç  iwç  g     ˆÃ@Údefault)Ú
rope_thetaÚ	rope_type)Ú
vocab_sizer   r   r   r   r   r   Úrms_norm_epsÚ	use_cacheÚeos_token_idr   Né|ç  r   c                    sÚ   t |tƒr| dd¡|d< t|d  di |¤Ž}n	|d u r"td ƒ }|| _t |tƒrC| dd¡|d< t|d  di i | j¥|¥¤Ž}n|d u rQtd di | j¤Ž}|| _|j| _|j| _|| _	|| _
tƒ jdi |¤Ž d S )Nr+   r   Úllamar   )Ú
isinstanceÚdictÚgetr   r0   Ú_default_text_config_kwargsr/   r7   r   Úaudio_token_idÚprojector_hidden_actr    r!   )r"   r0   r/   rA   rB   r#   r$   r   r&   r!   ¤   s&   


ÿzGlmAsrConfig.__init__)NNr;   r   )
r'   r(   r)   r*   r+   r   Úsub_configsr@   r!   r,   r   r   r$   r&   r-   q   s*    !
õûr-   N)Úconfiguration_utilsr   Úautor   r   r   r-   Ú__all__r   r   r   r&   Ú<module>   s
   ^S