o
    iP                     @   s*   d dl mZ d dlZG dd dejZdS )    )AnyNc                       s   e Zd ZU dZejed< dZdZdZ										
				d!de
eef dB de
eef dB dedB dedB dedededededededef fddZ fddZedejfdd Z  ZS )"UltravoxConfiga  
    This is the configuration class to store the configuration of a
    [`UltravoxForConditionalGeneration`]. It is used to instantiate an
    Ultravox model according to the specified arguments, defining the model
    architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to
    control the model outputs. Read the documentation from [`PretrainedConfig`]
    for more information.

    Args:
        audio_config (`Union[AutoConfig, dict]`,  *optional*):
            Custom audio config or dict.
        text_config (`Union[AutoConfig, dict]`, *optional*):
            The config object of the text backbone.
        audio_model_id (`str`, *optional*):
            The model ID of the audio backbone.
        text_model_id (`str`, *optional*):
            The model ID of the text backbone.
        ignore_index (`int`, *optional*, defaults to -100):
            The ignore index for the loss function.
        audio_token_index (`int`, *optional*, defaults to 32000):
            The audio token index to encode the audio prompt.
        stack_factor (`int`, *optional*, defaults to 8):
            Audio downsampling factor for the multimodal projector.
        norm_init (`float`, *optional*, defaults to 0.4):
            The initialization value for the layer normalization.
        projector_act (`str`, *optional*, defaults to `"swiglu"`):
            The activation function used by the multimodal projector.
        projector_ln_mid (`bool`, *optional*, defaults to `False`):
            Whether to apply layer normalization at the middle of the
            projector or at the end. Versions v0.4.1 and below
            use `False`, but v0.5 and above use `True`.
    wrapped_model_configultravoxz	<|audio|>FN }        皙?swiglur   audio_configtext_configaudio_model_idtext_model_idignore_indexaudio_token_indexhidden_sizestack_factor	norm_initprojector_actprojector_ln_midnum_projector_layersc                    s   || _ || _|| _|| _|	| _|
| _|| _|| _|| _|d u r2|p"i }t	j
|dd di || _|| _|d u rOd | _|p?i }t	j
|dd di || _t jdi | d S )N
model_typellamawhisper )r   r   r   r   r   r   r   r   r   transformersCONFIG_MAPPINGgetr   r   r   super__init__)selfr   r   r   r   r   r   r   r   r   r   r   r   kwargs	__class__r   ^/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/transformers_utils/configs/ultravox.pyr    3   s4   

zUltravoxConfig.__init__c                    sd   |dkr|d urddl m} ||dd| _n|dkr+|d ur+ddl m} ||dd| _t ||S )Nr   r   )
get_configF)trust_remote_coder   )vllm.transformers_utils.configr&   r   r   r   __setattr__)r!   keyvaluer&   r#   r   r%   r)   `   s   zUltravoxConfig.__setattr__returnc                 C   s
   | j  S )N)r   get_text_config)r!   r   r   r%   r   s   s   
zUltravoxConfig.text_config)NNNNr   r   r   r	   r
   r   Fr   )__name__
__module____qualname____doc__r   PretrainedConfig__annotations__r   audio_tokenis_compositiondictstrr   intfloatboolr    r)   propertyr   __classcell__r   r   r#   r%   r   
   s\   
 
#	
-r   )typingr   r   r2   r   r   r   r   r%   <module>   s   