o
    i                     @   s    d dl mZ G dd deZdS )    )PretrainedConfigc                       sP   e Zd ZdZdgZ										
											d fdd	Z  ZS )Olmo3Configolmo3past_key_values      +      Nsilu   {Gz?T   g  F        h㈵>c                    s$  d|vr
dg|d< nd|d v r|d  d |d d t jd||||d| || _|| _|| _|| _|| _|| _	|d u rD|}|| _
|| _|	| _|
| _|dd }|p]|p]ddi}|dd	}d|vrl||d< || _|| _|| _|| _|| _|| _| jd u rd
d t| jD | _d S d S )NarchitecturesOlmo2ForCausalLMOlmo3ForCausalLM)pad_token_idbos_token_ideos_token_idtie_word_embeddingsrope_scaling	rope_typedefault
rope_thetag     @c                 S   s$   g | ]}|d  d dkrdndqS )r      r   sliding_attentionfull_attention ).0ir   r   [/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/transformers_utils/configs/olmo3.py
<listcomp>P   s    z(Olmo3Config.__init__.<locals>.<listcomp>r   )removeappendsuper__init__
vocab_sizemax_position_embeddingshidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_heads
hidden_actinitializer_range	use_cachepoprope_parametersattention_biasattention_dropoutrms_norm_epssliding_windowlayer_typesrange)selfr(   r*   r+   r,   r-   r.   r/   r)   r0   r1   r   r   r   r   r3   r4   r5   r6   r7   r8   kwargsr   r   	__class__r   r"   r'      sR   
zOlmo3Config.__init__)r   r   r   r	   r	   Nr
   r   r   Tr   Nr   FNFr   r   r   N)__name__
__module____qualname__
model_typekeys_to_ignore_at_inferencer'   __classcell__r   r   r<   r"   r      s0    r   N) transformers.configuration_utilsr   r   r   r   r   r"   <module>   s   