o
    پi                     @   s  d dl Z d dlZd dlZd dlZd dlmZmZmZ d dlm	Z	 d dl
mZmZmZmZmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZmZ d d
lmZmZm Z m!Z!m"Z" d dl#m$Z$ e%e&Z'G dd deZ(G dd de)eZ*dede+fddZ,dede-fddZ.dede-fddZ/dede-fddZ0G dd dZ1ej2ej2ej3ej3ej4dZ5dedee)ej6f dej6fddZ7d>d!ee) d"e+fd#d$Z8g d%Z9ej:;  Z<re9=e< d!ee) fd&d'Z>d!ee) fd(d)Z?d!ee) fd*d+Z@d!ee) fd,d-ZAd!ee) fd.d/ZBd!ee) fd0d1ZCd!ee) fd2d3ZDd?d5eEd6eEdeEfd7d8ZFd!ee) fd9d:ZGd!ee) d;efd<d=ZHdS )@    N)EnumIntEnumauto)Path)AnyListOptionalSetUnion)PretrainedConfig)envs)QUANTIZATION_METHODS)
ServerArgs)is_hipis_sm100_supportedretry)
get_configget_context_lengthget_generation_configget_hf_text_configget_sparse_attention_config)is_in_cic                   @   s   e Zd Ze Ze ZdS )AttentionArchN)__name__
__module____qualname__r   MLAMHA r   r   S/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/configs/model_config.pyr   *   s    
r   c                   @   s   e Zd ZdZdZdZdZdS )	ModelImplr   sglangtransformers	mindsporeN)r   r   r   AUTOSGLANGTRANSFORMERS	MINDSPOREr   r   r   r   r    /   s
    r    configreturnc                 C   s(   | j d uo| j d dv ot| dd d uS )Nr   )DeepseekV3ForCausalLMDeepseekV32ForCausalLMDeepseekV3ForCausalLMNextNMistralLarge3ForCausalLMPixtralForConditionalGenerationGlmMoeDsaForCausalLM
index_topk)architecturesgetattrr(   r   r   r   is_deepseek_nsa6   s   

r4   c                 C      t | sJ | jS N)r4   index_head_dimr3   r   r   r   get_nsa_index_head_dimF      r8   c                 C   r5   r6   )r4   r0   r3   r   r   r   get_nsa_index_topkK   r9   r:   c                 C   r5   r6   )r4   index_n_headsr3   r   r   r   get_nsa_index_n_headsP   r9   r<   c                '   @   s  e Zd Zddddddddddejddddddfdeded	ee d
ee dedee dee dedee dee dede	eef dededededededdf&ddZ
e			dMdedededefddZd d! Zd"d# Zd
efd$d%Zd&d' Zdefd(d)Zdefd*d+Zdefd,d-Zdefd.d/Zdefd0d1Zd2d3 Zd4d5 Zd6edee fd7d8Zdefd9d:Zdefd;d<Zdee fd=d>Zd?d@ ZdNdAdBZdNdCdDZ dEdF Z!dee"e  fdGdHZ#deee$f fdIdJZ%dNdKdLZ&dS )OModelConfigTNz{}r   Fopenai
model_pathtrust_remote_coderevisioncontext_lengthmodel_override_argsis_embeddingenable_multimodaldtypequantizationoverride_config_fileis_draft_model
model_implsampling_defaultsquantize_and_serveis_multi_layer_eagleencoder_onlylanguage_onlydisable_hybrid_swa_memoryr)   c                 C   sd  || _ || _|	| _|| _|| _|| _|| _|| _|| _| 	  | 
  t|| _i }|
r7|
 r7|
 |d< t| j f||| jd|| _t| j| _t| j f||d|| _|d u ryg d}| jjd |v rwd}td| jj d nd	}|   t| jd
d | _|  | _t| jj|| _|ot | jj| _!|ot"| jj| _#|ot$| jj| _%|ot&| jj| _&|ot'| jd| _(|ot'| jd| _)|ot*| jj| _*t+| jj| _,t-| jj| _-t.| j|| _/| 0| | 1  | 2  | 3  | 4  | 5  | 6 | _7t| jdd pt| jdd | _8|| j_9|| j_:t| jdd | _;| j;p.t| jdd| _<d S )N_configuration_file)r@   rA   rC   )r@   rA   )Gemma3ForConditionalGenerationLlama4ForConditionalGenerationStep3VLForConditionalGenerationr   FzMultimodal is disabled for z(. To enable it, set --enable-multimodal.Tattention_chunk_sizevision_configaudio_configimage_token_idimage_token_indexmatryoshka_dimensionsis_matryoshka)=r?   rA   rG   rI   rJ   rK   rL   rM   rP   #_validate_quantize_and_serve_config'_maybe_pull_model_tokenizer_from_remotejsonloadsrC   stripr   	hf_configr   hf_text_configr   hf_generation_configr1   loggerinfo
model_type_config_draft_modelr2   rU   _get_sliding_window_sizesliding_window_sizeis_generation_modelis_generationis_multimodal_modelis_multimodalis_multimodal_gen_modelis_multimodal_genis_image_gen_modelis_image_genis_audio_modelhasattris_image_understandable_modelis_audio_understandable_model'is_multimodal_chunked_prefill_supportedis_encoder_decoder_modelis_encoder_decoderis_local_attention_model_get_and_verify_dtyperF   _derive_context_length_derive_model_shapes_derive_hybrid_model_verify_quantization_verify_transformers_version#_verify_dual_chunk_attention_config_get_hf_eos_token_idhf_eos_token_idrX   rN   rO   rZ   r[   )selfr?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   kwargsmm_disabled_modelsr   r   r   __init__V   s   





zModelConfig.__init__server_argsmodel_revisionc                 K   s<  |r| j n| j}|r| jn| j}tdi d|p| jd| jd|p`| jd| jd| j	d| j
d| jd| jd	|d
| jd| jd| jd|d| jd| jd| jd|d| j|S d| jd| j	d| j
d| jd| jd	|d
| jd| jd| jd|d| jd| jd| jd|d| j|S )Nr?   r@   rA   rB   rC   rD   rE   rF   rG   rJ   rK   rL   rH   rM   rO   rN   rI   rP   r   )$speculative_draft_model_quantizationrG   decrypted_draft_config_filedecrypted_config_filer=   r?   r@   rA   rB   json_model_override_argsrD   rE   rF   rJ   rK   rL   enable_multi_layer_eaglerO   rN   rP   )r   r?   r   rI   r   rG   rH   r   r   r   from_server_args   s   

	
	
zModelConfig.from_server_argsc                 C   s  | j }|r| jjd dv rd| jjd< |r#| jjd dv r#d| jjd< |r3| jjd dv r3d| jjd< |rI| jjd dkrId	| jjd< | jj| j_|rY| jjd d
krYd| jjd< |ri| jjd dkrid| jjd< |ry| jjd dkryd| jjd< |r| jjd dv rd| jjd< |r| jjd dkrd| jjd< |r| jjd dkrd| jjd< d| j_|r| jjd dv rd| jjd< d| j_|r| jjd dkrd| jjd< d| j_|r| jjd dkrd| jjd< d| j_d S d S d S )Nr   )r*   r/   r,   )Glm4MoeForCausalLMGlm4MoeLiteForCausalLMGlm4MoeForCausalLMNextN)GlmOcrForConditionalGeneration#GlmOcrForConditionalGenerationNextNLongcatFlashForCausalLMLongcatFlashForCausalLMNextNMiMoForCausalLMMiMoMTPMiMoV2FlashForCausalLM	MiMoV2MTPStep3p5ForCausalLM
Step3p5MTP)BailingMoeV2ForCausalLMBailingMoeForCausalLMBailingMoeV2_5ForCausalLMBailingMoeForCausalLMNextNErnie4_5_MoeForCausalLMErnie4_5_MoeForCausalLMMTPQwen3NextForCausalLMQwen3NextForCausalLMMTP   )Qwen3_5ForConditionalGeneration"Qwen3_5MoeForConditionalGenerationQwen3_5ForCausalLMMTPExaoneMoEForCausalLMExaoneMoEForCausalLMMTPNemotronHForCausalLMNemotronHForCausalLMMTP)rI   ra   r1   num_nextn_predict_layersnum_hidden_layers)r   rI   r   r   r   rg     sN   zModelConfig._config_draft_modelc                 C   sJ   t | jjo	| j | _| jrt| jj| j\| _| _| jjd dv | _	d S )Nr   )r   r   )
is_hybrid_swa_modelra   r1   rP   is_hybrid_swaget_hybrid_layer_idsrb   swa_attention_layer_idsfull_attention_layer_idsis_hybrid_swa_compressr   r   r   r   r}   S  s   
z ModelConfig._derive_hybrid_modelc                 C   s   | j }t| j}|d urM||krI|rdnd}d| d| d| d}tj s*t rBt| || _	|rA|| j_
td| d nt| d	|| _	n|| _	| j	| j_	d S )
NzTarget model'szUser-specifiedz	Warning: z context_length (z.) is greater than the derived context_length (z). This may lead to incorrect model outputs or CUDA errors. Note that the derived context_length may differ from max_position_embeddings in the model's config.z8Overriding the draft model's max_position_embeddings to .z^ To allow overriding this maximum, set the env var SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1)rI   r   rb   r   )SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LENgetr   rd   warningcontext_lenmax_position_embeddings
ValueErrorra   )r   rB   rI   derived_context_lenreasonmsgr   r   r   r{   g  s2   


z"ModelConfig._derive_context_lengthc                 C   s  t | jd| jj| jj | _t | jd| j| _t | jd| j| _t | jd| j| _d| jj	v swd| jj	v swd| jj	v swd| jj	v swd	| jj	v swd
| jj	v swd| jj	v swd| jj	v swd| jj	v swd| jj	v swd| jj	v swd| jj	v swd| jj	v rd| _t
j| _| jj| _| jj| _| jj| _| jj| _t| jrt| jnd | _dt| j| j  | _| jj}|r|dp|dpd}|dkr|dd}|d }t|t|}| j| | | _n/d| jj	v rd| _t
j| _| jj| _| jj| _nd| jj	v rt | jddrd| _t
j| _| jj| _| jj| _| jj| _nd| jj	v r>d| _t
j| _| jj| _| jj| _| jj| _| jj| _nd | jj	v rad!| _t
j| _| jj| _| jj| _| jj| _| jj| _nd"| jj	v sod#| jj	v r| jj| _t
j| _| jj| _| jj| _| jj| _| jj| _dt| j| j  | _| jjr| jjdd}| jjd }t|t|}| j| | | _nPd$| jj	v sd%| jj	v sd&| jj	v rt | dd d u r| jj| jj | _t| jdr| jjd u rt| jd| j nd'| jj	v r| jjd(k| _t
j| _| jj| _t | jd)d | _| jjd*v r(t | jj d+d | _| jd u r2| j| _| jj| _| jj!| _!| j!| _"d| jj	v rM| j!d, | _"d-| jj	v ret | jd.d}t#| j!t#| | _"t | jd/d | _$| jj%| _%d S )0Nhead_dim
v_head_dimswa_head_dimswa_v_head_dimDeepseekV2ForCausalLMr+   r*   r,   r   r/   r   r   DotsVLMForCausalLMr-   r.   MistralLarge3ForCausalLMEagleKimiK25ForConditionalGeneration   r   	rope_typetypedefaultmscale_all_dimFfactorMiniCPM3ForCausalLM   DeepseekVL2ForCausalLMuse_mlaTKimiVLForConditionalGenerationKimiLinearForCausalLMH   r   r   MistralModelMixtralForCausalLMMistralForCausalLMBaichuanForCausalLMi   num_key_value_heads)dbrxmpt
kv_n_heads   IQuestLoopCoderForCausalLMloop_numr   )&r2   rb   hidden_sizenum_attention_headsr   r   r   r   ra   r1   r   r   attention_archkv_lora_rankqk_nope_head_dimqk_rope_head_dimr4   r8   r7   mathsqrtscalingrope_scalingr   yarn_get_mscalefloatrs   setattr	use_alibir   r   rf   attn_configr   num_attention_layersintr   
vocab_size)r   r   r   r   scaling_factormscaler   r   r   r   r|     s  
























z ModelConfig._derive_model_shapesc                 C   s   | j S r6   )r   r   r   r   r   get_total_num_attention_heads)  s   z)ModelConfig.get_total_num_attention_headsc                 C   s   | j }td|| S )Nr   )r   max)r   tensor_parallel_sizetotal_num_attention_headsr   r   r   get_num_attention_heads,  s   z#ModelConfig.get_num_attention_headsc                    s  g d} j j|v ot j dd}|st jddrdS  j jdv r2d j jv r. j jd S  j jS  j jdv rBt j jd j jS  j jd	v rm fd
d j jD }t|dkr]tdt|dkrgt	dt
t|S g d}|D ]}t j|d}|dur|  S qs jjS )z%Returns the total number of KV heads.)falcon
RefinedWebRefinedWebModelnew_decoder_architectureFmulti_queryr   )r   r   )r   )znemotron-nasc                    s$   h | ]}|j js jj|j j qS r   )	attentionno_opra   r   n_heads_in_group).0blockr   r   r   	<setcomp>O  s    z5ModelConfig.get_total_num_kv_heads.<locals>.<setcomp>r   z%Couldn't determine number of kv headszCVariable GQA (VGQA) is not yet supported for nemotron-nas in sglang)	n_head_kvnum_kv_headsr   multi_query_group_numnum_attention_groupsN)ra   rf   r2   rb   r   r   block_configslenRuntimeErrorr   nextiter)r   falcon_model_typesnew_decoder_arch_falconnkvh
attributesattrr  r   r   r   get_total_num_kv_heads1  sH   
z"ModelConfig.get_total_num_kv_headsc                 C   s   |   }td|| S )z'Returns the number of KV heads per GPU.r   )r  r   r   r   total_num_kv_headsr   r   r   get_num_kv_headsp  s   zModelConfig.get_num_kv_headsc                 C   sT   t | jdr| jj}td|| S t | jdr%| jjd}td|| S | |S )z+Similar to get_num_kv_heads(), but for SWA.swa_num_key_value_headsr   attention_other_settingr  )rs   rb   r  r   r  r   r  r  r   r   r   get_swa_num_kv_headsy  s   
z ModelConfig.get_swa_num_kv_headsc              
      s\  t jdd }|d urt|ts| }|d ur*d|vr*d|i}|r*|| |d u r5t jdd }|d u r,tj	j
}|stj rTddlm}m} |  ndd l}ddlm}m} |  zTd}	|jjs|t fd	d
dddd}	|	s||W S tj r|j
djd}
n|j
dj|jjd}
t|
}t|}W d    n1 sw   Y  |}W |S  |jjy   tdj
 d Y |S  |jjy   t d Y |S  t!y } zt dj
| W Y d }~|S d }~ww tj	tj"j
dr,tj"j
d}
t|
}t|}W d    n	1 s"w   Y  |}|S )Nquantization_configquant_methodrG   compression_configr   )HubApimodel_file_download)HfApihf_hub_downloadFc                      s     jdS )Nhf_quant_config.json)file_existsr?   r   hf_apir   r   r   <lambda>  s    z4ModelConfig._parse_quant_hf_config.<locals>.<lambda>r         ?g      @)	max_retryinitial_delay	max_delayr  )model_id	file_pathrA   )repo_idfilenamerA   local_files_onlyz,hf_quant_config.json not found in cache for z0 (offline mode, normal for non-quantized models)z<Offline mode is enabled, skipping hf_quant_config.json checkz4Failed to load hf_quant_config.json for model %s: %s)#r2   ra   
isinstancedictto_dict_parse_modelopt_quant_configupdateospathexistsr?   r   SGLANG_USE_MODELSCOPEr   
modelscoper  r  huggingface_hubr  r  	constantsHF_HUB_OFFLINEr   rA   openr^   loaderrorsLocalEntryNotFoundErrorrd   debugOfflineModeIsEnabledr   	Exceptionjoin)r   	quant_cfg
parsed_cfgis_localr  r  r7  r  r  r   quant_config_filefquant_config_dicter   r!  r   _parse_quant_hf_config  s   






z"ModelConfig._parse_quant_hf_configc                 C   sV   t | jd}d }| r)t|}t|}W d    n1 s w   Y  d|d< |S )Nzquant_model_description.json	modelslimr  )r   r?   is_filer:  r^   r;  )r   rE  rB  rF  r   r   r   _find_quant_modelslim_config  s   
z(ModelConfig._find_quant_modelslim_configrG  c                 C   sX   |d }| dd}|dkrddiS |r d|v sd|v r dd	iS |r*d
|v r*ddiS dS )zKParse ModelOpt quantization config and return the appropriate quant_method.rG   
quant_algoNMIXED_PRECISIONr  w4afp8FP4NVFP4modelopt_fp4FP8modelopt_fp8r   )r   rG  json_quant_configsrM  r   r   r   r0    s   z(ModelConfig._parse_modelopt_quant_configc                 C   s6   t | jddst | jddrdS ddlm} || jS )z>Check if the model is already quantized based on config files.r  Nr  Tr   )has_hf_quant_config)r2   ra   sglang.srt.utilsrW  r?   )r   rW  r   r   r   _is_already_quantized   s   
z!ModelConfig._is_already_quantizedc                 C   sb   | j dkrdS | j dkrdS | j dkr/|  }|r-|dd }d|v r'dS d|v r-dS dS dS )	zBExtract ModelOpt quantization type from unified quantization flag.rT  fp8rR  nvfp4modeloptr   fp4)rG   rI  r   lower)r   rB  r  r   r   r   _get_modelopt_quant_type  s   


z$ModelConfig._get_modelopt_quant_typec                 C   s(   t | jdd }|d u rt | jdd }|S )Nri   sliding_window)r2   rb   )r   ri   r   r   r   rh   !  s   z$ModelConfig._get_sliding_window_sizec                 C   sB   | j sdS g d}| j|v }|stddt| dtd)z*Validate quantize_and_serve configuration.N)r\  rT  rR  zLquantize_and_serve requires ModelOpt quantization (set with --quantization {z, z})zquantize_and_serve functionality is currently disabled due to compatibility issues. Please use the separate quantize-then-deploy workflow instead. Step 1: Quantize and export model. Step 2: Deploy the exported model.)rL   rG   r   rA  sortedNotImplementedError)r   _MODELOPT_QUANTIZATION_METHODSmodelopt_quantization_specifiedr   r   r   r\   '  s   z/ModelConfig._validate_quantize_and_serve_configc                 C   s^  g t }g d}g d}dgdgdgddgddgd}| jd ur&| j | _g }|  }|  }|p3|}|d ur=|| dd |D }t|d	krNtd
|rT|d nd }	|	d ur|	d| jsbdn| j }
t 	 D ]\}}|
|	| j}|r|}
|| _ nql| jd u r|
| _nC| j|
kr| j|v o|
|| j v }|rtd| j d|
 d n"| jrtd|
 d| j d|
  |
| _ntd|
 d| j d|	dd dk| _ddlm} | js|jrtd | jd ur)| j|vrtd| j d| dt r| j|vrt| j d| j|vr+| jdkr t s-td| j d S d S d S d S ) N)awqgptqrZ  compressed_tensorscompressed-tensors
fbgemm_fp8w8a8_fp8petit_nvfp4quarkmxfp4z
auto-roundquark_int4fp8_moe)rZ  marlinrT  rR  gptq_marlin_24gptq_marlin
awq_marlinrj  rh  ri  experts_int8	w8a8_int8rk  	moe_wna16qoqrO  rl  rm  rJ  r\  ri  rh  )rT  rR  rl  ru  rk  c                 S   s   g | ]}|d ur|qS r6   r   )r   itemr   r   r   
<listcomp>~  s    z4ModelConfig._verify_quantization.<locals>.<listcomp>r   z;Config list contains configs from 2 methods, must be only 1r   r  r]  z"Using CLI-specified quantization (z3) which is compatible with HF config quant_method (z).zDraft model quantization (z() differs from main model quantization (z.). Using draft model's detected quantization: z3Quantization method specified in the model config (zS) does not match the quantization method specified in the `quantization` argument (	scale_fmtue8m0)deep_gemm_wrapperzuDeepGemm is enabled but the scale_fmt of checkpoint is not ue8m0. This might cause accuracy degradation on Blackwell.zUnknown quantization method: z. Must be one of r   z1 quantization is currently not supported in ROCm.rn  z^%s quantization is not fully optimized yet. The speed can be slower than non-quantized models.)r   rG   r_  rI  rL  appendr  r   r   itemsoverride_quantization_methodrd   re   rI   use_scale_ue8m0sglang.srt.layersr|  DEEPGEMM_SCALE_UE8M0r   r   r   )r   supported_quantizationrocm_supported_quantizationoptimized_quantization_methodscompatible_quantization_methodscfg_listra   modelslim_configquant_configrB  r  _methodquantization_overrideis_compatibler|  r   r   r   r~   E  s   








z ModelConfig._verify_quantizationc                 C   sN   t | jdr#t| j}|sd S || jjd< d| jjvr%d| jjd< d S d S d S )Ndual_chunk_attention_configsparse_attention_configsparse_attention_enabledT)rs   ra   r   r?   r  )r   sparse_attn_configr   r   r   r     s   

	z/ModelConfig._verify_dual_chunk_attention_configc           	      C   s   dd l }ddlm} t|dd }|d u rd S t| jdd }d| j v p/|d uo/t|dd dk}|}||}|d}||k rU|rStd	| d
| j d| jj	 dd S |sgt
d	| d| jj	 d d S d S )Nr   )version__version__rV   zglm-4.6vrf   glm4v_moe_visionz	5.0.0dev0zTransformers version z is not supported for model z or model type z*. Please upgrade transformers to >= 5.0.0.z is used for model type z. If you experience issues related to RoPE parameters, they may be due to incompatibilities between Transformers >=5.0.0 and some models. You can try downgrading to transformers==4.57.1 as a workaround.)r"   	packagingr  r2   ra   r?   r_  parser   rf   rd   r   )	r   r"   r  tf_version_strrV   is_glm_46vmoeneeds_tf_v5
tf_versionrequired_versionr   r   r   r     s4   

z(ModelConfig._verify_transformers_versionc                 C   sx   t | jdd }|d urt|tr|hnt|}|d u rt }| jr:t | jdd }|r:t|tr2|hnt|}||B }|S )Neos_token_id)r2   ra   r-  r   setrc   )r   eos_idsgeneration_eos_idsr   r   r   r     s    z ModelConfig._get_hf_eos_token_idc                    sD   | j dkri S | jdu ri S | j  g d} fdd|D }|S )aG  
        Get default sampling parameters from the model's generation config.

        This method returns non-default sampling parameters from the model's
        generation_config.json when sampling_defaults is set to "model".

        Returns:
            A dictionary containing the non-default sampling parameters.
        modelN)repetition_penaltytemperaturetop_ktop_pmin_pc                    s&   i | ]}  |d ur|  |qS r6   rU  )r   pr3   r   r   
<dictcomp>6  s     z;ModelConfig.get_default_sampling_params.<locals>.<dictcomp>)rK   rc   r/  )r   available_paramsdefault_sampling_paramsr   r3   r   get_default_sampling_params  s   




z'ModelConfig.get_default_sampling_paramsc                 C   sl   ddl m} ddlm} || jr2td || j}|| jr4|jdgd | j| _|	 | _dS dS dS )z
        Pull the model config files to a temporary
        directory in case of remote.

        Args:
            model: The model name or path.

        r   )create_remote_connector)is_remote_urlz$Pulling model configs from remote...z*config.json)allow_patternN)
sglang.srt.connectorr  rX  r  r?   rd   re   
pull_filesmodel_weightsget_local_dir)r   r  r  clientr   r   r   r]   <  s   	



z3ModelConfig._maybe_pull_model_tokenizer_from_remote)NNF)r)   N)'r   r   r   r    r$   strboolr   r   r
   r   staticmethodr   r   rg   r}   r{   r|   r   r   r  r  r  rI  rL  r.  r0  rY  r`  rh   r\   r~   r   r   r	   r   r   r  r]   r   r   r   r   r=   U   s    	


 'C" !?	_
 
# r=   )halffloat16r   float32bfloat16rF   c                 C   sn  t | tr| dd p| dd }| dd}nt| dd }t| dd}t |tr/t|d }|d u r6tj}t |tr|| }|dkrl|tjkri|	dre|dkrTd}n|d }t
d| d	 tj}n'tj}n#|}n |tvrwtd
| t| }nt |tjr|}ntd
| ||kr|tjkrt
d|| 	 |S |tjkrt
d|| 	 |S t
d|| |S )NrF   torch_dtyperf   r]  r   gemma   z
For Gemma zs, we downcast float32 to bfloat16 instead of float16 by default. Please specify `dtype` if you want to use float16.zUnknown dtype: zUpcasting %s to %s.zDowncasting %s to %s.zCasting %s to %s.)r-  r.  r   r2   r  _STR_DTYPE_TO_TORCH_DTYPEtorchr  r_  
startswithrd   re   r  r  r   rF   r   )r(   rF   config_dtyperf   gemma_versionr  r   r   r   rz   _  sP   







	
rz   Fmodel_architecturesrD   c                 C   s   d| v s@d| v s@d| v s@d| v s@d| v s@d| v s@d| v s@d| v s@d	| v s@d
| v s@d| v s@d| v s@d| v s@d| v s@d| v s@d| v rBdS | S )NLlamaEmbeddingModelr   LlamaForSequenceClassification0LlamaForSequenceClassificationWithNormal_WeightsInternLM2ForRewardModelQwen2ForRewardModelQwen3ForRewardModelQwen2ForSequenceClassificationQwen3ForSequenceClassification	CLIPModel	BertModel
ContrieverBertForSequenceClassificationXLMRobertaModel#XLMRobertaForSequenceClassificationGemma2ForSequenceClassificationFr   )r  rD   r   r   r   rj     s$   rj   )3r  r   &Ernie4_5_VLMoeForConditionalGenerationrR   Gemma3nForConditionalGenerationGlm4vForConditionalGeneration Glm4vMoeForConditionalGenerationr   GlmAsrForConditionalGenerationGrok1VForCausalLMGrok1AForCausalLMLlavaLlamaForCausalLMrS   LlavaMistralForCausalLMLlavaQwenForCausalLMLlavaForConditionalGenerationLlavaVidForCausalLM"LightOnOCRForConditionalGenerationMiniCPMOMiniCPMV Mistral3ForConditionalGenerationMultiModalityCausalLMMllamaForConditionalGenerationNemotronH_Nano_VL_V2r.   "Qwen2AudioForConditionalGenerationQwen2VLForConditionalGeneration"Qwen2_5_VLForConditionalGenerationQwen3VLForConditionalGeneration"Qwen3VLMoeForConditionalGenerationr   r   $Qwen3OmniMoeForConditionalGenerationr   InternVLChatModel InternS1ForConditionalGeneration#InternS1ProForConditionalGenerationPhi4MMForCausalLMrT   POINTSV15ChatModelr   DotsOCRForCausalLMSarashina2VisionForCausalLMNVILAForConditionalGeneration!NVILALiteForConditionalGenerationDeepseekOCRForCausalLMJetVLMForConditionalGeneration#PaddleOCRVLForConditionalGenerationMiDashengLMModelStepVLForConditionalGenerationr   c                    s   t  fddtD rdS dS )Nc                 3       | ]}| v V  qd S r6   r   r   multi_model_archr  r   r   	<genexpr>  s
    
z&is_multimodal_model.<locals>.<genexpr>TF)anymultimodal_model_archsr  r   r  r   rl     s
   rl   c                 C      dS NFr   r  r   r   r   rn        rn   c                 C   r   r  r   r  r   r   r   rp     r  rp   c                 C   r   r  r   r  r   r   r   rr     r  rr   c                 C      d| v S )Nr  r   r  r   r   r   rw        rw   c                 C   r  )NrS   r   r  r   r   r   ry     r  ry   c                    s&   g d t  fdd| D rdS dS )z=Check if chunked prefill is supported for a MultiModal model.)r  r  r  r  r  c                 3   r  r6   r   r  unsupportedr   r   r        z:is_multimodal_chunked_prefill_supported.<locals>.<genexpr>FTr  r  r   r  r   rv     s   rv   r   scaler   c                 C   s"   | dkrdS d| t |  d S )Nr   r$  g?)r   log)r	  r   r   r   r   r   "  s   r   c                    s   h d t  fdd| D S )N>   r   r   GptOssForCausalLMr   r   rS   c                 3   r  r6   r   )r   archhybrid_swa_archsr   r   r  2  r  z&is_hybrid_swa_model.<locals>.<genexpr>r  r  r   r  r   r   (  s   r   rb   c                    sH  |j d| v rdd tD }dd tD }||fS d| v r=t|dd }dd t|D }dd t|D }||fS d	| v rat|d
d   fddtD } fddtD }||fS d| v rndg}g }||fS d| v r|j}fddt|D }fddt|D }||fS d| v rdg}g }||fS d }d }||fS )NrS   c                 S   s    g | ]}|d  d dkr|qS r      r   r   r   ir   r   r   ry  ;      z(get_hybrid_layer_ids.<locals>.<listcomp>c                 S   s    g | ]}|d  d dkr|qS r  r   r  r   r   r   ry  >  r  r  layer_typesc                 S      g | ]
\}}|d kr|qS sliding_attentionr   r   r  xr   r   r   ry  C      c                 S   r  full_attentionr   r  r   r   r   ry  F  r  r   hybrid_layer_patternc                       g | ]
} | d kr|qS )r   r   r  r  r   r   ry  K  r  c                    r  )r   r   r  r  r   r   ry  N  r  r   r   r   c                    $   g | ]\}}|d kr| k r|qS r  r   r  r   r   r   ry  V  
    c                    r   r  r   r  r!  r   r   ry  [  r"  r   )r   ranger2   	enumerater  )r  rb   r   r   r  r   )r  r   r   r   5  s^   ( 



r   )F)r   r   )Ir^   loggingr   r2  enumr   r   r   pathlibr   typingr   r   r   r	   r
   r  r"   r   sglang.srt.environr   sglang.srt.layers.quantizationr   sglang.srt.server_argsr   rX  r   r   r   &sglang.srt.utils.hf_transformers_utilsr   r   r   r   r   sglang.utilsr   	getLoggerr   rd   r   r  r    r  r4   r   r8   r:   r<   r=   r  r  r  r  rF   rz   rj   r  SGLANG_EXTERNAL_MM_MODEL_ARCHr   external_mm_model_archr}  rl   rn   rp   rr   rw   ry   rv   r   r   r   r   r   r   r   r   <module>   s|   
        	

@6

