o
    پi                    @   s>  d Z ddlZddlZddlZddlZddlZddlZddlZddlZddl	Z	ddl
mZ ddlmZ ddlmZmZmZmZmZ ddlZddlmZ ddlmZ ddlmZmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z( dd	l)m*Z*m+Z+ dd
l,m-Z-m.Z.m/Z/ ddl0m1Z1 ddl2m3Z3 ddl4m5Z5 ddl6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z> ddl?m@Z@ ddlAmBZB ddlCmDZD ddlEmFZF ddlGmHZH ddlImJZJmKZKmLZLmMZM ddlNmOZOmPZPmQZQmRZR ddlSmTZT ddlUmVZV ddlWmXZX ddlYmZZZm[Z[ ddl\m]Z] ddl^m_Z_ ddl`maZambZbmcZcmdZdmeZe ddlfmgZg ddlhmiZimjZjmkZk ddllmmZm dd lnmoZo dd!lpmqZq dd"lrmsZs dd#ltmuZu dd$lvmwZw dd%lxmyZy dd&lzm{Z{ dd'l|m}Z} dd(l~mZ dd)lmZ dd*lmZmZ dd+lmZmZmZmZ dd,lmZ dd-lmZ dd.lmZ dd/lmZ dd0lmZmZ dd1lmZmZmZ dd2lmZ dd3lmZ dd4lmZ dd5lmZmZmZ dd6lmZ dd7lmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZ dd8lmZ dd9lmZmZmZ dd:lmZmZ dd;lmZ dd<lmZ dd=lmZmZ e Ze Ze Ze Zeːr8dd>lmZ eσ  g d?Zg d@ZejdAejdAejdBejdCiZdDdE ZdFdG ZdHZeeۡZdIejdJejfdKdLZG dMdN dNej߃ZeG dOdP dPZG dQdR dReZdSeeeejf  fdTdUZdVdW ZeG dXdY dYZdS )Z2ModelRunner runs the forward passes of the models.    N)defaultdict)	dataclass)CallableListOptionalTupleUnion)nn)BailingHybridConfigFalconH1ConfigGraniteMoeHybridConfigJetNemotronConfigJetVLMConfigKimiLinearConfig
Lfm2ConfigLfm2MoeConfigNemotronH_Nano_VL_V2_ConfigNemotronHConfigQwen3_5ConfigQwen3_5MoeConfigQwen3NextConfig)DeviceConfig)
LoadConfig
LoadFormat)AttentionArchModelConfig	ModelImpl)#adjust_config_with_unaligned_cpu_tp)GPU_MEMORY_TYPE_WEIGHTS)register_forward_hook_for_model)get_pp_groupget_tp_groupget_world_groupinit_distributed_environmentinitialize_model_parallelset_custom_all_reduceset_mscclpp_all_reduceset_torch_symm_mem_all_reduce)use_symmetric_memory) monkey_patch_vllm_parallel_state)ElasticEPStateManager)envs)EPLBManager)ExpertDistributionMetricsExpertDistributionRecorder'get_global_expert_distribution_recorder'set_global_expert_distribution_recorder)ExpertLocationMetadata(compute_initial_expert_location_metadata#get_global_expert_location_metadata#set_global_expert_location_metadata)ExpertLocationUpdater)NPUGraphRunner)deep_gemm_wrapper)ATTENTION_BACKENDSattn_backend_wrapper)is_nsa_enable_prefill_cp)TboAttnBackend)DpPaddingModeget_attention_tp_groupinitialize_dp_attentionset_dp_buffer_lenset_is_extend_in_batch)LogitsProcessorOutput)RoutedExpertsCapturerget_global_experts_capturerset_global_experts_capturer)get_moe_a2a_backend)EmbeddingPoolerOutput)	fp8_dtype)create_sampler)apply_torchao_config_to_model)LoRAManager)LoRARef)sanity_check_mm_pad_shift_value)BaseTokenToKVPoolAllocator)ReqToTokenPool)CPUGraphRunner)CudaGraphRunnerset_torch_compile_config)CaptureHiddenModeForwardBatchForwardModePPProxyTensors)register_forward_hooks)GraphInputBuffers)ModelRunnerKVCacheMixin)PiecewiseCudaGraphRunner)DefaultModelLoaderget_model_loader)!RemoteInstanceWeightLoaderBackendregister_memory_region;trigger_init_weights_send_group_for_remote_instance_request)set_default_torch_dtype)default_weight_loader)SamplingBatchInfo)
ServerArgsget_global_server_args$set_global_server_args_for_scheduler)SpeculativeAlgorithm)MultiprocessingSerializercpu_has_amx_supportdynamic_importempty_contextenable_show_time_costget_available_gpu_memoryget_cpu_ids_by_nodeget_local_ip_autoinit_custom_process_groupis_hipis_host_cpu_arm64is_npulog_info_on_rank0monkey_patch_p2p_access_checkrequire_attn_tp_gatherrequire_gathered_bufferrequire_mlp_tp_gather%reserve_rope_cache_for_long_sequencesset_cuda_archslow_rank_detector)PytHooks)!create_offloader_from_server_argsget_offloaderset_offloader)monkey_patch_torch_reductionsregister_sgl_tp_rank)TorchMemorySaverAdapter)WeightChecker)FlattenedTensorBucketFlattenedTensorMetadata)init_npu_backend)
aiter
flashinferfa3fa4tritonflashmlacutlass_mla
trtllm_mlaascendnsa)r   r   r   r   r   r   fp8_e4m3fp8_e5m2bf16c                 C   ,   | t vrt |  td|  d d S d S )NAdded z to MLA_ATTENTION_BACKENDS.)MLA_ATTENTION_BACKENDSappendloggerinfobackend_name r   Z/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/model_executor/model_runner.pyadd_mla_attention_backend   s   
r   c                 C   r   )Nr   z6 to CHUNKED_PREFIX_CACHE_SUPPORTED_ATTENTION_BACKENDS.)1CHUNKED_PREFIX_CACHE_SUPPORTED_ATTENTION_BACKENDSr   r   r   r   r   r   r   *add_chunked_prefix_cache_attention_backend   s   

r   i  modelreturnc                 C   s   | j j}|dkr| jjS | jS )N$Qwen3OmniMoeForConditionalGeneration)	__class____name__thinkerr   )r   model_cls_namer   r   r   resolve_language_model   s   r   c                       s(   e Zd ZdZ fddZdd Z  ZS )RankZeroFilterz_Filter that only allows INFO level logs from rank 0, but allows all other levels from any rank.c                    s   t    || _d S N)super__init__is_rank_zero)selfr   r   r   r   r     s   

zRankZeroFilter.__init__c                 C   s   |j tjkr	| jS dS )NT)levelnologgingINFOr   )r   recordr   r   r   filter	  s   zRankZeroFilter.filter)r   
__module____qualname____doc__r   r   __classcell__r   r   r   r   r     s    r   c                   @   s6   e Zd ZU eeef ed< eed< dZe	e
 ed< dS )ModelRunnerOutputlogits_outputcan_run_graphNexpert_distribution_metrics)r   r   r   r	   rB   rV   __annotations__boolr   r   r.   r   r   r   r   r     s   
 r   c                &   @   s  e Zd ZdZ							ddededededed	ed
edededededee dee dee de	dee
 dee dee f$ddZdd ZdefddZdd Zdd  Zd!d" Zd#d$ Zd%d& Zd'd( Zd)d* Zd+ed,ee fd-d.Z		dd/ed0ed1eeege	f  d2e	d3ee	ef f
d4d5Z	6dd7d8Zd9d: Z	6dd;d<Zd=d> Z 	dd0ee fd?d@Z!dAdB Z"	ddCee#ee$e%j&dDf f  d0ee fdEdFZ'dGdH Z(	IddJedKed3ee%j& fdLdMZ)dNdO Z*dPe+fdQdRZ,	ddPe+fdSdTZ-dPe+fdUdVZ.e/dWdX Z0e/dYdZ Z1e/d[d\ Z2e/d]d^ Z3e/d_d` Z4e/dadb Z5e/dcdd Z6dedf Z7dgdh Z8didj Z9dkdl Z:ddme	fdndoZ;	ddpedme	fdqdrZ<dsdt Z=dudv Z>d3e	fdwdxZ?dydz Z@dd{efd|d}ZAd~d ZBdd ZCdd ZDdd ZEdefddZF		ddeGde	d3e$eHeIf fddZJ		ddeGde	d3e#e$eHeIeKf e	f fddZL	ddeGd3e$eHeIf fddZM		ddeGde	ded3eHfddZN				ddeGde	deeI de	ded3eOfddZP		ddeGde	deeI de	ded3eOfddZQdeHdeRfddZSdeHdeGd3e%j&fddZTdeHdeGd3dfddZUe/d3e	fddZVdefddZW	ddedee dee fddZXdefddZYdd ZZdd Z[dS )ModelRunnerr   NFmodel_configmem_fraction_staticgpu_idtp_ranktp_sizemoe_ep_rankmoe_ep_sizepp_rankpp_size	nccl_portserver_argsdp_rankattn_cp_rankmoe_dp_rankis_draft_workerreq_to_token_pooltoken_to_kv_pool_allocatordraft_model_idxc                 C   s  || _ |j| _|| _|| _|| _|| _|| _|jr|jnd| _|| _	|	| _
|| _|j| _|| _|j| _|| _|
| _|| _|| _|j| _|j| _|j| _t|j| _|j| _|| _|| _|j| _|j| _| jjt j!k| _"|j#| _#d| _$d| _%|| _&d | _'d| _(d | _)d| _*| j+ r| jst,j-||j.|j/dd}d| _*zt0|j1dd }|2dd| _*|d	 | _3W n   d | _3Y |j4rt5  | 6  t7| t8 }| j"|_"| jd
kr| 9  | :  | ; }t<=| j> | _?t@tA||d tB| d| _CtDjE2 rtFG  | H  tIjJrtIK|| | L| | M  | jrtN| jjO dtPQ| jRjSjTv | _U| j
dkr8| jUs8J di | _Vi | _Wd S )N   r   F T)
model_pathmodel_revisionis_draft_modeleagle_configuse_aux_hidden_state eagle_aux_hidden_state_layer_idscpu)r   )model_runnerpp_proxy_tensorsz4Pipeline Parallel is not compatible with this model.)Xr   devicer   r   r   r   r   enable_dp_attentiondp_sizer   r   r   attn_cp_sizer   moe_dp_sizer   	dist_portr   r   is_generationis_multimodal'is_multimodal_chunked_prefill_supportedrf   from_stringspeculative_algorithmspec_algorithm	page_sizer   r   is_hybrid_swais_hybrid_swa_compressattention_archr   MLAuse_mla_backendattention_chunk_sizeforward_pass_idinit_new_workspacer   remote_instance_transfer_engine*remote_instance_transfer_engine_session_id+remote_instance_transfer_engine_weight_infoeagle_use_aux_hidden_state	is_eagle3r   from_server_argsspeculative_draft_model_path speculative_draft_model_revisiongetattr	hf_configgetr   show_time_costrk   model_specific_adjustmentre   rd   init_threads_binding$init_shared_mooncake_transfer_engineinit_torch_distributedtorchget_device_moduleStreamforward_streamr~   r|   r   _weight_checkerr,   SGLANG_DETECT_SLOW_RANKrz   executeinit_mindspore_runnerr8   ENABLE_JIT_DEEPGEMMupdate_deep_gemm_config
initialize!check_quantized_moe_compatibilityrM   
vocab_sizeinspect	signaturer   forward
parameters
support_pp_model_update_group_weights_send_group)r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   draft_model_configr   global_server_argsmin_per_gpu_memoryr   r   r   r     s   




zModelRunner.__init__c                 C   s\   | j j tjkr*tr,ddlm} || j| j	 | j| j
 | j | j| j | jd d S d S d S )Nr   )init_ms_distributed)
world_sizerank
local_rankr   port)r   
model_impllowerr   	MINDSPORE_is_npu*sglang.srt.model_executor.mindspore_runnerr  r   r   r   r   r   r   )r   r  r   r   r   r    s   

z!ModelRunner.init_mindspore_runnerr  c                    s   j }tj j jd _ j  r    jsBtt	| j
 jd  jdkr6tj r6tdt   ttj|t  jd  j jrM jsMt nd  _t  _ j jr^t j nd  t  _     j  r j!d ur j"d u rt# j$ j! _" j
j%d u} jr|r j
j%nt& j
j' j
j(} j
j)j*d dkrd}n j
j)j*d dkrd}t+ j$d	d _,t+ j$d
| _- j- j,  _.t+ j
j)dd}|dkr׈ j.|  _.|r j/0 s j/0 s j.|ksJ d fddt1 j, j-d D } fddt1 j, j-d D }| j
_2| j
_3t+ j$dd}|s&t4 j$t5 j6 t+ j$dd}	 j7dkr:|	r: 8  |j9rB :  |j;rV|j<d u rPt=d >|j< |j?rcddl@mA}
 |
   B   C| tD|jEd u rx jFd n|jE jG  jHjI _E J   jKdks jKdkr L   M   N   O  n jKdv r M   O  n
d  _Pd _Q M  |jRrtS j$|jR  jTrӈ j$U jV  W   X  d S )N)enable)r   r   r   r   z"Initial expert_location_metadata: )r  	MiMoV2MTPr   
Step3p5MTPstart_layer	end_layerloop_numz%PP is not compatible with MTP models.c                    (   g | ]}t  jd r| jjv r|qS )full_attention_layer_ids)hasattrr   r)  .0	layer_idxr   r   r   
<listcomp>      
z*ModelRunner.initialize.<locals>.<listcomp>c                    r(  )swa_attention_layer_ids)r*  r   r1  r+  r.  r   r   r/  !  r0  torchao_appliedFsupports_torch_tpzGPlease specify the heavy channel type for double sparsity optimization.)enable_batch_invariant_mode   cudamusa)npur   )Yr   r   createenable_memory_savermemory_saver_adapter1remote_instance_weight_loader_use_transfer_engine$remote_instance_init_transfer_enginer   r5   r3   r   r   r   r,   #SGLANG_LOG_EXPERT_LOCATION_METADATAr   r   r   r4   r1   r/   init_newenable_eplbr-   eplb_managerr6   expert_location_updaterelastic_ep_backendr+   initrI   sampler
load_modelr   r   r^   r   num_nextn_predict_layersmaxnum_hidden_layersnum_attention_layersr   architecturesr   r%  r&  num_effective_layersr   is_noneranger1  r)  rJ   rd   torchao_configr   apply_torch_tpenable_lorainit_lora_managerenable_double_sparsityds_heavy_channel_type
ValueError#init_double_sparsity_channel_configenable_deterministic_inferencesglang.srt.batch_invariant_opsr4  configure_kv_cache_dtypeinit_memory_poolminmax_running_requestsmax_total_num_tokensr   r   sizeinit_routed_experts_capturerr   init_cublasinit_attention_backendkernel_warmupinit_device_graphsgraph_runnergraph_mem_usageforward_hooksrW   r   set_eagle3_layers_to_capturer   init_piecewise_cuda_graphsprealloc_symmetric_memory_pool)r   r  r   model_has_mtp_layersmodel_num_layersr'  r)  r1  r2  r3  r4  r   r.  r   r    s  














zModelRunner.initializec              	   C   sR   | j jst| jdr| jj}nd}ttjt j	| j
|| j| j | j| jd d S )Nnum_fused_shared_expertsr   )r"  r   rl  
num_tokensr\  r   )r   disable_shared_experts_fusionr*  r   rl  rE   rC   r9  rd   enable_return_routed_expertsr   r]  r   r\  r   )r   rl  r   r   r   r_  w  s   


z(ModelRunner.init_routed_experts_capturerc              
   C   s   zddl m} W n ty  } ztd W Y d }~d S d }~ww | | _t }| j|ddtj	
  | d| j  | _d S )Nr   )TransferEnginezWPlease install mooncake for using remote instance transfer engine: pip install mooncakeP2PHANDSHAKErdma:)mooncake.enginerp  ImportErrorr   warningr   rn   r  r,   MOONCAKE_DEVICEr   get_rpc_portr   )r   rp  elocal_ipr   r   r   r=    s    z0ModelRunner.remote_instance_init_transfer_enginec                 C   sx   | j }|jrtd d|_d|_| jr%| js%d|_td| j	j
j  | jr-|jtvr0d|_|js:ttd d S d S )NzQDouble sparsity optimization is turned on. Use triton backend without CUDA graph.r   TzIAutomatically turn off --chunked-prefill-size as it is not supported for z"Chunked prefix cache is turned on.)r   rS  r   r   attention_backenddisable_cuda_graphr   r   chunked_prefill_sizer   r   
model_typer   r   disable_chunked_prefix_cachers   )r   r   r   r   r   r     s0   z%ModelRunner.model_specific_adjustmentc                 C   s   t | jjdd  }d ury|dd  }d ur{|d }| j| j dkr.td| j d| j | j| j }t | jjdd }|d u rBd S || dkr[td| d| d	| j d
| j d	|| | dkr}td|d|d|d| j d| j dd S d S d S )Nquantization_configweight_block_sizer   ztp_size z must be divisible by ep_size moe_intermediate_sizezmoe_intermediate_size z# must be divisible by moe_tp_size (z) which is tp_size (z) divided by moe_ep_size (z).zBFor quantized MoE models, please make sure (moe_intermediate_size=z / moe_tp_size=z) % weight_block_size_n=z- == 0 where moe_tp_size is equal to tp_size (z) divided by ep_size (zE). You can fix this by setting arguments `--tp` and `--ep` correctly.)r   r   r   r   r   r   rU  hf_text_config)r   r  r  weight_block_size_nmoe_tp_sizer  r   r   r   r    sD   

 z-ModelRunner.check_quantized_moe_compatibilityc                    s\  t  }td zt j j W n$ t	y9   t
d jd jdtjdd jd j
  w  jdkri jjd	krfd	} jjre jjd
}zddlm} || W n0   Y n+n*d}n' jdkrqd}n jdkryd}n jdkrd}n jdkrd}n jdkrd}t j j} jjst  tj }|r|}n jjrd jj }nd j }t jj  t  jj! t" jj#  j$sD jdkrt%st&rtj'j() j* t+ jtjd< tj'j(, j j tj-.d fdd}nt
d t/| j j0  j j1  j  j| jj2d t3 j j4 j0 j5 j6 j7 jj8d t9 j j:d  t; rDt< j t j jt= j>d!kt= j?d"}	t@  _AtB  _CtD  _Et j j}
 jd!kr j$s|	|
d# k rd$}|d%|	d&|
d'|
d# 7 }tjF rtG|t
| td(t  | d)d*||
 d)d+ |	S ),NzInit torch distributed begin.zContext: self.device=z self.gpu_id=z( os.environ.get('CUDA_VISIBLE_DEVICES')=CUDA_VISIBLE_DEVICESz self.tp_rank=z self.tp_size=r6  mooncake,r   )epncclxpuxcclhpuhcclr   gloor8  r7  mccltcp://ztcp://127.0.0.1:
LOCAL_SIZEzsgl_kernel::shm_allgatherc                    s   t j| g j |dS )N)dim)r  catr   )datar  r.  r   r   _  s   z-ModelRunner.init_torch_distributed.<locals>._zrinit_cpu_threads_env and shared memory based AllReduce is disabled, only intel amx backend and arm64 are supported)backendr  r  r  distributed_init_methodtimeout)tensor_model_parallel_sizeattention_data_parallel_sizepipeline_model_parallel_sizeexpert_model_parallel_size%attention_context_model_parallel_sizemoe_data_model_parallel_sizeduplicate_tp_group)r   r   r   )distributed	cpu_groupg?zQThe memory capacity is unbalanced. Some GPUs may be occupied by other processes. zmin_per_gpu_memory=z, local_gpu_memory=z, local_gpu_memory * 0.9=z%Init torch distributed ends. elapsed=.2fz s, mem usage= GB)Htimeperf_counterr   r   r  r  r   
set_devicer   	Exceptionrv  osenvironr   r   r   r   rC  mooncake_ib_devicesplitr  r  set_device_filterrl   enable_p2p_checkrt   r,   'SGLANG_DISTRIBUTED_INIT_METHOD_OVERRIDEdist_init_addrr   r&   disable_custom_all_reducer'   enable_mscclppr(   enable_torch_symm_memr   _is_cpu_amx_available_is_cpu_arm64ops
sgl_kernelinit_cpu_threads_envlocal_omp_cpuidstrr  libraryregister_faker$   r   r   dist_timeoutr%   r   r   r   r   enable_pdmuxr?   r   rr   r   r#   r  r  r"   tp_groupr!   pp_groupr>   attention_tp_group'SGLANG_ENABLE_TP_MEMORY_INBALANCE_CHECKRuntimeError)r   ticr  r  mooncake_epbefore_avail_memorydist_init_method_overridedist_init_methodr  r  local_gpu_memorymsgr   r.  r   r     s   
0
	








	


z"ModelRunner.init_torch_distributedc                 C   s   | j jdko| j jdkp)| j jo| j jdkp)| j jo| j jdkp)| j jo)| j jdk}|rCddlm	} |t
 | j| j jp>| j jd dS dS )z
        Need MooncakeTransferEngine when:
        1) PD disaggregation uses mooncake for KV transfer (prefill/decode)
        2) HiCache uses mooncake storage backend
        3) Encoder disaggregation uses mooncake
        nullr  r   )init_mooncake_transfer_engine)hostnamer   	ib_deviceN)r   disaggregation_modedisaggregation_transfer_backendenable_hierarchical_cachehicache_storage_backendencoder_onlyencoder_transfer_backendlanguage_onlyDsglang.srt.distributed.device_communicators.mooncake_transfer_enginer  rn   r   disaggregation_ib_devicer  )r   use_mooncake_ter  r   r   r   r   W  s*   	

	


z0ModelRunner.init_shared_mooncake_transfer_enginec           	      C   sR  t  }t| j| j}tdt| j| jdd | jdkr$td | jdkrMtj	
 d dk rMtd	 d
| j_tj| j_tj	
 d dk rMtdt  ddlm} || jj| jj| jj| jj| jjd}t| jj| jj| jj| j| jj| jj| jj| jj | j!|| jj"| j#d| _$| jdkrt%| j| j$| j&| _| jjt'j(kr| jj t)j*kr| jdkrt+,t+- }t.j/t0| jj| jj| jj|fd}|1  t2  | jj3p| j4o| jj5}| j6j7t8|d, t9| j$| jd| _:| j:j;| jt<| j| jd| _=t>| j:dr| j:j?| _?W d    n	1 sw   Y  t2dd t@ A  | jjBr2tC | _D| jDjE| j=dd | jjFdkrg| jjGd urbtHtI| j=dd r[| j=J| jjG td| jjG ntd| j=jKtLd d | _Mt>| j=drx| j=N | _Mn'| jjOr| jjMd ur| jjM| _Mn| jjPd ur| jjP| _Mtd| jM  | jj| _t| j| j}|| | _Qtd t  | dd!tR| j=jS d"| j d#|dd$| jQdd% | jjTd urtU| j=| jjT| jjV| j&| j| jW tX| j=| j| jt | jjYd&krtZj[t\ j]d' d S ztZj^t\ j]t_j`tad(dd) W d S  ty(   tbd*| j d+d w ),NzLoad weight begin. avail mem=r  r  r   r   r6  r      zKCompute capability below sm80. Use float16 due to lack of bfloat16 support.float16   z$SGLang only supports sm75 and above.)ModelOptConfig)quantcheckpoint_restore_pathcheckpoint_save_pathexport_pathquantize_and_serve)load_formatdownload_dirmodel_loader_extra_configr   .remote_instance_weight_loader_seed_instance_ip8remote_instance_weight_loader_seed_instance_service_port6remote_instance_weight_loader_send_weights_group_ports%remote_instance_weight_loader_backend-remote_instance_weight_loader_transfer_enginemodelopt_configrl_quant_profiler   )targetargs)enable_cpu_backup)load_configr   )r   device_configr   T)reverser   )module_prefixr   load_kv_cache_scalesz'Loaded KV cache scaling factors from %szfUsing FP8 KV cache and scaling factors provided but model %s does not support loading scaling factors.zUsing FP8 KV cache but no scaling factors provided. Defaulting to scaling factors of 1.0. This may lead to less accurate results!!get_attention_sliding_window_sizez8Setting sliding_window_size to be attention_chunk_size: zLoad weight end. elapsed=z	 s, type=z, dtype=z, avail mem=z GB, mem usage= GB.r  group)seconds)r  r  wait_all_rankszTP rank z could finish the model loading, but there are other ranks that didn't finish loading. It is likely due to unexpected failures (e.g., OOM) or a slow node.)cr  r  rl   r   r   r   r   r  set_num_threadsr6  get_device_capabilityr   dtyper  r   r  ry   "sglang.srt.configs.modelopt_configr  modelopt_quant modelopt_checkpoint_restore_pathmodelopt_checkpoint_save_pathmodelopt_export_pathr  r   r  r  r  r   r  r  r  r  r   r  r   r  r   r   r   REMOTE_INSTANCEr]   NCCLsocketgethostbynamegethostname	threadingThreadr_   startr*   enable_weights_cpu_backupr   enable_draft_weights_cpu_backupr;  regionr   r\   loaderrF  r   r   r*  r   r}   	post_initenable_layerwise_nvtx_markerr{   	pyt_hooksregister_hookskv_cache_dtypequantization_param_pathcallabler   r  r   rv  sliding_window_sizer  r   r   weight_load_mem_usagetyper   debug_tensor_dump_output_folderr    debug_tensor_dump_layersr   rx   rC  distbarrierr"   r  monitored_barrierdatetime	timedelta"UNBALANCED_MODEL_LOADING_TIMEOUT_SrU  )	r   	tic_totalr  r  r  instance_iptr  after_avail_memoryr   r   r   rF    sB  





	








zModelRunner.load_modelnew_expert_location_metadataupdate_layer_idsc                 C   sn   t  d ur%t }|d usJ |j||d | | jj| jjdd  d S | jj| j	j
||| jj| jd d S )N)r&  c                 S   s   d| v od| vS )Nzmlp.expertszmlp.shared_expertsr   )namer   r   r   <lambda>L  s    z4ModelRunner.update_expert_location.<locals>.<lambda>)r&  nnodesr  )r+   instancer4   updateupdate_weights_from_diskr   r   r  rB  r   routed_experts_weights_of_layerr)  r   )r   r%  r&  old_expert_location_metadatar   r   r   update_expert_location<  s&   
z"ModelRunner.update_expert_locationr   r  weight_name_filterrecapture_cuda_graphr   c                    s  t dtjjdd tj|j_t|d}t	|j t
 ts3d  d}d|fS  fdd	} fd
d}tjjp z|j}	W n# tys }
 zd|
 d}d|fW  Y d}
~
W  d   S d}
~
ww z|j|	}W n4 ty }
 z(d|
 d}~	t  |j}	|j|	_d|fW  Y d}
~
W  d   S d}
~
ww W d   n1 sw   Y  |_|j_|j_|_|rވjdksڈjdkrވ  t d dS )z-Update engine weights in-place from the disk.z8Update engine weights online from disk begin. avail mem=r  r  )r  zFailed to get model loader: .Fc                    s4     tj| j}d urfdd|D }|S )Nc                 3   s$    | ]\}} |r||fV  qd S r   r   )r,  r'  weight)r0  r   r   	<genexpr>s  s    
zPModelRunner.update_weights_from_disk.<locals>.get_weight_iter.<locals>.<genexpr>)_get_weights_iteratorr[   Sourcer?  r   )configiter)r  r   r0  r   r   get_weight_itern  s   
z=ModelRunner.update_weights_from_disk.<locals>.get_weight_iterc                    s     | | | S r   )load_weights_and_postprocess)r   r8  )r  target_devicer   r   model_load_weightsy  s   z@ModelRunner.update_weights_from_disk.<locals>.model_load_weightsz Failed to get weights iterator: NzFailed to update weights: z#.
Rolling back to original weights.r6  r7  zUpdate weights end.)Tz"Succeeded to update model weights.)r   r   rl   r   r   r  r   r   r   r\   
isinstancer[   r`   r  r  r   gccollectr   r  r  rc  )r   r   r  r0  r1  r  messager9  r<  r8  ry  r   r   )r  r   r;  r0  r   r,  W  s\   




z$ModelRunner.update_weights_from_diskr  c                 C   sp  t j s	J d|dksJ d|d}t|| jks+J d| j dt| d|| j }| d| d| j }td	| j d
| j	 d| d| d| d| d| d|  t j
  d}	d}
z.t|d| d| |||t d| j	d| j|< tj| j| d d}	d| d| d}
W n ty } zd| d}
t|
 W Y d }~nd }~ww t j
  |	|
fS )N/Default torch process group must be initializedr   Group name cannot be emptyr  	Expected  ports, but got  ports.r  z#init custom process group: tp_rank=z	, gpu_id=z, master_address=, master_port=z, group_rank=, world_size=, group_name=
, backend=Fr  rs  r6  )r  init_methodr  r  
group_name	device_idr  Tz Succeeded to init group through z group.zFailed to init group: r2  )r  r  is_initializedr  lenr   r   r   r   r   r6  empty_cachero   r   r  r  r  r  error)r   master_addressports
group_rankr  rK  r  
ports_list
group_portsuccessr@  ry  r   r   r   +init_weights_send_group_for_remote_instance  sZ   




z7ModelRunner.init_weights_send_group_for_remote_instancec              
   C   sj  t j s	J d|dksJ d|d}t|| jks+J d| j dt| d|| j }| d| d| j }| j| d urH| j| }nd	| d
}t	| d|fS t j
  d}d}z#| j D ]\}	}
t jj|
d|d qfd}d| d| d| d}W n ty } zd| d}t	| W Y d }~nd }~ww | j|= t jj| t j
  ||fS )NrA  r   rB  r  rC  rD  rE  r  Group zb not in _weights_send_group list. Please call `init_weights_send_group_for_remote_instance` first.Fr   srcr  Tz"Succeeded to send weights through rs   r2  zFailed to send weights: )r  r  rM  r  rN  r   r   r  r   rP  r6  rO  r   named_parameters	broadcastr  distributed_c10ddestroy_process_group)r   rQ  rR  rK  rT  rU  
send_groupr@  rV  r  weightsry  r   r   r   send_weights_to_remote_instance  sJ   




z+ModelRunner.send_weights_to_remote_instancec           
      C   s   t j s	J d|dksJ d|| j }td| d| d| d| d| d	| d
|  zt|d| d| |||d| j|< W dS  tyf } zd| d}	t	|	 d|	fW  Y d}~S d}~ww )a  Initialize the Torch process group for model parameter updates.

        `_model_update_group` is used in the RLHF workflow, where rank
        0 is the actor model in the training engine, and the other ranks are
        the inference engine, which is used for rollout.

        In the RLHF workflow, the training engine updates the model
        weights/parameters online, and broadcasts them to the inference
        engine through the `_model_update_group` process group.
        rA  r   rB  z*init custom process group: master_address=rF  z, rank_offset=z, rank=rG  rH  rI  r  rs  )r  rJ  r  r  rK  )Tz-Succeeded to initialize custom process group.z+Failed to initialize custom process group: r2  FN)
r  r  rM  r   r   r   ro   r  r  rP  )
r   rQ  master_portrank_offsetr  rK  r  r  ry  r@  r   r   r   init_weights_update_group  sD   

z%ModelRunner.init_weights_update_groupc              
   C   sr   z|| j v r| j |}tj| W dS W dS  ty8 } zd| d}t| d|fW  Y d }~S d }~ww )N)Tz*Succeeded to destroy custom process group.)Fz)The group to be destroyed does not exist.z(Failed to destroy custom process group: r2  F)r  popr  r  r_  r  r   rP  )r   rK  pgry  r@  r   r   r   destroy_weights_update_group(  s   

z(ModelRunner.destroy_weights_update_groupc              
   C   s$  || j v sJ d| dt| j   d|dkr!| ||||S zOg }g }t|||D ]2\}}	}
t|	tjr9|	ntt|	}tj	|
|| j
d}|tjj|d| j | dd |||f q,|D ]}|  qa| j| W d	S  ty } zd
| d}t| d|fW  Y d}~S d}~ww )aD  
        Update specific parameter in the model weights online
        through `_model_update_group` process group.

        Args:
            name: the name of the parameter to be updated.
            dtype: the data type of the parameter to be updated.
            shape: the shape of the parameter to be updated.
        rX  z not in z0. Please call `init_weights_update_group` first.flattened_bucketr  r   r   T)rZ  r  async_opT%Succeeded to update parameter online.#Failed to update parameter online: ^. The full weights of the ModelRunner are partially updated. Please discard the whole weights.FN)r  listkeys)_update_bucketed_weights_from_distributedzipr=  r  r  r   emptyr   r   r  r]  waitr   load_weightsr  r   rP  )r   namesdtypesshapesrK  r  ra  handlesr'  r  shapetarget_dtyper3  handlery  	error_msgr   r   r   update_weights_from_distributed5  sD   


z+ModelRunner.update_weights_from_distributedc              
   C   s   zKg }t |||D ] \}}}t|tjr|ntt|}	||tj||	| jdf q	t|d}
|
	 }tj
j|d| j| d |
 }| j| d
W S  tyl } zd| d}t| d	|fW  Y d }~S d }~ww )Nrj  )named_tensorsr   rY  Trm  rn  ro  Frl  )rs  r=  r  r  r   r   rt  r   r   get_flattened_tensorr  r]  r  reconstruct_tensorsr   rv  r  r   rP  )r   rw  rx  ry  rK  r  r'  r  r{  r|  bucketflattened_tensorreconstructed_tensorsry  r~  r   r   r   rr  p  s2   


z5ModelRunner._update_bucketed_weights_from_distributedr  LocalSerializedTensorc                    s   t   |dkrj|dS tj_j   fdd|D }|dkr/tj| dS |j	j
v rAt|}|j| dS |d u rMj| dS td| )Nri  )flattened_tensor_bucket_dictc                    s$   g | ]\}}|t |j d fqS ))r   r   )_unwrap_tensorr   )r,  r'  tensorinfered_devicer   r   r   r/    s    z:ModelRunner.update_weights_from_tensor.<locals>.<listcomp>directzUnknown load_format=TSuccess)r   %_update_weights_from_flattened_bucketr  r  r   device_modulecurrent_device_model_load_weights_directr   r   custom_weight_loaderri   rv  NotImplementedError)r   r  r  custom_loaderr   r  r   update_weights_from_tensor  s*   
z&ModelRunner.update_weights_from_tensorc           	   	   C   sl   |d }|d }g }|D ]}t |j|j|j|j|j|jd}|| qt||d}|	 }| j
| dS )z1Handle flattened bucket format for weight updatesr  metadata)r'  r{  r  	start_idxend_idxnumel)r  r  r  )r   r'  r{  r  r  r  r  r   r   r  r   rv  )	r   r  r  r  converted_metadatametaconverted_metar  r  r   r   r   r    s&   z1ModelRunner._update_weights_from_flattened_bucketd   r'  truncate_sizec              
   C   sT   z| j j||| jdW S  ty) } ztd| d|  W Y d}~dS d}~ww )zGet the weights of the parameter by its name. Similar to `get_parameter` in Hugging Face.

        Only used for unit test with an unoptimized performance.
        For optimized performance, please use torch.save and torch.load.
        )r   zError when getting parameter z: N)r   get_weights_by_namer   r  r   rP  )r   r'  r  ry  r   r   r   r    s   	zModelRunner.get_weights_by_namec                 C   sJ   t | j| jj| jj| j| j| j| jj| j	| j
| jj| jj| jjd| _d S )N)
base_modelbase_hf_configmax_loras_per_batchr  r  r   lora_backendr   r   max_lora_ranktarget_modules
lora_paths)rK   r   r   r   r   r  r  r  r  r   r   r  lora_target_modulesr  lora_managerr.  r   r   r   rR    s   zModelRunner.init_lora_managerlora_refc                 C   X   t d| dt| j| jdd | j|}t d| dt| j| jdd |S )z1Load a new lora adapter from disk or huggingface.zLoRA adapter loading starts: . avail mem=r  r  z LoRA adapter loading completes: )r   r   rl   r   r   r  load_lora_adapterr   r  resultr   r   r   r       zModelRunner.load_lora_adapterc                 C   s:   t d| d | j||||}t d| d |S )Nz*LoRA adapter loading from tensors starts: r2  z-LoRA adapter loading from tensors completes: )r   r   r  load_lora_adapter_from_tensors)r   r  tensorsconfig_dictadded_tokens_configr  r   r   r   r     s   z*ModelRunner.load_lora_adapter_from_tensorsc                 C   r  )zZUnload a lora adapter that was previously loaded during initialization or dynamic loading.zLoRA adapter unloading starts: r  r  r  z"LoRA adapter unloading completes: )r   r   rl   r   r   r  unload_lora_adapterr  r   r   r   r  
  r  zModelRunner.unload_lora_adapterc                 C      | j j}t|tr|S d S r   )r   r   r=  r   r   r7  r   r   r   qwen3_next_config     
zModelRunner.qwen3_next_configc                 C   r  r   )r   r   r=  r   r  r   r   r   hybrid_lightning_config"  r  z#ModelRunner.hybrid_lightning_configc                 C   s.   | j j }t|ttB tB tB tB r|S d S r   )	r   r   get_text_configr=  r   r   r   r   r   r  r   r   r   hybrid_gdn_config)  s   zModelRunner.hybrid_gdn_configc                 C   s   | j j}t|tr| jrt|dd }|d urd|vrd S t|ttB tB tB r)|S t|t	r1|j
S t|trItdd t|dg D }|sGd S |S d S )Nmtp_hybrid_override_patternMc                 s   s    | ]}|d kV  qdS )mambaNr   )r,  
layer_typer   r   r   r4  H  s
    
z,ModelRunner.mamba2_config.<locals>.<genexpr>layer_types)r   r   r=  r   r   r   r   r   r   r   
llm_configr   any)r   r7  pattern	has_mambar   r   r   mamba2_config7  s&   


zModelRunner.mamba2_configc                 C   s   | j r
t| j| jS | jS )z?Return the max token pool size considering hybrid swa settings.)r   r[  swa_max_total_num_tokensr]  r.  r   r   r   max_token_pool_sizeS  s   zModelRunner.max_token_pool_sizec                 C   r  r   )r   r   r=  r   r  r   r   r   kimi_linear_config[  r  zModelRunner.kimi_linear_configc                 C   s   | j p| jp| jp| jS r   )r  r  r  r  r.  r   r   r   mambaish_configb  s   zModelRunner.mambaish_configc                 C   s^   | j rdS | jjrttd dS | jdkrttd dS t  s&t  r-ttd dS dS )NFzYDisable piecewise CUDA graph because piecewise_cuda_graph has conflict with torch compiler   zMDisable piecewise CUDA graph because piecewise_cuda_graph does not support PPz?Disable piecewise CUDA graph due to existing compilation errorsT)	r   r   enable_torch_compilers   r   r   rF   	is_deepepis_mooncaker.  r   r   r   can_run_piecewise_cuda_graphk  s*   
z(ModelRunner.can_run_piecewise_cuda_graphc                 C   s:  | j jdkr<t| jdd }t|dd }t|tr7| dkr7tr+t| _t	| j | j _ngt
j| _t	| j | j _n[| j| _nV| j jdkrMtrHt| _nJt
j| _nE| j jdkr^trYt| _n9t
j| _n4| j jdv rit
j| _n)| j jdkrtt
d	r~t
j| _td
 ntd | j| _n
td| j j dttd| j  d S )Nautoquant_configkv_cache_quant_algoFP8r   r   )r   bfloat16fp4_e2m1float4_e2m1fn_x2z2FP4 (E2M1) KV Cache might lead to a accuracy drop!zh--kv-cache-dtype falls back to 'auto' because this torch version does not support torch.float4_e2m1fn_x2zUnsupported kv_cache_dtype: r2  zUsing KV cache dtype: )r   r  r   r   r=  r  upper_is_hiprH   TORCH_DTYPE_TO_KV_CACHE_STRr  float8_e4m3fnr  float8_e5m2r  r*  r  r   rv  rU  rs   )r   r  r  r   r   r   rY    sL   







z$ModelRunner.configure_kv_cache_dtypec                 C   s6   t j}d}t jd||d}t jd||d}|| }|S )zYWe need to run a small matmul to init cublas. Otherwise, it will raise some errors later.r6  )   r  rj  )r  r  ones)r   r  r   abcr   r   r   r`    s   zModelRunner.init_cublasc                 C   s|   | j jr'| jdd| _g | _t| j jD ]
}| j|   q| jd | _dS | j j	r7| j
s7t| j| _dS |  | _dS )Init attention kernel backend.Tr   r   N)r   r  _get_attention_backendattn_backenddecode_attn_backend_grouprN  sm_group_numr   decode_attn_backendenable_two_batch_overlapr   r<   r?  )r   r  r   r   r   ra    s   z"ModelRunner.init_attention_backendr   c                 C   s   | j j}| jr|rtd| d | j||dS | j  \| _| _| j| jkrUddl	m
} || | j| j|d| j| j|dd}td| j d| j d td	 n	| j| j j|d}| j| j	t _t _|S )
r  z&Overriding draft attention backend to r2  r  r   )HybridAttnBackend)decode_backendprefill_backendzFUsing hybrid attention backend for decode and prefill: decode_backend=z, prefill_backend=zWarning: Attention backend specified by --attention-backend or default backend might be overridden.The feature of hybrid attention backend is experimental and unstable. Please raise an issue if you encounter any problem.)r   #speculative_draft_attention_backendr   r   rv  _get_attention_backend_from_strget_attention_backendsprefill_attention_backend_strdecode_attention_backend_str/sglang.srt.layers.attention.hybrid_attn_backendr  r   r|  rd   prefill_attention_backenddecode_attention_backend)r   r   draft_attn_backendr  r  r   r   r   r    sV   



z"ModelRunner._get_attention_backendbackend_strc                 C   s2   |t vrtd| || _t | | }t| |S )NzInvalid attention backend: )r9   rU  r   r:   )r   r  r   full_attention_backendr   r   r   r     s
   
z+ModelRunner._get_attention_backend_from_strc                 C   s   d| d }g | _ t| jjd}t|}W d    n1 s w   Y  t| j| jD ]%}dt	| d | }| j 
t|| d d d | jjf    q,d S )Nr2  _projrzmodel.layers.z
.self_attn)sorted_channelsopenr   ds_channel_config_pathjsonloadrN  r%  r&  r  r   r  r  ds_heavy_channel_num
contiguousr6  )r   selected_channelfchannel_configikeyr   r   r   rV  	  s   z/ModelRunner.init_double_sparsity_channel_configc                 C   s&   | j dkrdS |  r|   dS dS )zv
        Warmup and tune kernels before cuda graph capture.
        Currently only doing FlashInfer autotune.
        r6  N)r   _should_run_flashinfer_autotune_flashinfer_autotuner.  r   r   r   rb    s
   
zModelRunner.kernel_warmupc                 C   sd   | j jrdS | j j}|dvrdS tj \}}|dk rdS | j s,| j s,| j	 r0| j
 S dS )z+Check if flashinfer autotune should be run.F)flashinfer_trtllmflashinfer_mxfp4	   T)r   disable_flashinfer_autotunemoe_runner_backendr  r6  r  r   is_eagleis_standaloneis_ngramr   )r   r  majorr  r   r   r   r  %  s    z+ModelRunner._should_run_flashinfer_autotunec              
   C   s   ddl m} td | jtj  t	| j
| j9 t % |  | j| jj| d W d   n1 s<w   Y  W d   n1 sKw   Y  W d   n1 sZw   Y  tj | j td dS )zRun flashinfer autotune.r   )autotunezRunning FlashInfer autotune...)
batch_sizerun_ctxNzFlashInfer autotune completed.)flashinfer.autotunerr  r   r   r  wait_streamr  r6  current_streamr  r   streaminference_mode
_dummy_runr   r^  )r   r  r   r   r   r  C  s   

 z ModelRunner._flashinfer_autotuner  c                    sv  j rtj}ntj}tj}dj sj sj	 r,j
r%tdtj}jjjjr3tj}| j }jjrCt  jrKj  tj}tjr^|s^tjs^J tjj|jjjj jj!jj"jj#jj$||dt%j&dd  j'd< j sdg| }|g| }}	t%j(|f|t%j)jd}
t%j*|ft%j)jd}t%j+dt%j)jd}nd}d}d}	d}
d}d}jj#dkrt,fd	d
 j-. D |r j/0t%j1gjj" t%j)jd  j20t%j1gjj" t%j)jd jj" n)tjr/ j/0t%j1gt%j)jd  j20t%j1gt%j)jd nd fdd}| }|tjkrK|rH|j3ntj}jj4rVdg| }nd}t5d0i d|d|d j6d j7d j8d j9d j:d j8dj;dj<djd j=d j8> ? d j@ddd jAd|	d|
d|d |d!|d"|d# j/d$ j2d%tBC d&d' jDd(jd)|d*|d+ j'd,|d-||durjEF jG  fd.d/}t%HjI  jJK  t%L + |ptM  |  W d   n1 sw   Y  W d   dS W d   dS 1 s4w   Y  dS )1z.Run a dummy forward pass for warmup/profiling.r   zThis should not happenr   F)r   max_bsmax_num_tokenhidden_sizer  r  r   r   is_encoder_decoderrw   seq_len_fill_valueencoder_len_fill_valuenum_tokens_per_bscache_loc_dtypeenable_mamba_track.rj  Nc                    s   i | ]\}}||d   qS r   r   r,  kv)rm  r   r   
<dictcomp>  s    z*ModelRunner._dummy_run.<locals>.<dictcomp>c                     s   d } j  sj  r4ddlm} jrtd|d  jd d d d d jj	jj
jjtjd d d} | S j  rOddlm} |d  jd d d d d} tj| _| S )Nr   )EagleVerifyInputzThis should not happen.)draft_tokencustom_mask	positionsretrive_indexretrive_next_tokenretrive_next_siblingretrive_cum_len
spec_stepstopkdraft_token_numcapture_hidden_modeseq_lens_sumseq_lens_cpu)NgramVerifyInput)r&  	tree_maskr(  r)  r*  r+  r/  )r   r  r  !sglang.srt.speculative.eagle_infor%  r   r  r'  r   speculative_num_stepsspeculative_eagle_topkspeculative_num_draft_tokensrS   FULLr  !sglang.srt.speculative.ngram_infor3  NULLr0  )	spec_infor%  r3  )buffersr  r   r   r   get_spec_info  sD   
	z-ModelRunner._dummy_run.<locals>.get_spec_infoforward_moder  	input_idsreq_pool_indicesseq_lensr2  next_token_logits_bufferorig_seq_lensr   token_to_kv_poolr  out_cache_locr1  encoder_lensreturn_logprobr(  extend_num_tokensextend_seq_lensextend_prefix_lensextend_start_locextend_prefix_lens_cpuextend_seq_lens_cpuglobal_num_tokens_gpu!global_num_tokens_for_logprob_gpudp_padding_modeglobal_dp_buffer_lenmrope_positionsr   r<  r0  num_token_non_paddedglobal_forward_modelora_idsc                     s   d  _ _tj  td i } jjdkr3dt	j
jjv r3tdd j D | d< js:d| d< j
j jjfi | }|S )NFr   r   c                 S   s   i | ]	\}}||  qS r   )cloner!  r   r   r   r$  5  s    z<ModelRunner._dummy_run.<locals>.run_once.<locals>.<dictcomp>Tget_embedding)dp_local_start_posdp_local_num_tokensr@   rQ  
is_max_lenrA   r   r   r  r  r   r  r  rV   r  itemsr   r@  r(  )kwargs!logits_output_or_pp_proxy_tensors)r=  forward_batchrR  rm  r   r   r   r   run_once%  s2   z(ModelRunner._dummy_run.<locals>.run_oncer   )Nr   rU   DECODEEXTENDrS   r;  r   r  r  r  r   r  TARGET_VERIFYr   r8  enable_return_hidden_statesr9  r  !get_cuda_graph_seq_len_fill_valuer  rR   r   r   rg  rw   rv   ru   rX   r9  r   r   r  r  r  r   r   r  r  int64rT  fullint32zerosarangerV   r   r\  rO  copy_r  rP  r0  rQ  rT   r@  rA  rB  r2  rC  r   rE  rF  sumitemrG  r(  r=   get_default_mode_in_cuda_graphrS  r  prepare_lora_batchinit_forward_metadatar  synchronizer  r  r  rj   )r   r  r  capture_forward_moder0  r  require_mlp_tp_gather_rM  rN  rI  rJ  rK  rL  r>  r<  rV  r`  r   )r=  r_  rR  rm  r  r   r   r   r  T  sp  






(
	
 !
$
TzModelRunner._dummy_runc                 C   s  d| _ d| _| jsdS | jj tjkrdS | jdkr!| jj	r!dS | jdkr,| jj
s,dS t }t| j| j}tdd ddd}td	|| j  d
|dd tdd ttd}|| j | | _ t| j| j}|| | _td	|| j  dt | dd| jdd|dd	 dS )zCapture device graphs.Nr   r   c                   S   s   dS )Nz
cuda graphr   r   r   r   r   r(  \      z0ModelRunner.init_device_graphs.<locals>.<lambda>z	cpu graphz	npu graph)r   r8  zCapture z7 begin. This can take up to several minutes. avail mem=r  r  c                   S   s   t S r   )rQ   r   r   r   r   r(  f  rt  z end. Time elapsed:  s. mem usage= GB. avail mem=r  )rd  re  r   r   r  r  r   r  r   r}  r  r  r  rl   r   r   r   r   rP   r7   )r   r  
before_memgraph_backendgraph_runners	after_memr   r   r   rc  G  sJ   
zModelRunner.init_device_graphsc           	   	   C   s<  d| _ | jjr|  sdS t| j| j_t| jd| j}g | _g | _g | _	|jj
D ]}t|drNt|jdr?| j|jj nKt|jdrM| j|jj n<t|dr[| j|j n/t|drwt|jdro| j|jj n| j|j nt|drt|jdr| j|jj d}d}t|drt|jd	r|jj}|j}t|d
rt|jd	r|jj}|j}t|drt|jd	r|jj}|j}| j| | j	| q)t| j| jjk rttd dS t }t| j| j}t d|dd t!| | _ t| j| j}|| }t dt | dd|dd|dd dS )z'Initialize piecewise CUDA graph runner.Nlanguage_model	self_attnattnattn_mqalinear_attn	attentionmlpexpertsblock_sparse_moemoezJDisable piecewise CUDA graph because some layers do not apply Standard GQAz.Capture piecewise CUDA graph begin. avail mem=r  r  z0Capture piecewise CUDA graph end. Time elapsed: ru  rv  r  )"piecewise_cuda_graph_runnerr   enable_piecewise_cuda_graphr  r   r   r   attention_layers
moe_layersmoe_fusionslayersr*  r|  r   r}  r~  r  r  r  r  r  r  rN  r   rI  rs   r   r  r  rl   r   r   r   rZ   )	r   r{  layer	moe_block
moe_fusionr  rw  rz  	mem_usager   r   r   rh  u  s~   




z&ModelRunner.init_piecewise_cuda_graphsc              	   C   s   t jdd}t }t|}|dkrA| j|ks#J d| j d| d| j|k r9td| d| j d| j d	 || j | _	d S |
d
}| jt|ksVJ d| j d|| j | _	| j|krptd| j d| d d S d S )NSGLANG_CPU_OMP_THREADS_BINDallz>SGLANG_CPU_OMP_THREADS_BIND is not set, in this case, tp_size zG should be smaller than or equal to number of numa node on the machine aR  . If you need tp_size to be larger than number of numa node, please set the CPU cores for each tp rank via SGLANG_CPU_OMP_THREADS_BIND explicitly. For example, on a machine with 2 numa nodes, where core 0-31 are on numa node 0 and core 32-63 are on numa node 1, it is suggested to use -tp 2 and bind tp rank 0 to core 0-31 and tp rank 1 to core 32-63. This is the default behavior if SGLANG_CPU_OMP_THREADS_BIND is not set and it is the same as setting SGLANG_CPU_OMP_THREADS_BIND=0-31|32-63. If you do need tp_size to be larger than the number of numa nodes, you could set SGLANG_CPU_OMP_THREADS_BIND explicitly for example SGLANG_CPU_OMP_THREADS_BIND=0-15|16-31|32-47|48-63 and run with -tp 4. If you don't want each tp rank to use all the cores on one numa node, you could set for example SGLANG_CPU_OMP_THREADS_BIND=0-15|32-47 and run with -tp 2.z!Detected the current machine has z- numa nodes available, but tp_size is set to z
, so only z numa nodes are used.|zLSGLANG_CPU_OMP_THREADS_BIND setting must be aligned with TP size parameter (z%). Please double check your settings.z	TP size (z")is larger than numa node number (z), in this case the available memory amount of each rank cannot be determined in prior. Please set proper `--max-total-tokens` to avoid the out-of-memory error.)r  r  r   rm   rN  r   r   rv  r   r  r  )r   
omp_cpuidscpu_ids_by_noden_numa_nodethreads_bind_listr   r   r   r     s4   



z ModelRunner.init_threads_bindingc                 C   sD   t d| j d ddlm} tj| j| jf}|| j	| d S )Nz%Enabling torch tensor parallelism on z	 devices.r   )tensor_parallel)
r   r   r    sglang.srt.layers.model_parallelr  r  r  init_device_meshr   r   )r   r  device_meshr   r   r   rP    s   zModelRunner.apply_torch_tp
stream_idxc                 C   s   | j | | _d S r   )r  r  )r   r  r   r   r   update_decode_attn_backend     z&ModelRunner.update_decode_attn_backendr_  skip_attn_backend_initc                 C   s\   |s| j jr| j| | j|_n| j| i }| jr ||d< | jj|j|j	|fi |S )Nr   )
r   r  r  rp  r  r  r   r  r@  r(  )r   r_  r  r   r]  r   r   r   forward_decode  s   
zModelRunner.forward_decodec                 C   s   i }| j r	||d< |jd ur|j |d< | jsd|d< | jd uo&| j|}|r5| jj|fi ||fS |s=| j| | j	j
|j|j|fi ||fS )Nr   input_embedsTrX  )r  r  r  r   r  can_runreplayr  rp  r   r  r@  r(  )r   r_  r  r   r]  r   r   r   r   forward_extend	  s2   


zModelRunner.forward_extendc                 C   sD   |j dkr| j| i }| jr||d< | jj|j|j|fi |S )Nr   r   )r  r  rp  r  r   r  r@  r(  )r   r_  r   r]  r   r   r   forward_idle+	  s   
zModelRunner.forward_idler   reinit_attn_backendforward_countc                 C   sT   |j dks|r| j| t|j | | jj}| j|j|j	||j |f}||_ |S )Nr   )
split_indexr  rp  r[  r   rI  r   forward_split_prefillr@  r(  )r   r_  r  r  next_split_indexretr   r   r   r  >	  s   z!ModelRunner.forward_split_prefillr   split_forward_countc           
   	   C   s  |  j d7  _ t | j |J}| |||||}t }|d urR| sR|  |  t	
d | j }		 zt|	 W n	 tyG   Y nw q8| |||||}W d    n1 s\w   Y  |d|_t j||jt| jdd d | jd ur| j  |S )Nr   zEPLB due to rank faultsTmetricsbs)r_  r   cuda_graph_batch)r   r0   with_forward_pass_forward_rawr+   r*  is_active_equal_lastsnapshot_active_to_lastsync_active_to_cpur   r   rA  	rebalancenextStopIterationr   r   rD   on_forward_endr   r   rd  on_forward_pass_end)
r   r_  r  r   r  r  recorder_outputsoutputelastic_ep_stategenr   r   r   r  S	  s\   

 

zModelRunner.forwardc           	      C   sX  | j dkr	|jjn|jj}t| o| jo| j|}|r,| jj|||d}t||dS |j	d ur7|
|  n||  |jd urU|jd urUt| jrUt sU|j| jd |j rc| j|||d}n5|j rq| j|||d}n'|jjddr| j|||d\}}n|j r| j||d}ntd	|j |j	d ur| jjr|| t||dS )
Nr   )r  r   )r   r   )r   )r  r  T)include_draft_extend_v2)r   zInvalid forward mode: )r   r?  is_cpu_graphis_cuda_graphr   rd  r  r  r   global_num_tokens_cpuprepare_mlp_sync_batchprepare_attn_tp_scatter_inputrT  rO  rv   r   r;   'adjust_num_token_non_padded_for_attn_tp	is_decoder  is_split_prefillr  	is_extendr  is_idler  rU  r  is_last_rankpost_forward_mlp_sync_batch)	r   r_  r  r   r  r  
mode_checkr   r  r   r   r   r  	  sn   











zModelRunner._forward_rawr   sampling_infoc                 C   s   |   ||j d S r   )update_regex_vocab_maskapply_logits_biasnext_token_logits)r   r   r  r   r   r   _preprocess_logits	  s   zModelRunner._preprocess_logitsc              	      sr   t |trtj fdd|D ddS | j | j j j j	 j
 r1 j}|S  jd }|S )a  Sample and compute logprobs and update logits_output.

        Args:
            logits_output: The logits output from the model forward
            forward_batch: The forward batch that generates logits_output

        Returns:
            A list of next_token_ids
        c                    s   g | ]} | qS r   )sample)r,  valuesr_  r   r   r   r/  	  s    z&ModelRunner.sample.<locals>.<listcomp>r{  )axisr   )r=  tupler  stackr  r  rE  rH  top_logprobs_numstoken_ids_logprobsr?  r  r(  rB  )r   r   r_  next_token_idsr   r  r   r  	  s&   
zModelRunner.samplec                 C   s8   |j sdS | ||j | j||j|j|j|j  dS )a  
        Compute token_ids_logprobs without performing sampling.

        Optimized path for prefill-only requests that need token_ids_logprobs but don't
        require next token generation. Skips expensive sampling operations
        while still providing requested probability information.

        Args:
            logits_output: The logits output from the model forward
            forward_batch: The forward batch that generates logits_output
        N)r  r  r  rE  compute_logprobs_onlyrH  r  )r   r   r_  r   r   r   r  
  s   z!ModelRunner.compute_logprobs_onlyc                 C   s8   t | jjddpt | jjdi }|du rdS d|v }|S )zDetect if the model has "mrope" rope_scaling type.
        mrope requires keep "rope_deltas" between prompt and decoding phases.rope_parametersNrope_scalingFmrope_section)r   r   r  )r   r  is_mrope_enabledr   r   r   model_is_mrope&
  s   
zModelRunner.model_is_mropeurlc                 C   s4   ddl m} td|  || j| jj| d S )Nr   )RemoteModelLoaderzSaving model to )sglang.srt.model_loader.loaderr  r   r   
save_modelr   r   r   )r   r  r  r   r   r   save_remote_model2
  s   zModelRunner.save_remote_modelpathr  max_sizec                 C   s>   ddl m} td| d| d|  || j||| d S )Nr   )ShardedStateLoaderzSave sharded model to z with pattern z and max_size )r  r  r   r   r  r   )r   r  r  r  r  r   r   r   save_sharded_model8
  s
   zModelRunner.save_sharded_modelactionc                 C   s   | j j|d d S )N)r  )r  r}  )r   r  r   r   r   check_weightsB
  s   zModelRunner.check_weightsc              
   C   s   zddl m} || }||j W dS  ty, } zdd| fW  Y d}~S d}~w tyK } ztd|  dt|fW  Y d}~S d}~ww )z:Update weights from IPC for checkpoint-engine integration.r   ))SGLangCheckpointEngineWorkerExtensionImpl)Tz(IPC weight update completed successfullyFz&IPC weight update failed: ImportError NzIPC weight update failed: )	5sglang.srt.checkpoint_engine.checkpoint_engine_workerr  update_weights_from_ipczmq_handlesru  r  r   rP  r  )r   recv_reqr  workerry  r   r   r   r  E
  s   z#ModelRunner.update_weights_from_ipcc              	   C   s   | j s| jjrtj dkrd S t| j	| j
F tdtj  d tt  tjtj d d d ftj| jd W d    n1 sKw   Y  W d    d S W d    d S 1 scw   Y  d S )Nr   z*Pre-allocating symmetric memory pool with z GiBi   rj  )r   r   enable_symm_memr,    SGLANG_SYMM_MEM_PREALLOC_GB_SIZEr   r  r  r   r  r  r   r   r)   r"   rt  uint8r.  r   r   r   ri  V
  s(   "z*ModelRunner.prealloc_symmetric_memory_pool)NNNFNNN)NF)r  r   )r  )F)FN)Fr   )FNFr   )NN)\r   r   r   r   r   floatintrc   r   r   rO   rN   r   r  r  r_  r=  r   r  r   r   rF  r2   r   r/  r  r   r  r,  rW  rb  re  rh  r  rr  r   r	   r  Tensorr  r  r  rR  rL   r  r  r  propertyr  r  r  r  r  r  r  r  rY  r`  ra  r  r  rV  rb  r  r  r  rc  rh  r   rP  r  rT   rB   rV   r  rG   r  r  r  r   r  r  rb   r  r  r  r  r  r  r  r  ri  r   r   r   r   r     s   	

  9"~( >


J
26
-
;"
!









0	6
	 t.K"


)



=
K


&
 

r   r  c                 C   s,   t |  }|D ]\}}t|| | qd S r   )dictr\  ra   )r   r  params_dictr'  r  r   r   r   r  l
  s   r  c                 C   s   t | tr
| |} | |S r   )r=  r  r   to)r  r   r   r   r   r   r  r
  s   


r  c                   @   s,   e Zd ZU dZee ed< defddZdS )r  ztorch.Tensor that gets serialized by MultiprocessingSerializer (which only serializes a pointer and not the data).
    The i-th element in the list corresponds to i-th rank's GPU.r  r  c                 C   s   t | j| S r   )rg   deserializer  )r   r  r   r   r   r   
  r  zLocalSerializedTensor.getN)	r   r   r   r   r   bytesr   r  r   r   r   r   r   r  x
  s   
 r  )r   r  r>  r  r  r   r  r  r  r  collectionsr   dataclassesr   typingr   r   r   r   r	   r  torch.distributedr  r  r
   sglang.srt.configsr   r   r   r   r   r   r   r   r   r   r   r   r    sglang.srt.configs.device_configr   sglang.srt.configs.load_configr   r   sglang.srt.configs.model_configr   r   r    sglang.srt.configs.update_configr   sglang.srt.constantsr   /sglang.srt.debug_utils.tensor_dump_forward_hookr    sglang.srt.distributedr!   r"   r#   r$   r%   r&   r'   r(   <sglang.srt.distributed.device_communicators.pynccl_allocatorr)   %sglang.srt.distributed.parallel_stater*    sglang.srt.elastic_ep.elastic_epr+   sglang.srt.environr,   sglang.srt.eplb.eplb_managerr-   #sglang.srt.eplb.expert_distributionr.   r/   r0   r1   sglang.srt.eplb.expert_locationr2   r3   r4   r5   'sglang.srt.eplb.expert_location_updaterr6   =sglang.srt.hardware_backend.npu.graph_runner.npu_graph_runnerr7   sglang.srt.layersr8   .sglang.srt.layers.attention.attention_registryr9   r:   %sglang.srt.layers.attention.nsa.utilsr;   'sglang.srt.layers.attention.tbo_backendr<   sglang.srt.layers.dp_attentionr=   r>   r?   r@   rA   "sglang.srt.layers.logits_processorrB   -sglang.srt.layers.moe.routed_experts_capturerrC   rD   rE   sglang.srt.layers.moe.utilsrF   sglang.srt.layers.poolerrG   )sglang.srt.layers.quantization.fp8_kernelrH   sglang.srt.layers.samplerrI   sglang.srt.layers.torchao_utilsrJ   sglang.srt.lora.lora_managerrK   sglang.srt.lora.lora_registryrL   "sglang.srt.managers.schedule_batchrM   sglang.srt.mem_cache.allocatorrN    sglang.srt.mem_cache.memory_poolrO   *sglang.srt.model_executor.cpu_graph_runnerrP   +sglang.srt.model_executor.cuda_graph_runnerrQ   rR   ,sglang.srt.model_executor.forward_batch_inforS   rT   rU   rV   &sglang.srt.model_executor.hook_managerrW   'sglang.srt.model_executor.input_buffersrX   5sglang.srt.model_executor.model_runner_kv_cache_mixinrY   5sglang.srt.model_executor.piecewise_cuda_graph_runnerrZ   r  r[   r\   ;sglang.srt.model_loader.remote_instance_weight_loader_utilsr]   r^   r_   sglang.srt.model_loader.utilsr`   $sglang.srt.model_loader.weight_utilsra   'sglang.srt.sampling.sampling_batch_inforb   sglang.srt.server_argsrc   rd   re    sglang.srt.speculative.spec_inforf   sglang.srt.utilsrg   rh   ri   rj   rk   rl   rm   rn   ro   rp   rq   rr   rs   rt   ru   rv   rw   rx   ry   rz   #sglang.srt.utils.nvtx_pytorch_hooksr{   sglang.srt.utils.offloaderr|   r}   r~   sglang.srt.utils.patch_torchr   r   +sglang.srt.utils.torch_memory_saver_adapterr   sglang.srt.utils.weight_checkerr   $sglang.srt.weight_sync.tensor_bucketr   r   r  r   r  r  %sglang.srt.hardware_backend.npu.utilsr   r   r   r  float8_e4m3fnuzr  r  r  r   r   r   	getLoggerr   r   Moduler   Filterr   r   r   r  r  r  r  r  r   r   r   r   <module>   s   <(
X
	
                  h