o
    پi!                    @  s  U d Z ddlmZ ddlZddlZddlZddlZddlZddlZddl	Z	ddl
Z
ddlZddlZddlZddlZddlZddlZddlZddlZddlZddlZddlZddlZddlZddlZddlZddlZddlZddlZddlZddlZddlZddl Z ddl!Z!ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( ddlm)Z)m*Z* ddl+m,Z,m-Z- dd	l.m/Z/ dd
l
m0Z0 ddlm1Z1 ddl2m3Z3 ddl4m5Z5 ddl6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZB ddlCmDZD ddlEmFZFmGZG ddlHZIddlJZJddlKZKddlLZLddlMZMddlNZNddlOZNddlOmPZQ ddlRZRddlSZSddlTm-ZU ddlVmWZW ddlXmYZY ddlNmZZZ ddl[m\Z\ ddl]m^Z^m_Z_m`Z` ddlambZb ddlcmdZd ddlemfZf ddlgmhZh e7rjddlimjZj ddlkmlZl emenZoeUpeNjqjrZse)dddFd!d"Ztet rd#ZueuZvneNweNjxjyZvev Zzeve_veze_zd$Z{e)ddd%d& Z|e)ddd'd( Z}e)dddFd)d*Z~e)dddFd+d,Ze)dddFd-d.Ze)dddFd/d0ZdFd1d2Ze)dddFd3d4Ze)dddFd5d6ZdFd7d8Zd9d: Ze&dGd=d>ZdHdCdDZe)dde*edEgdFdGZe)dde*edHgdFdGZe)dde*eg dIdJdG ZZe)dde*edKgdJdGZe)dde*edLgdJdGZe)dde*edHgdFdGZzddlZeeNjjdMZW n   dNZY zeNjj ZW n   dNZY dOdP ZdQdR ZdSdT ZdUdV Ze)dddWdX ZdYdZ ZdId\d]Ze ZdJdKdadbZdLdMdddeZdNdOdhdiZdPdkdlZedmd^aG dndo doebZdNai adpdq ZG drds dsZdQdudvZdwdx ZdRdydzZ	dSd|d}ZdFd~dZG dd de>Z				N	dTdUddZ	dVdWddZe)dddd ZdXddZdYddZ	{dZd[ddZdd Zdd Zdd Z	{d\d]ddZe(G dd dZd^ddZd_ddZd`daddZdbddZdcddĄZddƄ ZddȄ Zdddd̈́ZÐdeddτZĐdfdgddӄZddՄ ZƐdhdd؄ZǐdiddۄZȐdVdjdd݄ZɐdkddZʐdlddZ			{dmdnddZ				NdodpddZdaddddZ		{d\dqddZ	dcdrddZѐdsd	d
ZҐdd ZӐdd ZԐdd ZՐdd Z֐deאd< dd Zؐdd ZG dd dZڐdd ZېdtddZܐd d! Zݐd"d# Zސd$d% Zߐd&d' Zd(d) Zd*d+ Zd,d- Zd.d/ Zd0d1 Zdcdud2d3Z									dvd4d5Zd6d7 Ze)ddwd8d9Ze)ddwd:d;ZdLdxd=d>Ze)dddFd?d@Ze)dEddcdydAdBZe)dddzdCdDZdLd{dEdFZdLd|dGdHZdcdIdIdJZe\dKdLZ		d}d~dVdWZdd[d\Zdd^d_ZG d`da daZG dbdc dcejZddde ZddgdhZdidj Zdkdl ZG dmdn dnejZdcdodpZ	dcddsdtZ dzdudvZddxdyZddzd{Zdd}d~Zdd Zdd Zdd ZdddZdddZ	dddZ
eeRde	 G dd dZdd ZdddZdddZdddZdddd fdddZdddZdddZdd Zdd Zdd Zdd ZdcdddZdddZdcdddZdd Zdd ZG dd dZddÄ ZddĐdńZddƐdǄZ eAdȃZ!G dɐdʄ de;e! Z"dd͐d΄Z#ddϐdЄZ$ddѐd҄Z%ddӐdԄZ&dcddאd؄Z'ddڐdۄZ(dܐd݄ Z)dސd߄ Z*dcdddZ+G dd dZ,G dd dZ-dddZ.dd Z/dd Z0dddZ1dd Z2dddZ3dddZ4dd Z5dd Z6dd Z7dd Z8dddZ9d d Z:dd Z;dd Z<e)dddd Z=g dZ>d	Z?G d
d dZ@e)dddFddZAdd ZBdddZCdd ZDdd ZEdddZFdd ZGe&dddZHdd!d"ZI		Nddd'd(ZJG d)d* d*ZKdcd+d,ZL	dcd-d.ZMd/d0 ZNe&d1d2 ZOdaPd3d4 ZQdd6d7ZRd8d9 ZSdzd:d;ZTdFd<d=ZUdzd>d?ZVe)dddzd@dAZWdFdBdCZXdDdE ZYdS (  zCommon utilities.    )annotationsN)OrderedDictdefaultdict)contextmanager)	dataclass)	lru_cachepartial)PackageNotFoundErrorversion	find_spec)BytesIO)JSONDecodeError)ForkingPickler)Path)TYPE_CHECKINGAnyCallableDictGenericListOptionalProtocolSequenceTupleTypeVarUnion)SkipTest)unquoteurlparse)r
   Image)Mount)nn)Library)ProfilerActivityprofilerecord_function)_DecoratorContextManager)Literalenvs)enable_func_timer)VideoReader)
ServerArgs   )maxsizereturnboolc                   C  s   t jjd uS N)torchr
   hip r6   r6   K/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/utils/common.pyis_hipm      r8   g      l@z_{desc}: {percentage:3.0f}% Completed | {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]
c                   C  s   t j ot jjS r3   r4   cudais_availabler
   r6   r6   r6   r7   is_cuda   s   r=   c                   C  s   t  pt S r3   )r=   r8   r6   r6   r6   r7   is_cuda_alike   r9   r>   c                   C     t tdo	tj S )Nhpu)hasattrr4   r@   r<   r6   r6   r6   r7   is_hpu      rB   c                   C  r?   )Nxpu)rA   r4   rD   r<   r6   r6   r6   r7   is_xpu   rC   rE   c                   C  s$   t tdsdS tj stddS )NnpuFz?torch_npu detected, but NPU device is not available or visible.T)rA   r4   rF   r<   RuntimeErrorr6   r6   r6   r7   is_npu   s   

rH   c                  C  (   t   } | dv ottdotj S )N)x86_64amd64i386i686cpuplatformmachinelowerrA   r4   rN   r<   rQ   r6   r6   r7   is_host_cpu_x86   s   rT   c                  C  rI   )N)aarch64arm64rN   rO   rS   r6   r6   r7   is_host_cpu_arm64   s   rW   c                  C  s    t  pt } tdddko| S )NSGLANG_USE_CPU_ENGINE01)rT   rW   osgetenv)is_host_cpu_supportedr6   r6   r7   is_cpu   s   r^   c                  C  s:   zdd l } W n
 ty   Y dS w ttjdotjjd uS )Nr   Fmusa)torchadaImportErrorrA   r4   r
   r_   )r`   r6   r6   r7   is_musa   s   rb   c                 C  s   t tdd}t o| |kS )z9Check if dtype is float4_e2m1fn_x2 and CUDA is available.float4_e2m1fn_x2N)getattrr4   r=   )dtypetarget_dtyper6   r6   r7   is_float4_e2m1fn_x2   s   rg   c                   C  s$   t jjrtttt jjdS dS )N.r   r   )r4   r
   r;   tuplemapintsplitr6   r6   r6   r7   get_cuda_version   s   rn   devicetorch.devicec                 c  s    | j dkr$t r$td d V  W d    d S 1 sw   Y  d S t| }|d urI|| j d V  W d    d S 1 sBw   Y  d S td|  )NrN   zUnknown device module: )typer^   r4   ro   get_device_moduleindex
ValueError)ro   moduler6   r6   r7   device_context   s   "
"rv   device_capability_majors	List[int]cuda_versionTuple[int, int]c                 C  s@   t  sdS tj d | v ottttjjdd d |kS )NFr   rh      )	r=   r4   r;   get_device_capabilityrj   rk   rl   r
   rm   rw   ry   r6   r6   r7   _check_cuda_device_version   s
   "r~      )      r}   	   )
      r   )r   r   r   r   convert_weight_packedFc                   C  s   t otS r3   )is_amx_tile_supportedis_intel_amx_backend_availabler6   r6   r6   r7   cpu_has_amx_support     r   c                 C  s   t | ddS )Nuse_intel_amx_backendF)rd   )layerr6   r6   r7   r   #     r   c                   C  s   t  r	tj jS dS NF)rE   r4   rD   get_device_propertieshas_fp64r6   r6   r6   r7   xpu_has_xmx_support'  s   r   c                   C  s   t dot S )NSGLANG_USE_SGL_XPU)get_bool_env_varrE   r6   r6   r6   r7   use_intel_xpu_backend/     r   c                   C  s&   t dddsdS tjdduot S )zm
    Check whether flashinfer is available.
    As of Oct. 6, 2024, it is only available on NVIDIA GPUs.
    SGLANG_IS_FLASHINFER_AVAILABLEtrue)defaultF
flashinferN)r   	importlibutilr   r=   r6   r6   r6   r7   is_flashinfer_available3  s   r   c                  C  s   dD ]
} t | dr dS qdS )z7
    temporary fix for issue #11272 (cublas 12.9+)
    )znvidia-cublasznvidia-cublas-cu12z12.9TF)check_pkg_version_at_least)pkgr6   r6   r7    is_nvidia_cublas_version_ge_12_9>  s
   
r   strc                   C  s   t t jS r3   )r   uuiduuid4hexr6   r6   r6   r7   random_uuidH  r   r   falsenamer   c                 C  s^   t | |}| }d}d}||vr+||vr+| tvr&td|  d| d t|  ||v S )N)r   rZ   )r   rY   zget_bool_env_var(z) see non-understandable value=z and treat as false)r[   r\   rR   _warned_bool_env_var_keysloggerwarningadd)r   r   valuetruthy_valuesfalsy_valuesr6   r6   r7   r   O  s   
r   rl   c                 C  @   t | }|d u s| s|S zt|W S  ty   | Y S w r3   )r[   r\   striprl   rt   r   r   r   r6   r6   r7   get_int_env_varc     

r           floatc                 C  r   r3   )r[   r\   r   r   rt   r   r6   r6   r7   get_float_env_varn  r   r   backendc                 C  s   | dvS )N)torch_native	intel_amxr6   )r   r6   r6   r7   support_tritony  r   r   "SGLANG_ENABLE_TORCH_INFERENCE_MODEc                      sf   e Zd ZdZedddZd fdd	Zerdnd	f fd
d	ZdddZ	dddZ
dddZ  ZS )DynamicGradModez
    A combination of torch.no_grad and torch.inference_mode,
    with their behavior controlled by an environment variable. Just refer to them.
    moder2   c                 C  s    t | tr	| ad S td d S )Nzmode is not a boolean object)
isinstancer2   _ENABLE_TORCH_INFERENCE_MODEr   r   )r   r6   r6   r7   set_inference_mode  s   
z"DynamicGradMode.set_inference_modeTc                   s,   t j s
t   tr|| _d S d| _d S r   )r4   _jit_internalis_scriptingsuper__init__r   r   prev)selfr   	__class__r6   r7   r     s
   



zDynamicGradMode.__init__Nc                   s(   |d u s	t |trt | S |  |S r3   )r   r2   r   __new__)clsmode_or_orig_funcr   r6   r7   r     s   
zDynamicGradMode.__new__r1   Nonec                 C  s:   t rtj| j| _| j  d S t | _t	d d S r   )
r   r4   _C_InferenceModer   _inference_mode_context	__enter__is_grad_enabledr   set_grad_enabledr   r6   r6   r7   r     s
   
zDynamicGradMode.__enter__exc_typer   	exc_value	tracebackc                 C  s(   t r| j||| d S t| j d S r3   )r   r   __exit__r4   r   r   r   r   r   r   r6   r6   r7   r     s   zDynamicGradMode.__exit__'DynamicGradMode'c                 C  s   t r| | jS |  S )z-
        Create a copy of this class
        )r   r   r   r   r6   r6   r7   clone  s   zDynamicGradMode.clone)r   r2   Tr1   r   )r   r   r   r   r   r   r1   r   )r1   r   )__name__
__module____qualname____doc__staticmethodr   r   r   r   r   r   r   __classcell__r6   r6   r   r7   r     s    

r   c                   C  s   da d S NT)show_time_costr6   r6   r6   r7   enable_show_time_cost     r   c                   @  s&   e Zd Zd
ddZdd Zdd Zd	S )TimeInfo皙?r   c                 C  s(   || _ || _|| _|| _d| _d| _d S Nr   )r   intervalcolorindentacc_timelast_acc_time)r   r   r   r   r   r6   r6   r7   r     s   
zTimeInfo.__init__c                 C  s"   | j | j | jkr| j | _dS dS NTF)r   r   r   r   r6   r6   r7   check  s   zTimeInfo.checkc                 C  sJ   t d| j ddd t d| j d dd t | j d| jdd	 d S )
Nz[m )end-r{   : z.3fzs[0m)printr   r   r   r   r   r6   r6   r7   pretty_print  s   zTimeInfo.pretty_printNr   r   r   )r   r   r   r   r   r   r6   r6   r6   r7   r     s    
	r   r   c                 C  sN   t sd S tj  t| d d u rt| |||t| < t|   jt	 8  _d S r3   )
r   r4   r;   synchronize
time_infosgetr   r   timeperf_counter)r   r   r   r   r6   r6   r7   
mark_start  s   
r   c                 C  sH   t sd S tj  t|   jt 7  _t|   r"t|  	  d S d S r3   )
r   r4   r;   r   r   r   r   r   r   r   )r   r6   r6   r7   mark_end  s   
r   c                       fdd}|S )Nc                   s    fdd}|S )Nc                    sf   t j  rt } | i |}t j  r1t | d }|kr1td j d| d |S )N  	Function z took z ms to run.)r4   r;   r   r   r   r   r   )argskwargs
start_timeresult	cost_time)funcmin_cost_msshowr6   r7   
inner_func  s   

z3calculate_time.<locals>.wrapper.<locals>.inner_funcr6   )r  r	  r  r  r  r7   wrapper  s   zcalculate_time.<locals>.wrapperr6   )r  r  r  r6   r
  r7   calculate_time  s   r  Tc                 C  s  | dkrEt j }||k sJ t j |kr%td| dt j  dd |r,t j  t j|}|jr<t	 j
}nt j|\}}n| dkrt j }||k sTJ t j |krjtd| dt j  dd |rqt j  t j }	t j|j}
|
|	 }n| dkrt j }||k sJ t j |krtd| dt j  dd	 t j \}}
n| d
krt	 j
}tt }t|| d}nz| dkrt j }||k sJ t j |krtd| dt j  dd |rt j  t j \}}
nF| dkr?t j }||k s
J t j |kr!td| dt j  dd |r)t j  t j|}|jr8t	 j
}t j \}}
|rZt j|t jd}t jj|t jjj|d | }|d S )z
    Get available memory for cuda:gpu_id device.
    When distributed is True, the available memory is the minimum available memory of all GPUs.
    r;   zWARNING: current device is not z, but z, zAwhich may cause useless memory allocation for torch CUDA context.rD   z@which may cause useless memory allocation for torch XPU context.r@   z@which may cause useless memory allocation for torch HPU context.rN   r   rF   z@which may cause useless memory allocation for torch NPU context.r_   zAwhich may cause useless memory allocation for torch MUSA context.re   )opgroup   @)r4   r;   device_countcurrent_devicer   empty_cacher   is_integratedpsutilvirtual_memory	availablemem_get_inforD   memory_allocatedtotal_memoryr@   lenget_cpu_ids_by_noderoundrF   r_   tensorfloat32distributed
all_reduceReduceOpMINitem)ro   gpu_idr!  r  	cpu_groupnum_gpuspropsfree_gpu_memory_used_memorytotal_gpu_memorytotal_free_memoryn_numa_noder  r6   r6   r7   get_available_gpu_memory  s   














r0  c                   C  s
   t j S r3   )r4   r;   r<   r6   r6   r6   r7   is_pin_memory_availabled     
r1  c                   @  s   e Zd Zd
ddZd	S )LayerFnidxrl   prefixr   r1   torch.nn.Modulec                 C     d S r3   r6   )r   r4  r5  r6   r6   r7   __call__j      zLayerFn.__call__N)r4  rl   r5  r   r1   r6  )r   r   r   r8  r6   r6   r6   r7   r3  h  s    r3  r   num_hidden_layerslayer_fnpp_rankOptional[int]pp_sizer5  return_tupleoffloader_kwargsOptional[Dict[str, Any]] Tuple[torch.nn.Module, int, int]c                   s   ddl m} ddlm  ddlm} |r| |ksJ |dur(|dur(|| ||nd| f\}	}
tj fddt	|	D | j
fdd	t	|	|
D fi |pPi   fd
dt	|
| D  }|du sj|du rl|S ||	|
fS )z3Make a list of layers with the given layer functionr   )get_pp_indices)PPMissingLayerget_offloaderNc                      g | ]} d qS )r?  r6   .0r+  rD  r?  r6   r7   
<listcomp>  s    zmake_layers.<locals>.<listcomp>c                 3  "    | ]} |t |d V  qdS )r4  r5  N
add_prefixrJ  r4  r;  r5  r6   r7   	<genexpr>  
    
zmake_layers.<locals>.<genexpr>c                   rG  rH  r6   rI  rK  r6   r7   rL    s    )sglang.srt.distributedrC  sglang.srt.layers.utilsrD  sglang.srt.utils.offloaderrF  r4   r#   
ModuleListrangewrap_modules)r:  r;  r<  r>  r5  r?  r@  rC  rF  start_layer	end_layermodulesr6   )rD  r;  r5  r?  r7   make_layersm  s<   	
r^  torch.nn.ModuleListc                   s8   ddl m} tj|  fddt| D }|S )Nr   rE  c                 3  rM  rN  rO  rQ  rR  r6   r7   rS    rT  z%make_layers_non_pp.<locals>.<genexpr>)rW  rF  r4   r#   rX  rZ  rY  )r:  r;  r5  rF  layersr6   rR  r7   make_layers_non_pp  s   ra  c                   C  s   t  S r3   )r4   rr   r6   r6   r6   r7   rr     r   rr   seedr   c                 C  s>   t |  tj |  t|  tj rtj|  dS dS )z&Set the random seed for all libraries.N)randomrb  npr4   manual_seedr;   r<   manual_seed_all)rb  r6   r6   r7   set_random_seed  s   


rg  portOptional[psutil.Process]c              	   C  sJ   t jddD ]}|jj| kr"z	t |jW   S  t jy!   Y qw qd S )Ninet)kind)r  net_connectionsladdrrh  ProcesspidNoSuchProcess)rh  connr6   r6   r7   find_process_using_port  s   rr     	port_name	timeout_sraise_exceptionc                 C  s   t |D ]S}t| r dS |dkrR|d dkrRt| }|d u r(td|  d |j}| d| d| d	| d
|	}t	d|  d| d| d|  t
d q|rit| d|  d| d| dS )NTr      r   z	The port z; is in use, but we could not find the process that uses it.z. is used by a process already. process.name()=z' process.cmdline()=z process.status()=z pid=zport z is in use. Waiting for z seconds for z to be available. r   z at z is not available in z
 seconds. F)rY  is_port_availablerr  r   r   ro  r   cmdlinestatusinfor   sleeprt   )rh  rt  ru  rv  iprocessro  error_messager6   r6   r7   wait_port_available  s(   
,r  c              	   C  s   t  t jt j@}z|t jt jd |d| f |d W W d   dS  t jy8   Y W d   dS  t	yG   Y W d   dS w 1 sKw   Y  dS )z#Return whether a port is available.r/   r   NTF)
socketAF_INETSOCK_STREAM
setsockopt
SOL_SOCKETSO_REUSEADDRbindlistenerrorOverflowErrorrh  sr6   r6   r7   rx    s   
rx  c                  C  s   z't  t jt j} | d |  d W  d    W S 1 s w   Y  W d S  tyV   t  t jt j} | d |  d W  d     Y S 1 sNw   Y  Y d S w )Nr   r   r/   )r  r  r  r  getsocknameOSErrorAF_INET6)r  r6   r6   r7   get_free_port  s   

(

*r  c                 C  s  ddl m} tj| dd}g }d}|dv sJ d|dkr|d}|t|d k r{|| d	kro||d
  dkro||d  dkro||d  dkro||d  dkro||d  dkro||d  dkro||d  dkro|| |d7 }n|d
7 }|t|d k s'n.d}|t|d
 k r|| dkr||d
  dkr|| |d7 }n|d
7 }|t|d
 k sg }|D ]1}||d
 t|k r|||d
  nt|}||| }	|t|	}
t	
|
}|| q|rt	j|dd|
jfS t	
g dfS )Nr   r    TvalidatePNG)r  JPEGz+FRAME_FORMAT must be either 'PNG' or 'JPEG'      r/   P   r{   N   r   G         rw  r         r         axisri   )PILr!   pybase64	b64decoder  appendrs   openr   rd  arraystacksize)video_base64r!   video_bytes
img_startsframe_formatr}  frames	start_idxend_idx	img_bytesimgframer6   r6   r7   decode_video_base64  sV   






r  
audio_filesrmono
np.ndarrayc           
      C  sh  dd l }ddlm} |d u rd}t| tr|t| \}}nn| dr:| dd } |tt	j
| dd\}}nS| d	sD| d
rettdd}tj| d|d}t|j} |  || \}}n(| drytt| j} || \}}nt| tr|| \}}ntd|  ||krtt|t| | }	|||	}|rt|jdkrtj|dd}|S )Nr   )resamplei>  data:,r/   Tr  http://https://REQUEST_TIMEOUT5streamtimeoutfile://zInvalid audio format: r  )	soundfilescipy.signalr  r   bytesreadr   
startswithrm   r  r  rl   r[   r\   requestsr   contentcloser   r   pathr   rt   r  r   shaperd  mean)
r  r  r  sfr  audiooriginal_srr  responsenum_samplesr6   r6   r7   
load_audioK  s:   






r  c                   @  s.   e Zd ZU ded< dZded< dZded< dS )		ImageDatar   urlautoz(Optional[Literal['auto', 'low', 'high']]detailNr=  max_dynamic_patch)r   r   r   __annotations__r  r  r6   r6   r6   r7   r  x  s   
 r  
image_file)Union[Image.Image, str, ImageData, bytes]#tuple[Image.Image, tuple[int, int]]c                 C  s  t | tr| j} d  }}t | tjr| }|j|jf}||fS t | tr.tt| }||fS | 	ds8| 	dret
tdd}tj| d|d}z|  t|j}|  W |  ||fS |  w | 	drztt| j} t| }||fS |  drt| }||fS | 	d	r| d
d } tttj| dd}||fS t | trtttj| dd}||fS td|  )Nr  r  r  3Tr  r  pngjpgjpegwebpgifr  r  r/   r  Invalid image: )r   r  r  r!   widthheightr  r  r   r  rl   r[   r\   r  r   raise_for_statusrawloadr  r   r   r  rR   endswithrm   r  r  r   rt   )r  image
image_sizer  r  r6   r6   r7   
load_image  sF   






	

r  Union[str, bytes]c                 C  s$  t | tr| S | ds| dr#ttdd}tj| |d}|jS | drJt	t
| j} t| d}| W  d    S 1 sCw   Y  d S |  drlt| d}| W  d    S 1 sew   Y  d S | d	r| d
d } tj| ddS t | trtj| ddS td|  )Nr  r  r  r  r  r  rbr  r  r  r/   Tr  r  )r   r  r  rl   r[   r\   r  r   r  r   r   r  r  r  rR   r  rm   r  r  r   NotImplementedError)r  r  r  fr6   r6   r7   get_image_bytes  s*   

$$

r  
video_fileuse_gpuc              	   C  sx  ddl m}m}m} zddlm} |d}||}W n ty(   |d}Y nw d }d }	zt| t	rKt
jddd}||  |  ||j|d}	nt| tr| drttd	d
}
tj| d|
d}|  t
jddd}|jddD ]}|| qv|  ||j|d}	n| dr| dd\}}tj|dd}t
jddd}|| |  ||j|d}	n\| drtt| j} || |d}	nItjtt| jr|| |d}	n7tj| dd}t
jddd}|| |  ||j|d}	nt| tt t!j"t#j$fr	| }	n	t%dt&|  |	W |r&tj'|jr't(|j S S S |r:tj'|jr;t(|j w w w )Nr   )r-   rN   gpu)decord_bridgeFz.mp4)deletesuffixctx)r  r  r  10Tr  i    )
chunk_sizer  r  r/   r  r  zUnsupported video input type: ))decordr-   rN   r  decord.bridger  get_ctx_device	Exceptionr   r  tempfileNamedTemporaryFilewriter  r   r   r  rl   r[   r\   r  r   r  iter_contentrm   r  r  r   r   r  isfilelistrj   r4   Tensorrd  ndarrayrt   rq   existsunlink)r  r  r-   rN   r  r  r  r+  tmp_filevrr  r  chunkencodedr  r6   r6   r7   
load_video  sh   







r  video'VideoReader'desired_fps
max_frames	list[int]c                C  s   t | }|dksJ d||   }t||  }t|| }t|||}td|}||kr5tt|S tj	d|d |t
d S )Nr   z"Video must have at least one framer/   r  )r  get_avg_fpsminmathfloormaxr	  rY  rd  linspacerl   tolist)r  r  r  total_framesdurationfps
num_framesr6   r6   r7   sample_video_frames  s   
r#  c           	      C  s   ddl m}m} tj| std|  d g S |dkrg S dd }|| |dd}t|	 d }d	d
 t
dt||D }|d urOt||krO|||}|| }dd
 |D }|S )Nr   )r-   rN   zVideo z does not existc                   s4   t |   fddt|D }fdd|D S )Nc                   s    g | ]}t |   d   qS )r{   )rl   rJ  r}  )gapr6   r7   rL    s     z8encode_video.<locals>.uniform_sample.<locals>.<listcomp>c                   s   g | ]} | qS r6   r6   r$  )lr6   r7   rL        )r  rY  )r&  nidxsr6   )r%  r&  r7   uniform_sample  s   z$encode_video.<locals>.uniform_sampler  r/   c                 S     g | ]}|qS r6   r6   r$  r6   r6   r7   rL        z encode_video.<locals>.<listcomp>c                 S  s   g | ]
}t |d qS )uint8)r!   	fromarrayastyperJ  vr6   r6   r7   rL  !  s    )r   r-   rN   r[   r  r  r   r  r  r  rY  r  	get_batchasnumpy)	
video_pathframe_count_limitr-   rN   r*  r  
sample_fpsframe_indicesr  r6   r6   r7   encode_video
  s   
r8  c                   C  s4   t jdtdd t jddtd t jddtd dS )z9Suppress known noisy warnings from third-party libraries.ignorez%The given NumPy array is not writable)categorymessagez$The cuda.cudart module is deprecated)r;  r:  z#The cuda.nvrtc module is deprecatedN)warningsfilterwarningsUserWarningFutureWarningr6   r6   r6   r7   suppress_noisy_warnings%  s   
r@  c                  C  sr   t   zddlm}  W n
 ty   Y d S w | tj tdtj tdtj tdtj d S )Nr   )r   z,vllm.distributed.device_communicators.pyncclz3vllm.distributed.device_communicators.shm_broadcastzvllm.config)	r@  vllm.loggerr   ra   setLevelloggingWARN	getLoggerERROR)vllm_default_loggerr6   r6   r7   suppress_other_loggers6  s   

rH  r   min_versionr;  c                 C  sh   z t | }t|t|k rt|  d| d| d| W d S  ty3   t|  d| d| w )Nz is installed with version z2, which is less than the minimum required version . z with minimum required version z is not installed. )r
   pkg_versionparser  r	   )r   rI  r;  installed_versionr6   r6   r7   assert_pkg_versionH  s$   rN  c                 C  s4   zt | }t|t|kW S  ty   Y dS w )aE  
    Check if a package is installed and meets the minimum version requirement.

    Args:
        pkg: Package name (distribution name, e.g., "flashinfer-python")
        min_version: Minimum version required (e.g., "0.6.3")

    Returns:
        True if package is installed and version >= min_version, False otherwise
    F)r
   rK  rL  r	   )r   rI  rM  r6   r6   r7   r   W  s   r   include_parentskip_pidc              	   C  s   | du r
t  } d}zt| }W n tjy   Y dS w |jdd}|D ]}|j|kr-q%z|  W q% tjy=   Y q%w |rhz| t  krP|  t	d |  |
tj W dS  tjyg   Y dS w dS )z-Kill the process and all its child processes.NFT)	recursiver   )r[   getpidr  rn  rp  childrenro  killsysexitsend_signalsignalSIGQUIT)
parent_pidrO  rP  itselfrS  childr6   r6   r7   kill_process_treei  s8   

r]  c                  C  sN   ddl m  m  m  m}  t| ddd  ddlm} t|ddd  dS )	z
    Monkey patch the slow p2p access check.
    NOTE: We assume the p2p access is always allowed, which can be wrong for some setups.
    r   Ngpu_p2p_access_checkc                  _     dS r   r6   )argr  r6   r6   r7   <lambda>  r9  z/monkey_patch_p2p_access_check.<locals>.<lambda>)CustomAllreduce__del__c                  _  r7  r3   r6   r  r  r6   r6   r7   ra    r9  )Csglang.srt.distributed.device_communicators.custom_all_reduce_utilssrtr!  device_communicatorscustom_all_reduce_utilssetattr=sglang.srt.distributed.device_communicators.custom_all_reducerb  )tgtrb  r6   r6   r7   monkey_patch_p2p_access_check  s   rl    c              
   C  s   t j}t |\}}|| k r3z
t || |f W n ty2 } ztd|  W Y d }~nd }~ww t j}t |\}}d|  }||k rlzt |||f W d S  tyk } ztd|  W Y d }~d S d }~ww d S )NzFail to set RLIMIT_NOFILE:    zFail to set RLIMIT_STACK: )resourceRLIMIT_NOFILE	getrlimit	setrlimitrt   r   r   RLIMIT_STACK)target_soft_limitresource_typecurrent_softcurrent_hardetarget_soft_limit_stack_sizer6   r6   r7   
set_ulimit  s,   
rz  msgc                 C  s2   ddl m}m} | r| dkrt|  d S d S )Nr   )get_tensor_model_parallel_rankmodel_parallel_is_initialized)rU  r|  r}  r   r{  )r{  r|  r}  r6   r6   r7   	rank0_log  s   r~  c                 C  s   t d }r:t j|std| dt|dd}t| }W d    n1 s-w   Y  t	j
| d S tj rAdnd}d| | d	}t	jtt	| j |d
dd d S )NSGLANG_LOGGING_CONFIG_PATHz1Setting SGLANG_LOGGING_CONFIG_PATH from env with z but it does not exist!utf-8encodingz.%(msecs)03dr   z[%(asctime)sz] %(message)s%Y-%m-%d %H:%M:%ST)levelformatdatefmtforce)r[   r\   r  r  r  r  orjsonloadsr  rC  config
dictConfigr+   SGLANG_LOG_MSr   basicConfigrd   	log_levelupper)server_argsr5  r  filecustom_configmaybe_msr  r6   r6   r7   configure_logger  s(   
r  model	nn.Modulemodule_name
new_modulec                 C  s<   |  d|ddd }|dd }t||| |S )z1Replace a submodule in a model with a new module.rh   N)get_submodulejoinrm   ri  )r  r  r  parenttarget_namer6   r6   r7   replace_submodule  s   r  weighttorch.Tensorweight_attrsc                 C  sF   |du rdS |  D ]\}}t| |rJ d| t| || q
dS )a  Set attributes on a weight tensor.

    This method is used to set attributes on a weight tensor. This method
    will not overwrite existing attributes.

    Args:
        weight: The weight tensor.
        weight_attrs: A dictionary of attributes to set on the weight tensor.
    Nz'Overwriting existing tensor attribute: )itemsrA   ri  )r  r  keyr   r6   r6   r7   set_weight_attrs  s   r  data	List[Any]rank
dist_group(Optional[torch.distributed.ProcessGroup]srcforce_cpu_devicec           
      C  sD  t t j r|sdnt r|sdnd}||krft| dkr3t jdgt j|d}tj	|||d | S t
| }t|}t tj|tjd|}	t j|gt j|d}tj	|||d tj	|	||d | S t jdgt j|d}tj	|||d | }|dkrg S t j|t j|d}	tj	|	||d t|	  }t
|} | S )zBroadcast inputs from src rank to all other ranks with torch.dist backend.
    The `rank` here refer to the source rank on global process group (regardless
    of dist_group argument).
    r;   r_   rN   r   re   ro   r  r  r  )r4   ro   r;   r<   rb   r  r  longdist	broadcastpickledumps
ByteTensorrd  
frombufferr-  tor%  emptyr  rN   numpyr  )
r  r  r  r  r  ro   tensor_sizeserialized_datar  tensor_datar6   r6   r7   broadcast_pyobj  sB   

r  r  dst
async_sendc                 C  s  ddl m} |rtj}ntj}||krxg }t| dkr7tjdgtjd}	||	||d}
|r5|	||
|	 |S t
| }t|}ttj|tjd}tj|gtjd}	||	||d}
|re|	||
|	 ||||d}
|rv|	||
| |S ||krtjdgtjd}	tj|	||d}
|
  |	 }|dkrg S tj|tjd}tj|||d}
|
  t|  }t
|} | S g S )z#Send data from src to dst in group.r   )P2PWorkr  )r  r  )%sglang.srt.distributed.parallel_stater  r  isendsendr  r4   r  r  r  r  r  r  rd  r  r-  irecvwaitr%  r  r  rN   r  r  )r  r  r  r  r  r  r  	send_func	p2p_worksr  workr  r  r  r6   r6   r7   point_to_point_pyobj%  s`   	

r  r  )	data_sizec             
   G  s   t jddd ttjtjgddddA}t| - tdt dd}t	
d|i| W d	   n1 s5w   Y  || }W d	   n1 sHw   Y  W d	   n1 sWw   Y  |d
|  dt d td7 a|S )a   
    Args:
        name (string): the name of recorded function.
        func: the function to be profiled.
        args: the arguments of the profiled function.
        data_size (int): some measurement of the computation complexity.
            Usually, it could be the batch size.
    traceTexist_ok)
activitiesrecord_shapesprofile_memory
with_stackztrace/size_z.jsonwr  Nztrace/r+  r/   )r[   makedirsr&   r%   CPUCUDAr'   r  step_counterjsondumpexport_chrome_trace)r   r  r  r  profr  r  r6   r6   r7   pytorch_profilel  s&   



r  contextzmq.Contextsocket_typezmq.SocketTypeendpointOptional[str]r  )Union[zmq.Socket, Tuple[int, zmq.Socket]]c                 C  st   |  |}|du rt|| |d}||fS |ddkr%|tjd t|| |r3|| |S || |S )a  Create and configure a ZeroMQ socket.

    Args:
        context: ZeroMQ context to create the socket from.
        socket_type: Type of ZeroMQ socket to create.
        endpoint: Optional endpoint to bind/connect to. If None, binds to a random TCP port.
        bind: Whether to bind (True) or connect (False) to the endpoint. Ignored if endpoint is None.

    Returns:
        If endpoint is None: Tuple of (port, socket) where port is the randomly assigned TCP port.
        If endpoint is provided: The configured ZeroMQ socket.
    Ntcp://*[r  r/   )	r  config_socketbind_to_random_portfindr  zmqIPV6r  connect)r  r  r  r  r  rh  r6   r6   r7   get_zmq_socket  s   





r  hostTuple[int, zmq.Socket]c                 C  s8   |  |}t|| |rd| nd}||}||fS )ah  Create and configure a ZeroMQ socket.

    Args:
        context: ZeroMQ context to create the socket from.
        socket_type: Type of ZeroMQ socket to create.
        host: Optional host to bind/connect to, without "tcp://" prefix. If None, binds to "tcp://*".

    Returns:
        Tuple of (port, socket) where port is the randomly assigned TCP port.
    tcp://r  )r  r  r  )r  r  r  r  	bind_hostrh  r6   r6   r7   get_zmq_socket_on_host  s
   


r  c                   s   t  }|jd }|jd }|dkr|dkrtd nd  fdd} fdd	}|tjkr5|  d S |tjkr?|  d S |tjtj	tj
fv rQ|  |  d S td
| )Nr         g      Ar  c                          tjd  tj  d S r   )r  r  SNDHWMSNDBUFr6   buf_sizer  r6   r7   set_send_opt     z#config_socket.<locals>.set_send_optc                     r  r   )r  r  RCVHWMRCVBUFr6   r  r6   r7   set_recv_opt  r  z#config_socket.<locals>.set_recv_optzUnsupported socket type: )r  r  totalr  rl   r  PUSHPULLDEALERREQREPrt   )r  r  mem	total_memavailable_memr  r  r6   r  r7   r    s    







r  c                 C  s   ddl m} | dkrd S tj| dd |jtju r| }| 	 }tj
| d| d}td| d|j  t|| d S )	Nr   r|  Tr  pytorch_dump_z.npyzDump a tensor to z
. Shape = )rU  r|  r[   r  re   r4   bfloat16r   rN   r  r  r  r   r{  r  rd  save)dirpathr   r   r|  output_filenamer6   r6   r7   dump_to_file  s   
r
  c                   C  s   t jdS )Nz3.)triton__version__r  r6   r6   r6   r7   is_triton_3  r   r  c                    r   )zy
    torch.compile does not work for triton 2.2.0, which is needed in xlm1's jax.
    Therefore, we disable it here.
    c                   s   t  rtj i | S | S r3   )r  r4   compiler  rd  r6   r7   	decorator  s   z&maybe_torch_compile.<locals>.decoratorr6   )r  r  r  r6   rd  r7   maybe_torch_compile  s   r  c              
   C  sN   zt |  W d S  ty& } ztd|  d|j  W Y d }~d S d }~ww )Nz	Warning: z : )shutilrmtreer  r   strerror)r  rx  r6   r6   r7   delete_directory  s   $r  ztempfile.TemporaryDirectoryprometheus_multiproc_dirc                   C  sV   dt jv rtd tjt jd dan
t atjt jd< tdt jd   d S )NPROMETHEUS_MULTIPROC_DIRz+User set PROMETHEUS_MULTIPROC_DIR detected.)dirzPROMETHEUS_MULTIPROC_DIR: )r[   environr   debugr  TemporaryDirectoryr  r   r6   r6   r6   r7   set_prometheus_multiproc_dir  s   

r  c                 C  sP   ddl m}m}m} | }|| td||d}td|_| j	
| d S )Nr   )CollectorRegistrymake_asgi_appmultiprocessz/metrics)registryz^/metrics(?P<path>.*)$)prometheus_clientr  r  r  MultiProcessCollectorr"   rer  
path_regexroutesr  )appr  r  r  r  metrics_router6   r6   r7   add_prometheus_middleware#  s   
r'  c                   @  s(   e Zd Zdd Zd
ddZd
ddZd	S )RefCountedGaugec                 C  s   || _ i | _d S r3   )_gauge	_refcount)r   gauger6   r6   r7   r   1     
zRefCountedGauge.__init__r  r   c                 C  s8   || j v r| j |  d7  < d S d| j |< | j  d S Nr/   )r*  r)  incr   r  r6   r6   r7   r.  5  s   

zRefCountedGauge.incc                 C  sH   || j v r | j |  d8  < | j | dkr"| j |= | j  d S d S d S )Nr/   r   )r*  r)  decr/  r6   r6   r7   r0  <  s   
zRefCountedGauge.decN)r  r   )r   r   r   r   r.  r0  r6   r6   r6   r7   r(  0  s    
r(  c                   sz   ddl m}m} |ddddgd |dd	g d
d|ddddgddt|dddd| d fdd}d S )Nr   )CounterGaugezsglang:http_requests_totalz4Total number of HTTP requests by endpoint and methodr  method)r   documentation
labelnameszsglang:http_responses_totalz:Total number of HTTP responses by endpoint and status code)r  status_coder3  zsglang:http_requests_activez(Number of currently active HTTP requestslivesum)r   r4  r5  multiprocess_modezsglang:routing_keys_activez2Number of unique routing keys with active requests)r   r4  r8  httpc              
     s   t | \}}| j}| jd} j||d  j||d  |r)| z(|| I d H }j||t|jd  |W j||d  |rQ| S S j||d  |rc| w w )Nzx-smg-routing-key)r  r3  )r  r3  r6  )	_get_fastapi_request_pathr3  headersr   labelsr.  r   r6  r0  )request	call_nextr  is_handled_pathr3  routing_keyr  http_request_counterhttp_requests_activehttp_response_counterrouting_keys_activer6   r7   track_http_status_codeb  s2   
zHadd_prometheus_track_response_middleware.<locals>.track_http_status_code)r   r1  r2  r(  
middleware)r%  r1  r2  rF  r6   rA  r7   (add_prometheus_track_response_middlewareD  s4   rH  Tuple[str, bool]c                 C  sN   ddl m} | jjD ]}|| j\}}||jkr |jdf  S q
| jjdfS )Nr   )MatchTF)	starlette.routingrJ  r%  r$  matchesscopeFULLr  r  )r=  rJ  routematchchild_scoper6   r6   r7   r:    s   
r:  c                 C  s>   t  t jt j}|t jt jd |d| f |d |S )z1Bind to a specific port, assuming it's available.r/   r   )r  r  r  r  r  r  r  r  )rh  sockr6   r6   r7   	bind_port  s
   
rS  c                  C     z4t jdgt jt jddd} | jdkrtd| j  dd | j dD }|s0t	d	t
|W S  ty?   td
w )NzSrocminfo | grep 'gfx' -A 100 | grep 'Pool 1' -A 5 | grep 'Size:' | awk '{print $2}'Tstdoutstderrshelltextr   zrocm-smi error: c                 S  s&   g | ]}t |d d  d qS )(r   rn  )r   rm   r   rJ  r  r6   r6   r7   rL    s    z.get_amdgpu_memory_capacity.<locals>.<listcomp>
No GPU memory values found.zIrocm-smi not found. Ensure AMD ROCm drivers are installed and accessible.
subprocessrunPIPE
returncoderG   rW  r   rV  rm   rt   r  FileNotFoundErrorr  memory_valuesr6   r6   r7   get_amdgpu_memory_capacity  s,   
	
rf  c                  C  s(   t j rt j \} }| d | S dS )Nr   r   )r4   r;   r<   r|   )majorminorr6   r6   r7   get_device_sm  s   
ri  c                  C  s   zJt jg dt jt jdd} | jdkrtd| j  dd | j dD }|sFt	j
 rBtd	 t	j
 d
 d d W S tdt|W S  tyU   tdw )N)
nvidia-smiz--query-gpu=memory.totalz--format=csv,noheader,nounitsT)rV  rW  rY  r   znvidia-smi error: c                 S  $   g | ]}t d | rt|qS z^\d+(\.\d+)?$r"  rP  r   r   r[  r6   r6   r7   rL        z-get_nvgpu_memory_capacity.<locals>.<listcomp>r\  z]Failed to get GPU memory capacity from nvidia-smi, falling back to torch.cuda.mem_get_info().r/   rn  r]  zInvidia-smi not found. Ensure NVIDIA drivers are installed and accessible.)r_  r`  ra  rb  rG   rW  r   rV  rm   r4   r;   r<   r   r   r  rt   r  rc  rd  r6   r6   r7   get_nvgpu_memory_capacity  s2   


ro  c                  C  rT  )Nzhl-smi --query | grep 'Total'TrU  r   zhl-smi error: c                 S  s   g | ]}t |d d qS ) )r   rm   r[  r6   r6   r7   rL    s    z+get_hpu_memory_capacity.<locals>.<listcomp>r\  r]  zEhl-smi not found. Ensure Habana drivers are installed and accessible.r^  rd  r6   r6   r7   get_hpu_memory_capacity  s*   

rr  c               
   C  sB   zdd l } tj d d d W S  ty  } ztdd }~ww )Nr   r/   rn  z-torch_npu is required when run on npu device.)	torch_npur4   rF   r  ra   )rs  rx  r6   r6   r7   get_npu_memory_capacity  s   rt  c            	   
   C  s(  t jddr	d S tt } | dkrtt jd S z_t	 }d}t
| D ]J}d| d}tt j||d0}| }| }t|d	krV|d
 dkrV|t|d  n
td| d| W d    n1 sjw   Y  q%tt|d }|W S  tttfy   t j|  }t|d  Y S w )NSGLANG_CPU_OMP_THREADS_BINDr   r   i   z/sys/devices/system/node/nodez/meminforr  r{   z	MemTotal:r   zUnexpected format in r   rn  )r[   r  r   r  r  r   r  r  r  r	  rY  r  r  r  readlinerm   r  rl   rt   r  rc  
IndexError)	r/  numa_mem_listfile_prefixnuma_idfile_meminfor  linepartsnuma_memr6   r6   r7   get_cpu_memory_capacity  s2   

r  c                   C  sB   zt j rt j d d d W S td ty    tdw )Nr/   rn  r]  ztorch.xpu is not available.)r4   rD   r<   r  rt   AttributeErrorrG   r6   r6   r6   r7   get_xpu_memory_capacity*  s   
r  c                  C  s   zOt jdgt jt jddd} | jdkrtd| j  dd | j dD }|sKt	t
d	rGt
j rGtd
 t
j d d d W S tdt|W S  tyZ   tdw )Nz}mthreads-gmi --query | grep 'FB Memory Usage' -A 2 | grep 'Total' | awk -F':' '{print $2}' | awk '{print $1}' | sed 's/MiB//'TrU  r   zmthreads-gmi error: c                 S  rk  rl  rm  r[  r6   r6   r7   rL  D  rn  z-get_mtgpu_memory_capacity.<locals>.<listcomp>r\  r_   z_Failed to get GPU memory capacity from mthreads-gmi, falling back to torch.musa.mem_get_info().r/   rn  r]  zRmthreads-gmi not found. Ensure Moore Threads drivers are installed and accessible.)r_  r`  ra  rb  rG   rW  r   rV  rm   rA   r4   r_   r<   r   r   r  rt   r  rc  rd  r6   r6   r7   get_mtgpu_memory_capacity3  s6   


r  c                 C  s   t  rt }|S t rt }|S | dkrt }|S | dkr"t }|S | dkr+t }|S | dkr4t }|S | dkr=t }|S d }|S )Nr@   rF   rN   rD   r_   )	r=   ro  r8   rf  rr  rt  r  r  r  )ro   gpu_memr6   r6   r7   get_device_memory_capacity\  s.   	r  c	                 C  s$  ddl m}	m}
m}m}m}m} |d u s|d u sJ d|d ur1|dks(J d|dks0J dn|d u r7d}| r>|	| } n|	d} |d u rH|}|d u re|||||d}t|\}}}|| |
||}t	d	krkd
nd}|||g | |fd|i||i||d\}}dd t
|D |j|< |S )Nr   )BackendPrefixStore_new_process_group_helper_worlddefault_pg_timeout
rendezvousz*Cannot specify both init_method and store.z*world_size must be positive if using storez(rank must be non-negative if using storezenv://	undefinedr  )r{   r  backend_options
pg_options
group_name)r  	device_idc                 S  s   i | ]}||qS r6   r6   r$  r6   r6   r7   
<dictcomp>  s    z-init_custom_process_group.<locals>.<dictcomp>)"torch.distributed.distributed_c10dr  r  r  r  r  r  nextset_timeouttorch_releaserY  pg_group_ranks)r   init_methodr  
world_sizer  storer  r  r  r  r  r  r  r  r  rendezvous_iteratorpg_options_param_namepgr+  r6   r6   r7   init_custom_process_groupu  sL    	



r  c                   C  s   t dS )NSGLANG_IS_IN_CI)r   r6   r6   r6   r7   crash_on_warnings  r   r  c                 C     t |  d S r3   )r   r   r{  r6   r6   r7   print_warning_once  s   r  c                 C  r  r3   )r   r{  r  r6   r6   r7   print_info_once  s   r  r  c                 C  s   t tdr
tj st rtj| S t tdr#tj r#tj| S t tdr3tj r3tj| S t tdrCtj rEtj| S d S d S )Nr;   rD   r@   rF   )	rA   r4   r;   r<   rb   get_device_namerD   r@   rF   r  r6   r6   r7   r    s   r  c                   C  s   t dd uS )Nhabana_frameworksr   r6   r6   r6   r7   is_habana_available  r9   r  c              
   C  s  t  rt rtd dS td dS ttdr)tj r)| d u r$dS d	| S ttdr>tj
 r>| d kr9dS d	| S t rL| d krGdS d		| S t rxzd
d l}tj rf| d kr`W dS d	| W S W n tyw } ztdd }~ww t r| d krdS d	| S td)Nz8Intel AMX is detected, using CPU with Intel AMX support.zICPU device enabled, using torch native backend, low performance expected.rN   r;   zcuda:{}rD   zxpu:{}rF   znpu:{}r   r@   zhpu:{}zOHabana frameworks detected, but failed to import 'habana_frameworks.torch.hpu'.r_   zmusa:{}z8No accelerator (CUDA, XPU, HPU, NPU, MUSA) is available.)r^   r   r   r{  r   rA   r4   r;   r<   r  rD   rH   r  habana_frameworks.torch.hpur@   ra   rb   rG   )r  r  rx  r6   r6   r7   
get_device  sN   





r  c               	   C  s   t tdr
tj st rztj W S  ty   Y dS w t tdr9tj r9ztj W S  ty8   Y dS w t r[zdd l	} tj
 rLtj
 W S W dS  ttfyZ   Y dS w dS )Nr;   r   rD   )rA   r4   r;   r<   rb   r  rG   rD   r  r  r@   ra   )r  r6   r6   r7   get_device_count  s.   
r  c                 C  s,   t tdr
tj st rtj| jS dS )Nr;   r   )rA   r4   r;   r<   rb   r   multi_processor_countr  r6   r6   r7   get_device_core_count(  s   r  c              
   C  s   d\}}t tdrtj st rtj| \}}t tdr5tj r5tj| d d^}}}d\}}t tdr^tj r^z	d\}}W ||fS  t	y] } z	t
d| d|d }~ww ||fS )NNNr;   rD   r
   rh   r@   z:An error occurred while getting device capability of hpu: )rA   r4   r;   r<   rb   r|   rD   rm   r@   r  rG   )r  rg  rh  r+  rx  r6   r6   r7   r|   /  s*   


r|   c              
   C  s   t tdrtj rdS t tdrPtj rPzdd l}dd l}ddlm} W n t	y6 } zt	dd }~ww | }d|_
| dkrHd	|_
d
|j_|j|d}|S dS )Nr@   hpu_backendrF   r   )CompilerConfigznNPU detected, but torchair package is not installed. Please install torchair for torch.compile support on NPU.zmax-autotunenpugraph_exzreduce-overheadT)compiler_configinductor)rA   r4   r@   r<   rF   torchairMtorchair.ge_concrete_graph.ge_converter.experimental.patch_for_hcom_allreduce torchair.configs.compiler_configr  ra   r   r  run_eagerlyget_npu_backend)r   r  r  rx  r  npu_backendr6   r6   r7   get_compiler_backendI  s*   r  sglangFRAGMENTop_nameop_funcr   mutates_args	List[str]	fake_implOptional[Callable]
target_libOptional[Library]c           
   
   C  sf  ddl }|pt}z t|jdr|jjnd}t|j|r'tt|j|| r'W dS W n ttfy3   Y nw t|j	drC|j	j
||d}nddl}|jj
||}z4|| |  t rb|| |d nt rm|| |d n|| |d	 |dur|| | W dS W dS  ty }	 zd
t|	v rdt|	v rn|	W Y d}	~	dS d}	~	w ty }	 z|	d}	~	ww )a  
    `torch.library.custom_op` can have significant overhead because it
    needs to consider complicated dispatching logic. This function
    directly registers a custom op and dispatches it to the CUDA backend.
    See https://gist.github.com/youkaichao/ecbea9ec9fc79a45d2adce1784d7a9a5
    for more details.

    By default, the custom op is registered to the vLLM library. If you
    want to register it to a different library, you can pass the library
    object to the `target_lib` argument.

    IMPORTANT: the lifetime of the operator is tied to the lifetime of the
    library object. If you want to bind the operator to a different library,
    make sure the library object is alive when the operator is used.

    Note: This function will silently skip registration if the operator
    with the same name is already registered to avoid RuntimeError in
    multi-engine scenarios (e.g., VERL framework).
    r   Nr   r  infer_schema)r  PrivateUse1XPUr  zTried to register an operatorzmultiple times)torch.library
sglang_librA   r   r   opsrd   r  rG   libraryr  torch._custom_op.impl
_custom_opimpldefinerH   rE   _register_faker   )
r  r  r  r  r  r4   my_liblib_name
schema_strr  r6   r6   r7   direct_register_custom_ope  sL   r  tp_sizennodesr&  c              	     s   t  }t|}t||  d}|| }tjdd  | }||   }	|	| }
t tjddkrQdd t|	|
D } fddt|	|
D }tt	||}n
dd t|	|
D }|
| td| d	| d
|
   d S )Nr/   F)logicalc                 S  r+  r6   r6   rJ  idr6   r6   r7   rL    r,  z)set_gpu_proc_affinity.<locals>.<listcomp>c                   s   g | ]}|  qS r6   r6   r  total_pcoresr6   r7   rL    r'  c                 S  r+  r6   r6   r  r6   r6   r7   rL    r,  zProcess z gpu_id z is running on CPUs: )r[   rR  r  rn  r  	cpu_countrY  r	  	itertoolschaincpu_affinityr   r{  )r>  r  r  r&  ro  pnnodes_per_tp_grouptp_size_per_nodenum_cores_bindstart_cpu_id
end_cpu_idlower_cpu_idsupper_cpu_idsbind_cpu_idsr6   r  r7   set_gpu_proc_affinity  s   

$r  xc                 C  s   | j d }| j d }| j d }| }| jtjks| jtjkr2|t|t|d dt|d dd}n5| jtjks>| jtjkrS|t|t|d dt|d dd}n|t|t|d dt|d dd}|	ddd	ddd
}|
 }|j| j  }|S )Nr   r/   r{   r  r  r  r   @   r   rw  )r  re   r4   r  float16viewrl   float8_e4m3fnuzint8permute
contiguous)r  b_n_k_x_r6   r6   r7   permute_weight  s   


**(r  c                   @  s(   e Zd Zed	d
ddZedd ZdS )MultiprocessingSerializerF
output_strr2   c                 C  s@   t  }t||  |d | }|rt|d}|S )a  
        Serialize a Python object using ForkingPickler.

        Args:
            obj: The object to serialize.
            output_str (bool): If True, return a base64-encoded string instead of raw bytes.

        Returns:
            bytes or str: The serialized object.
        r   r  )	ior   r   r  seekr  r  	b64encodedecode)objr  bufoutputr6   r6   r7   	serialize  s   
z#MultiprocessingSerializer.serializec                 C  s*   t | trtj| dd} tt|  S )z
        Deserialize a previously serialized object.

        Args:
            data (bytes or str): The serialized data, optionally base64-encoded.

        Returns:
            The deserialized Python object.
        Tr  )r   r   r  r  SafeUnpicklerr  r   r  r  r6   r6   r7   deserialize	  s   
z%MultiprocessingSerializer.deserializeN)F)r  r2   )r   r   r   r   r   r  r6   r6   r6   r7   r    s
    r  c                      s,   e Zd Zh dZh dZ fddZ  ZS )r  >   peft.torch.types.copyreg.weakref.	builtins.	operator.
functools.
itertools.collections.pickletools.transformers.torch._tensor.torch.storage.huggingface_hub.sglang.srt.utils.sglang.srt.layers.torch.distributed.torch.nn.parameter.torch.autograd.function.torch.distributed.optim.torch.distributed._shard.multiprocessing.reduction.torch._C._distributed_c10d.torch._C._distributed_fsdp.torch.distributed._composable. multiprocessing.resource_sharer.%sglang.srt.weight_sync.tensor_bucket.'sglang.srt.model_executor.model_runner.>	   r[   systemtypesCodeTyper$  FunctionTypecodecsr  builtinsevalr+  execr+  r  r_  r`  r_  Popenc                   s`    |f| j v rtd  d| dt fdd| jD r%t  |S td  d| d)NzBlocked unsafe class loading (rh   z,), to prevent exploitation of CVE-2025-10164c                 3  s    | ]
} d   |V  qdS )rh   Nr  rJ  r5  ru   r6   r7   rS  P	  s    
z+SafeUnpickler.find_class.<locals>.<genexpr>)DENY_CLASSESrG   anyALLOWED_MODULE_PREFIXESr   
find_class)r   ru   r   r   r5  r7   r9  H	  s   zSafeUnpickler.find_class)r   r   r   r8  r6  r9  r   r6   r6   r   r7   r  	  s    &r  c                       fdd}|S )Nc            	        s   t tjr`tjjdd}tjjdd}|   | i |}|  |  |	|}|
dt| dkr9| d nd }|d urDt|nd}|dkrP|| d nd}t d| d| d	 |S  | i |S )
NT)enable_timingindicesr/   r   r   zTransfer time: z ms, throughput: z	 tokens/s)r   isEnabledForrC  DEBUGr4   r;   Eventrecordr   elapsed_timer   r  r  )	r  r  tictocr  elapsedr<  
num_tokens
throughputr  r6   r7   r  ^	  s    
 zdebug_timing.<locals>.wrapperr6   r  r  r6   r  r7   debug_timing\	  s   rH  valc                 C  s   | r| dkrd S | S )Nr   r6   )rI  r6   r6   r7   nullable_strt	  s   rJ  c               
   C  s   z#t  j} d|  }tj|ddddd}td|  d|j  W dS  tjyC } ztd|  d|j	  W Y d}~dS d}~ww )	z-py-spy dump on all scheduler in a local node.zpy-spy dump --native --pid T)rX  capture_outputrY  r   zPyspy dump for PID z:
zPyspy failed to dump PID z	. Error: N)
r  rn  ro  r_  r`  r   r  rV  CalledProcessErrorrW  )ro  cmdr  rx  r6   r6   r7   pyspy_dump_schedulersz	  s   


&rN  c                  C  s8   t jdkrd} td}|| tj d S td d S )Nlinuxr/   z	libc.so.6z8kill_itself_when_parent_died is only supported in linux.)	rU  rP   ctypesCDLLprctlrX  SIGKILLr   r   )PR_SET_PDEATHSIGlibcr6   r6   r7   kill_itself_when_parent_died	  s
   

rV  c                      s,   e Zd ZdZd fdd	Zdd	d
Z  ZS )UvicornAccessLogFilterzFilter uvicorn access logs by request path.

    Notes:
    - Uvicorn access records usually provide `request_line` like: "GET /metrics HTTP/1.1".
    - We defensively fall back to parsing `record.getMessage()` if needed.
    Nc                   s*   t    |pg }tdd |D | _d S )Nc                 s  s    | ]	}|rt |V  qd S r3   )r   rJ  r  r6   r6   r7   rS  	  s    z2UvicornAccessLogFilter.__init__.<locals>.<genexpr>)r   r   rj   excluded_path_prefixes)r   rY  r   r6   r7   r   	  s   
zUvicornAccessLogFilter.__init__r@  logging.LogRecordr1   r2   c                   s*  d  t |dd }|rt| }t|dkr|d   saz| }W n ty-   d }Y nw |ra|d}|dkrA|d|d nd}|dkra|dkra||d | }| }t|dkra|d   sedS t  d v rz	t jpt  W n	 ty   Y nw  ddd	  t	 fd
d| j
D  S )Nrequest_liner{   r/   "r  Tz://?r   c                 3  s    | ]}  |V  qd S r3   r3  r4  r  r6   r7   rS  	  s    

z0UvicornAccessLogFilter.filter.<locals>.<genexpr>)rd   r   rm   r  
getMessager  r  r   r  r7  rY  )r   r@  r[  r  r{  q1q2rlr6   r^  r7   filter	  sB   

zUvicornAccessLogFilter.filterr3   )r@  rZ  r1   r2   )r   r   r   r   r   rc  r   r6   r6   r   r7   rW  	  s    rW  c                 C  sZ   ddl m} d|d d d< d|d d d< d	|d d
 d< d|d d
 d< t||  d S )Nr   )LOGGING_CONFIGz)[%(asctime)s] %(levelprefix)s %(message)s
formattersr   fmtr  r  zR[%(asctime)s] %(levelprefix)s %(client_addr)s - "%(request_line)s" %(status_code)saccess)uvicorn.configrd  $_configure_uvicorn_access_log_filter)r  rd  r6   r6   r7   set_uvicorn_logging_configs	  s   

rj  uvicorn_logging_configdictc                 C  s*  |du rdS d}t |dd}|sdS t|tr|g}dd |D }tt|}|s,dS | di  d|d| d |< | d	i }d
|v re|d
 dg }t|ts\t|}||d
 d< ||vre|| | di }d|v r|d dg }t|tst|}||d d< ||vr|| dS dS dS )a  Configure uvicorn access log path filter into uvicorn LOGGING_CONFIG.

    This optionally filters uvicorn access logs (e.g., suppress noisy /metrics polling).

    Args:
        uvicorn_logging_config: The dict-like LOGGING_CONFIG from uvicorn.
        server_args: Parsed server args object that may contain:
            - uvicorn_access_log_exclude_prefixes (list[str] | tuple[str] | None)
    N!sglang_uvicorn_access_path_filter#uvicorn_access_log_exclude_prefixesc                 S  s   g | ]}|r|qS r6   r6   rX  r6   r6   r7   rL  	  r'  z8_configure_uvicorn_access_log_filter.<locals>.<listcomp>filtersz.sglang.srt.utils.common.UvicornAccessLogFilter)z()rY  handlersrg  loggerszuvicorn.access)	rd   r   r   r	  rl  fromkeys
setdefaultr   r  )rk  r  filter_nameexcluded_prefixesrp  filters_listloggers_cfgr6   r6   r7   ri  	  sH   



ri  c                  C  sB  t d} | d urJt| } 	 z$ttjtj}|d| f | W  d    W S 1 s,w   Y  W n tyH   | d7 } t	d| d |  Y nw qz'ttjtj}|d |
 d W  d    W S 1 sjw   Y  W d S  ty   ttjtj}|d |
 d W  d     Y S 1 sw   Y  Y d S w )NSGLANG_PORTTr   r/   z)Port %d is already in use, trying port %dr  )r[   r\   rl   r  r  r  r  r  r   r{  r  r  r  r6   r6   r7   get_open_port
  s2   
&	

(

*ry  addressc                 C  s&   zt |  W dS  ty   Y dS w r   )	ipaddressIPv6Addressrt   rz  r6   r6   r7   is_valid_ipv6_address.
  s   
r~  c                 C  s   t | r
d|  dS | S )Nr  ])r~  r}  r6   r6   r7   maybe_wrap_ipv6_address6
  s   r  ipc                 C  s   dt |  d| S )Nr  :)r  )r  rh  r6   r6   r7   format_tcp_address<
     r  c                 C  s   | }| d}|dkrtd|d |d  }t|d| s&td| d }t||d krE||d  dkrA||d d  }ntd|sKtd	z	t|}W ||fS  tyc   td
| dw )Nr  r  z(invalid IPv6 address format: missing ']'r/   zinvalid IPv6 address: r  r{   z4received IPv6 address format: expected ':' after ']'z>a port must be specified in IPv6 address (format: [ipv6]:port)zinvalid port in IPv6 address: '')r  rt   r~  r  rl   )dist_init_addraddrr   r  port_strrh  r6   r6   r7   configure_ipv6@
  s,   

r  c              	     s   dd l dd l}ddlm}m  | }|d fdd}|d fdd}|d	 fd
d}|r=t| t  |j|ddd dd}	|j	|	dfdd}
t
j|
ddd}|  td d  d S )Nr   )FastAPIResponsez/pingc                         ddS )zQCould be used by the checkpoint-engine update script to confirm the server is up.   r6  r6   r6   r  r6   r7   pingh
     
z.launch_dummy_health_check_server.<locals>.pingz/healthc                     r  z$Check the health of the http server.r  r  r6   r6   r  r6   r7   healthm
  r  z0launch_dummy_health_check_server.<locals>.healthz/health_generatec                     r  r  r6   r6   r  r6   r7   health_generater
  r  z9launch_dummy_health_check_server.<locals>.health_generaterw  r  r   )r  rh  timeout_keep_aliveloop
log_configr  )r  c                    st   z-z	    W n ty }  z	td|    d } ~ ww W td d  d S td d  w )Nz+Dummy health check server failed to start: z%Dummy health check server stopped at r  )r`  server  r   r  r{  rx  )asyncior  rh  serverr6   r7   
run_server
  s   2z4launch_dummy_health_check_server.<locals>.run_serverTzhealth-check-server)targetdaemonr   z:Dummy health check server started in background thread at r  )r  uvicornfastapir  r  r   r'  r,   ConfigServer	threadingThreadstartr   r{  )r  rh  enable_metricsr  r  r%  r  r  r  r  r  threadr6   )r  r  r  rh  r  r7    launch_dummy_health_check_server`
  s>   		r  c                  C  sP   t  r&tj } | d  d| d  }| | d dkrdnd tjd< d S d S )Nr   rh   r/   r   ar   FLASHINFER_CUDA_ARCH_LIST)r   r4   r;   r|   r[   r  )
capabilityarchr6   r6   r7   set_cuda_arch
  s   
r  r  bc                 C  s   | |   S )zCeiling division.r6   )r  r  r6   r6   r7   cdiv
  r9   r  r(  c                 C  s   | dkrd| d   > S dS Nr   r/   )
bit_length)r(  r6   r6   r7   next_power_of_2
  s   r  yc                 C  s   | d | d | S r-  r6   r  r  r6   r6   r7   round_up
  r  r  c                   @  s   e Zd Zdd Zdd ZdS )EmptyContextManagerc                 C  s   | S r3   r6   r   r6   r6   r7   r   
     zEmptyContextManager.__enter__c                 C  r7  r3   r6   r   r6   r6   r7   r   
  r  zEmptyContextManager.__exit__N)r   r   r   r   r   r6   r6   r6   r7   r  
  s    r  c                  O  s   t  S r3   )r  rd  r6   r6   r7   empty_context
  s   r  c                 C  s   |s| S | d|  S )a  Add a weight path prefix to a module name.

    Args:
        name: base module name.
        prefix: weight prefix str to added to the front of `name` concatenated with `.`.

    Returns:
        The string `prefix.name` if prefix is non-empty, otherwise just `name`.
    rh   r6   )r   r5  r6   r6   r7   rP  
  s   
rP  r  Union[str, Path]c                 C  s&   t | trdS d}t|| }|duS )zk
    Check if the URL is a remote URL of the format:
    <connector_type>://<host>:<port>/<model_name>
    F(.+)://(.*)N)r   r   r"  rP  r  patternr   r6   r6   r7   is_remote_url
  s
   
r  c                 C  s&   d}t || }|du rdS |dS )z\
    Parse the connector type from the URL of the format:
    <connector_type>://<path>
    r  Nr   r/   )r"  rP  r  r  r6   r6   r7   parse_connector_type
  s
   
r  g       @g      N@c                 C  r_  r   r6   r  r6   r6   r7   ra  
  r9  ra  	max_retryinitial_delay	max_delayshould_retryCallable[[Any], bool]c                 C  s   t  D ]]}z|  W   S  ty     tya } zBt  ||kr'td||s/tdt|d|  |ddt    }t	d| d| d|d	d
|  t
| W Y d }~qd }~ww d S )Nz)retry() exceed maximum number of retries.z2retry() observe errors that should not be retried.r{   g      ?g      ?zretry() failed once (zth try, maximum z retries). Will delay z.2fzs and retry. Error: )r  countr   r  r   	print_excr  rc  r   r   r   r|  )fnr  r  r  r  	try_indexrx  delayr6   r6   r7   retry
  s*   r  
model_pathc                 C  sP   t jt j| drdS zddlm} | }|| dW S  ty'   Y dS w )zCheck if the model path contains hf_quant_config.json file.

    Args:
        model_path: Path to the model, can be local path or remote URL.

    Returns:
        True if hf_quant_config.json exists, False otherwise.
    zhf_quant_config.jsonTr   )HfApiF)r[   r  r  r  huggingface_hubr  file_existsr  )r  r  hf_apir6   r6   r7   has_hf_quant_config  s   	r  
str | Nonec                 C  s"   t | dd}|dur|dS dS )z4Extract quantization method from HuggingFace config.quantization_configNquant_method)rd   r   )	hf_configr  r6   r6   r7   get_quantization_config  s   
r  c                 C  s   t | trdd | D S | gS )Nc                 S  s   g | ]}t |D ]}|qqS r6   )flatten_nested_list)rJ  sublistr%  r6   r6   r7   rL  &  s
    z'flatten_nested_list.<locals>.<listcomp>)r   r	  )nested_listr6   r6   r7   r  $  s
   
r  c                 C  s    | d uo|    o|jd dkS r   )is_idler  )forward_modehidden_statesr6   r6   r7   is_non_idle_and_non_empty-  s
   r  c                 C  s(   |dkrt j| |ddS t j| ||dS )Nr/   T)dimkeepdim)r  )r4   r  topk)valuesr  r  r6   r6   r7   	fast_topk5  s   r  c                 C  s   | d ur|  | | S |S r3   )copy_)r  sourcer6   r6   r7   bind_or_assign>  s   
r  	interfacec              
   C  s,  | pt jdd  } sd S zdd l}W n ty$ } ztd|d }~ww zS|| }|j|v rL||j D ]}|d}|rK|dkrK|dkrK|  W S q5|j|v rs||j D ]}|d}|rr|dsr|dkrr|	d	d   W S qVW d S W d S  t
tfy } zt| d
 W Y d }~d S d }~ww )NSGLANG_LOCAL_IP_NICr   zvEnvironment variable SGLANG_LOCAL_IP_NIC requires package netifaces, please install it through 'pip install netifaces'r  	127.0.0.10.0.0.0zfe80::z::1%z[ Can not get local ip from NIC. Please verify whether SGLANG_LOCAL_IP_NIC is set correctly.)r[   r  r   	netifacesra   ifaddressesr  r  r  rm   rt   r  r   r   )r  r  rx  	addresses	addr_infor  r6   r6   r7   get_local_ip_by_nicF  sJ   





	r  c                  C  s   t  t jt j} z| d |  d W S  ty   Y nw zt  }t |}|r5|dkr5|dkr5|W S W n	 ty?   Y nw zt  t jt j} | d |  d W S  tyc   t	
d Y d S w )N)z8.8.8.8r  r   r  r  )z2001:4860:4860::8888r  zCan not get local ip by remote)r  r  
SOCK_DGRAMr  r  r  gethostnamegethostbynamer  r   r   )r  hostnamer  r6   r6   r7   get_local_ip_by_remotec  s0   


r  fallbackc                 C  sj   t ddpt dd}|r|S td t  }r|S td t  }r(|S td | r1| S td)a  
    Automatically detect the local IP address using multiple fallback strategies.

    This function attempts to obtain the local IP address through several methods.
    If all methods fail, it returns the specified fallback value or raises an exception.

    Args:
        fallback (str, optional): Fallback IP address to return if all detection
            methods fail. For server applications, explicitly set this to
            "0.0.0.0" (IPv4) or "::" (IPv6) to bind to all available interfaces.
            Defaults to None.

    Returns:
        str: The detected local IP address, or the fallback value if detection fails.

    Raises:
        ValueError: If IP detection fails and no fallback value is provided.

    Note:
        The function tries detection methods in the following order:
        1. Direct IP detection via get_ip()
        2. Network interface enumeration via get_local_ip_by_nic()
        3. Remote connection method via get_local_ip_by_remote()
    SGLANG_HOST_IPr   HOST_IPzget_ip failedzget_local_ip_by_nic failedzget_local_ip_by_remote failedzCan not get local ip)r[   r\   r   r  r  r  rt   )r  host_ipr  r6   r6   r7   get_local_ip_auto  s   




r  c                 C  s(   | j d u p| j dko| jdkp| jd u S r-  )speculative_eagle_topk	page_sizer  r6   r6   r7   is_no_spec_infer_or_topk_one  s   

r  c                 C  s2   t | dd }t|tr|sdS h d}|d |v S )NarchitecturesF>   LlamaForCausalLMOlmo2ForCausalLMQwen2ForCausalLMQwen3ForCausalLMGemma2ForCausalLMGlm4MoeForCausalLMMixtralForCausalLMQwen3MoeForCausalLMMiMoV2FlashForCausalLMGlm4vForConditionalGenerationGemma3ForConditionalGenerationGlmOcrForConditionalGenerationLlama4ForConditionalGenerationStepVLForConditionalGenerationQwen3VLForConditionalGenerationStep3VLForConditionalGeneration Glm4vMoeForConditionalGeneration"Qwen3VLMoeForConditionalGenerationr   )rd   r   r	  )r  r  default_archsr6   r6   r7   is_fa3_default_architecture  s
   r  c                   @  s    e Zd Zd	ddZd
ddZdS )BumpAllocatorbuffer_sizerl   c                 C  s   t j|f||d| _d| _d S )Nr  r   )r4   zeros_buffer_pointer)r   r  re   ro   r6   r6   r7   r     s   
zBumpAllocator.__init__r  c                 C  s@   | j | t| jksJ | j| j | j |  }|  j |7  _ |S r3   )r  r  r  )r   r  r  r6   r6   r7   allocate  s   zBumpAllocator.allocateN)r  rl   )r  rl   )r   r   r   r   r  r6   r6   r6   r7   r        
r  c                 C  sT   ddl m} ztj r| dkr| | W d S W d S W d S    | | Y d S )Nr   r  )rU  r|  r4   r!  is_initializedr{  )r   r{  r|  r6   r6   r7   log_info_on_rank0  s   r  c                 C  s2   zt | W S  ty   t t|   Y S w r3   )r  r  r   r   	read_textr  r6   r6   r7   load_json_config  s
   r  c                 C  s4   ddl m} | rdS | tjd| j| jd dS )z
    Dispose a tensor by freeing its memory.
    During piecewise CUDA graph capture/replay, we skip disposal to avoid
    interfering with torch.compile's memory tracking and graph recording.
    r   )is_in_piecewise_cuda_graphNr   )ro   re   )0sglang.srt.compilation.piecewise_context_managerr  set_r4   r  ro   re   )r  r  r6   r6   r7   dispose_tensor  s   	r   Tc                   @  s0   e Zd Zdd ZedddZeddd	Zd
S )Withablec                 C  s
   d | _ d S r3   _valuer   r6   r6   r7   r     r2  zWithable.__init__r1   r!  c                 C     | j S r3   r#  r   r6   r6   r7   r     s   zWithable.value	new_valuec                 c  sN    | j d u sJ || _ zd V  W | j |u sJ d | _ d S | j |u s#J d | _ w r3   r#  )r   r&  r6   r6   r7   
with_value  s   
zWithable.with_valueN)r1   r!  )r&  r!  )r   r   r   r   propertyr   r   r'  r6   r6   r6   r7   r"    s    r"  r  r.   c                 C  s`   ddl m} | jr.| jdksJ d| jdu rdS | jsdS |  r%dS | j| j| j kS dS )z
    Check if the input of MLP is obtained by all-gather rather than all-reduce. This only happens when each MLP TP group contains multiple attention DP groups.
    r   get_moe_a2a_backendr/   zdp_size must be greater than 1NTF)sglang.srt.layers.moe.utilsr*  enable_dp_attentiondp_sizemoe_dense_tp_sizeenable_dp_lm_headis_noner  r  r*  r6   r6   r7   require_mlp_tp_gather  s   


r2  c                 C  sH   ddl m} | jdv sJ |  r| jdkr"| jr | j| jk S dS dS )z7
    Check if the input of attention is scattered.
    r   r)  )r/   Nr/   TF)r+  r*  r.  r0  r,  r-  r  r1  r6   r6   r7   require_attn_tp_gather/  s   r3  c                 C  s   t | pt| S r3   )r2  r3  r  r6   r6   r7   require_gathered_buffer?     r4  c                 C  s   | j pt| S r3   )r,  r4  r  r6   r6   r7   require_mlp_syncC  r   r6  repo_idrevisionc              	   C  s   dd l }tj|jj|jjdg| d}|sBtj|dd}tj|rBt	|}|
  }W d    n1 s=w   Y  |rTtj|d|}tj|rT|S d S )Nr   models/refsmain	snapshots)r  r[   r  r  	constantsHF_HUB_CACHEREPO_ID_SEPARATORrm   r  r  r  r   isdir)r7  r8  hf
cache_pathref_pathr  rev_dirr6   r6   r7   find_local_repo_dirG  s    
rF  
model_namec                 C  sz   z2t | }|r0tj|d}tj|r0t|ddd}| W  d   W S 1 s+w   Y  W dS  ty<   Y dS w )zRead system prompt from a file in the HuggingFace cache directory.

    Args:
        model_name: The model name to construct the file path

    Returns:
        The system prompt content from the file, or empty string if file not found
    zSYSTEM_PROMPT.txtrw  r  r  Nr   )rF  r[   r  r  r  r  r  r  )rG  local_repo_dirsystem_prompt_filer  r6   r6   r7   read_system_prompt_from_file`  s   	"rJ  c                 C  s,   | j t dkr
| S t s| S tjj| S )NrN   )ro   r4   r   r  
sgl_kernelr   r  r6   r6   r7   prepack_weight_if_neededw  s
   rM  c                 C  s$   |  dd dko|  dd dkS )Nr   r  r/   r  )r  rL  r6   r6   r7   dim_is_supported  s   $rN  c           	        sF   fdd|D }t |dksJ d| }|r%t |t |ks%J dt|D ]L\}}t |}t|sTtd|dd|dd	  d
  d	 d _ d S |ra|| ra|j	||  }t
jjt|dd}|j|_t || q)|t
dkot  _ jrt dr jd urt
jj jj dd _d S d S d S d S )Nc                   s   h | ]}t  |jqS r6   )rd   ro   )rJ  weight_namer5  r6   r7   	<setcomp>  s    z0_process_weight_after_loading.<locals>.<setcomp>r/   z,Expects all weights to be on the same devicez8len(weight_names) should be equal to len(transpose_dims)zXExpects weight.size(0) % 16 == 0 and weight.size(1) % 32 == 0 but weight_tensor.size(0)=r   z and weight_tensor.size(1)=z in rJ  z won't use intel amx backend.F)requires_gradrN   bias)r  pop	enumeraterd   rN  r   r   r  r   	transposer4   r#   	ParameterrM  __dict__ri  ro   r   rA   rR  r  r   )	ru   weight_namestranspose_dimsdevicesro   r}  rO  weight_tensorpacked_weightr6   r5  r7   _process_weight_after_loading  sV   

r]  c                   @  s    e Zd ZdddZd	ddZdS )
PackWeightMethodNc                 C  s   || _ || _d S r3   )rX  rY  )r   rX  rY  r6   r6   r7   r     r,  zPackWeightMethod.__init__r1   r   c                 C  s   t || j| j d S r3   )r]  rX  rY  )r   ru   r6   r6   r7   process_weights_after_loading  r  z.PackWeightMethod.process_weights_after_loadingr3   r   )r   r   r   r   r_  r6   r6   r6   r7   r^    r  r^  c                   @  s"   e Zd ZdddZedd ZdS )		LazyValuecreatorr   c                 C  s   || _ d | _d S r3   _creatorr$  )r   ra  r6   r6   r7   r     r,  zLazyValue.__init__c                 C  s    | j d ur|   | _d | _ | jS r3   rb  r   r6   r6   r7   r     s   

zLazyValue.valueN)ra  r   )r   r   r   r   r(  r   r6   r6   r6   r7   r`    s    
r`  	func_pathc                 C  sP   |  d}t|dk rtdd|d d }|d }t|}t||}|S )Nrh   r{   zOfunc_path should contain both module name and func name (such as 'module.func')r  )rm   r  rt   r  r   import_modulerd   )rd  r  module_path	func_nameru   r  r6   r6   r7   dynamic_import  s   


rh  c                  C  s<   dd l } t| d}t| d}t| d}|||fS )Nr   r/   r{   )gcr  get_objects)ri  g0g1g2r6   r6   r7   gc_object_counts  s
   
rn  c                   s*   dd l }i   fdd}|j| d S )Nr   c                   s   | dd}| dkrt  |< d S | dkrDt   |t  }|krFt \}}}td| d|dd| d	| d
| d d S d S d S )N
generationr]  r  stopz.LONG GARBAGE COLLECTION DETECTED | Generation  | Duration: .4fzs | # Objects: gen0=z, gen1=z, gen2=zi | This may cause latency jitter. Consider calling the freeze_gc API after sending a few warmup requests.)r   r   rn  r   warn)phaser{  genr   rk  rl  rm  gc_start_timewarn_threshold_secsr6   r7   gc_callback  s   $z)configure_gc_warning.<locals>.gc_callback)ri  	callbacksr  )rx  ri  ry  r6   rv  r7   configure_gc_warning  s   r{  c                 C  s`   dd l }t \}}}|  t \}}}td|  d| d| d| d| d| d|  d S )Nr   zFreezing GC in z process. gen0: z->z, gen1: z, gen2: )ri  rn  freezer   r{  )r  ri  	g0_before	g1_before	g2_beforeg0_afterg1_afterg2_afterr6   r6   r7   	freeze_gc  s&   r  c                    s2   t d dd l} i   fdd}| j| d S )NzEnable GC Loggerr   c                   s   | dd}| dkrt  |< tdt  d|  d S | dkr]t   |t  }| dd}| dd}td	t  d| d
|dd| d| d|dkrVdnd  d S d S )Nro  r]  r  zGC start: Time z | Generation rp  	collecteduncollectablezGC end: Time rq  rr  zs | Collected: z | Uncollectable: rp  r   z	(LONG GC)r   )r   r   r   r{  )rt  r{  ru  r   r  r  rw  r6   r7   ry    s(   z(configure_gc_logger.<locals>.gc_callback)r   r{  ri  rz  r  )ri  ry  r6   r  r7   configure_gc_logger  s
   
r  c                 C  s   t | || S r3   )ceil_divr  r6   r6   r7   
ceil_align  r   r  c                 C  s   | | d | S r-  r6   r  r6   r6   r7   r  #  r5  r  c               
   C  s   zt jddgdd} W n ty } ztd| d }~ww g }|  D ]}|dsCtt| 	d\}}}}|
||||f q%|S )Nlscpuz-p=CPU,Core,Socket,NodeT)rY  z"Unexpected error running 'lscpu': #r  )r_  check_outputr  rG   
splitlinesr  rk   rl   r   rm   r  )r  rx  cpu_infor~  rN   corer  rv  r6   r6   r7   parse_lscpu_topology'  s   

r  c                  C  s   t  } tt}| D ]\}}}}||f}||| vr||| |< q	t  }i }| D ]\}}	t|	 }
t	|

|}|||< q,|S r3   )r  r   rl  r  rn  r  r  sortedr  setintersection)r  physical_by_noderN   r  r  rv  r  cpus_allowed_listnode_to_cpuscore_to_cpucpusallowed_cpusr6   r6   r7   get_physical_cpus_by_numa;  s"   
r  c                    s    t    fddt D } | S )Nc              	     s$   g | ]}d  ttt | qS )r  )r  rk   r   r  )rJ  rv  r  r6   r7   rL  a  s    z'get_cpu_ids_by_node.<locals>.<listcomp>)r  r  )cpu_idsr6   r  r7   r  ^  s
   
r  c                 C  s0   t  st o| tjtjtjfv o|dko||kS r-  )r   rW   r4   r  r  r   )re   r  
local_sizer6   r6   r7   is_shm_availablei  s   r     c                   s    fdd  fdd}|S )Nc                   s   zt |  | W S  ty]   t| tr"t fdd|  D  Y S t| tr4t fdd| D  Y S t| ttfsGt| t	rTt| t
tfsTt fdd| D  Y S tdt|  w )Nc                 3  $    | ]\}} | |fV  qd S r3   r6   rJ  kr1  _to_hashabler6   r7   rS  z      
z<lru_cache_frozenset.<locals>._to_hashable.<locals>.<genexpr>c                 3      | ]} |V  qd S r3   r6   r0  r  r6   r7   rS  ~      c                 3  r  r3   r6   r0  r  r6   r7   rS    r  zCannot make hashable: )hash	TypeErrorr   rl  	frozensetr  r  r	  rj   r   r   r  rq   )or  r6   r7   r  s  s$   

z)lru_cache_frozenset.<locals>._to_hashablec                   s.   t   t fdd} j|_|S )Nc                    s   t  fdd| D }t fdd| D }||f}|v r)| | S | i |}||< d urDtkrDjdd |S )Nc                 3  r  r3   r6   )rJ  r  r  r6   r7   rS    r  zJlru_cache_frozenset.<locals>.decorator.<locals>.wrapper.<locals>.<genexpr>c                 3  r  r3   r6   r  r  r6   r7   rS    r  F)last)rj   r  r  move_to_endr  popitem)r  r  h_argsh_kwargsr  r  )r  cacher  r0   r6   r7   r    s   
z7lru_cache_frozenset.<locals>.decorator.<locals>.wrapper)r   	functoolswrapsclearcache_clearrG  r  r0   )r  r  r7   r    s
   z&lru_cache_frozenset.<locals>.decoratorr6   )r0   r  r6   r  r7   lru_cache_frozensetr  s   r  c                 C  s   t | |d\}}t|}|}|D ]}||}q|d ur!t||| tj  D ]=\}}	z|d urEt|	|rEtt|	||krEt|	|| W q( t	ye }
 zt
d|	 dt|
  W Y d }
~
q(d }
~
ww d S )NFzIgnore z reports ImportError with:
)parse_module_pathr  ri  rU  r]  copyr  rA   rd   ra   r   r   r   )target_moduletarget_functionwrappersoriginal_moduleoriginal_functionoriginal_function_id	candidater  r  r   rx  r6   r6   r7   apply_module_patch  s,   
&r  c              	     s  ddl m  d fdd	}dd }| d}d }g }t|D ]\}}	d|d |d  }
|dkr:d|d | nd }zt|
}W nc ty   |rQt|nd }|rt||	rt	||	}|rqt||rq|t	||f Y   S |r|r||}t
||| ||f Y   S |rtd	| d
|
 n|s ||
|rt|nd d}Y nw ||	 qtj|  }|d urt||s|r||}t
||| nt
||d  |t	||fS |d fS )Nr   
ModuleSpecc                   sD   t | }d|_ | d|_|tj| < |r t|| dd | |S )z(Create and register a placeholder modulezvllm_ascend.dummy_module.pyNrh   r  )r$  
ModuleType__file____spec__rU  r]  ri  rm   )	full_pathr  dummyr  r6   r7   create_dummy_module  s   

z.parse_module_path.<locals>.create_dummy_modulec                   s    fdd} |_ |S )z-Create dummy function that raises when calledc                    s   t d  d)Nr   z is a placeholder)r  rd  rg  r6   r7   placeholder  r5  zKparse_module_path.<locals>.create_placeholder_function.<locals>.placeholder)r   )rg  r  r6   r  r7   create_placeholder_function  s   z6parse_module_path.<locals>.create_placeholder_functionrh   r/   r   z missing in )r  r3   )importlib.machineryr  rm   rT  r  r   re  ModuleNotFoundErrorrA   rd   ri  r  r  rU  r]  )rf  function_namecreate_dummyr  r  r]  current_moduleprocessed_pathr4  partcurrent_pathparent_pathr  ph_funcfinal_moduler6   r  r7   r    sX   

	


r  c                     0   t jjrt jdj t fdddD S dS )A
    Returns whether the current platform supports MX types.
    r   c                 3      | ]}| v V  qd S r3   r6   rJ  gfxgcn_archr6   r7   rS    r  z!mxfp_supported.<locals>.<genexpr>gfx95Fr4   r
   r5   r;   r   gcnArchNamer7  r6   r6   r  r7   mxfp_supported  s   r  c                     r  )r  r   c                 3  r  r3   r6   r  r  r6   r7   rS    r  z%is_gfx95_supported.<locals>.<genexpr>r  Fr  r6   r6   r  r7   is_gfx95_supported  s   r  )q_projk_projv_projo_proj	gate_projup_proj	down_projqkv_projgate_up_projembed_tokenslm_headallc                   @  sZ   e Zd ZdZddddZddd	Zd ddZd!d"ddZd!d"ddZd#ddZ	dd Z
dS )$ConcurrentCountera  
    An asynchronous counter for managing concurrent tasks that need
    coordinated increments, decrements, and waiting until the count reaches zero.

    This class is useful for scenarios like tracking the number of in-flight tasks
    and waiting for them to complete.
    r   initialrl   c                 C  s   || _ t | _dS )z
        Initialize the counter with an optional initial value.

        Args:
            initial (int): The initial value of the counter. Default is 0.
        N)_countr  	Condition
_condition)r   r  r6   r6   r7   r   7  s   zConcurrentCounter.__init__r1   c                 C  r%  )a  
        Return the current value of the counter.

        Note:
            This method is not synchronized. It may return a stale value
            if other coroutines are concurrently modifying the counter.

        Returns:
            int: The current counter value.
        r  r   r6   r6   r7   r   A  s   zConcurrentCounter.valuer   c                 C  s   d|    dS )z;Return an informative string representation of the counter.z<ConcurrentCounter value=>r   r   r6   r6   r7   __repr__N     zConcurrentCounter.__repr__r/   Tr(  
notify_allr2   c              	     st   | j 4 I dH % |  j|7  _|r"| j   W d  I dH  dS W d  I dH  dS 1 I dH s3w   Y  dS )a  
        Atomically increment the counter by a given amount and notify all waiters.

        Args:
            n (int): The amount to increment the counter by. Default is 1.
            notify_all (bool): Whether to notify all waiters after incrementing. Default is True.
        Nr  r  r  r   r(  r  r6   r6   r7   	incrementR     .zConcurrentCounter.incrementc              	     st   | j 4 I dH % |  j|8  _|r"| j   W d  I dH  dS W d  I dH  dS 1 I dH s3w   Y  dS )a  
        Atomically decrement the counter by a given amount and notify all waiters.

        Args:
            n (int): The amount to decrement the counter by. Default is 1.
            notify_all (bool): Whether to notify all waiters after decrementing. Default is True.
        Nr  r  r6   r6   r7   	decrement_  r  zConcurrentCounter.decrement	conditionCallable[[int], bool]c              	     s^   j 4 I dH  j  fddI dH  W d  I dH  dS 1 I dH s(w   Y  dS )a  
        Asynchronously wait until the counter satisfies a given condition.

        This suspends the calling coroutine without blocking the thread, allowing
        other tasks to run while waiting. When the condition is met, the coroutine resumes.

        Args:
            condition (Callable[[int], bool]): A function that takes the current counter value
                and returns True when the condition is satisfied.
        Nc                     s
    j S r3   r  r6   r  r   r6   r7   ra  x  s   
 z,ConcurrentCounter.wait_for.<locals>.<lambda>)r  wait_for)r   r  r6   r  r7   r  l  s   .zConcurrentCounter.wait_forc                   s   |  dd I dH  dS )z
        Asynchronously wait until the counter reaches zero.

        This suspends the calling coroutine without blocking the thread, allowing
        other tasks to run while waiting. When the counter becomes zero, the coroutine resumes.
        c                 S  s   | dkS r   r6   )r  r6   r6   r7   ra    s    z1ConcurrentCounter.wait_for_zero.<locals>.<lambda>N)r  r   r6   r6   r7   wait_for_zeroz  s   zConcurrentCounter.wait_for_zeroNr  )r  rl   r1   rl   r1   r   )r/   T)r(  rl   r  r2   )r  r  )r   r   r   r   r   r   r  r  r   r  r  r6   r6   r6   r7   r  .  s    



r  c                   C  s   t jdd uS )Ntriton_kernels)r   r   r   r6   r6   r6   r7   is_triton_kernels_available  r  r	  c                 C  s8   dd l m  m} | ^}}||jjkrtd| |S )Nr   zCUDA error: )cuda.bindings.runtimebindingsruntimecudaError_tcudaSuccessr  )
raw_outputcuda_rterrresultsr6   r6   r7   check_cuda_result  s
   r  pytorch_device_idc                 C  s4   t | }tjdd}|r|d}t || S |S )a  
    Convert PyTorch logical device ID to physical device ID.

    When CUDA_VISIBLE_DEVICES is set, maps the logical device ID (as seen by PyTorch)
    to the actual physical device ID. If CUDA_VISIBLE_DEVICES is not set, returns
    the device ID unchanged.

    Args:
        pytorch_device_id: The logical device ID from PyTorch (e.g., torch.cuda.current_device())

    Returns:
        The physical device ID
    CUDA_VISIBLE_DEVICESNr  )rl   r[   r  r   rm   )r  
device_idxcuda_visible_devicesdevice_listr6   r6   r7   get_physical_device_id  s   
r  c               
   C  s   z$t jg ddddd} | j dd }tt|d\}}||fW S  t jtt	fyB } zt
d|  W Y d }~dS d }~ww )	N)rj  z--query-gpu=compute_capz--format=csv,noheaderTrK  rY  r   r\  r   rh   z"Error getting compute capability: ri   )r_  r`  rV  r   rm   rk   rl   rL  rc  rt   r   )r  compute_cap_strrg  rh  rx  r6   r6   r7   get_device_sm_nvidia_smi  s   
r  c                  C  sd   d } dD ]+}zt |} W n ty' } zt|  d } W Y d }~nd }~ww | d ur/ | S q| S )N)z
libnuma.sozlibnuma.so.1)rP  rQ  r  r   r  )libnuma
libnuma_sorx  r6   r6   r7   get_libnuma  s   r  rv  c                 C  sL   t  }|d u s| dk rtd d S |t|  |t|  d S )Nr   z3numa not available on this system, skip bind action)r  numa_availabler   r  numa_run_on_noderP  c_intnuma_set_preferred)rv  r  r6   r6   r7   numa_bind_to_node  s
   r$  c                 C  s0   zt | W S  tjy   td|  dw )NzInvalid JSON list: z#. Please provide a valid JSON list.)r  r  r  r   argparseArgumentTypeErrorr  r6   r6   r7   json_list_type  s   
r'  c                 c  s    t j du st s| V  d S tjd}|r|d}ng }|r&||  nt| }|tjd< t	d|  dV  |rC|tjd< d S tjd= d S )NFr  r  zSet CUDA_VISIBLE_DEVICES to r   )
r+   %SGLANG_ONE_VISIBLE_DEVICE_PER_PROCESSr   r>   r[   r  rm   r   r   r  )r&  original_cuda_visible_devicesr  
str_gpu_idr6   r6   r7   maybe_reindex_device_id  s   
r+  sliding_window_sizechunked_prefill_sizer  c                 C  s   |dt | |  S )Nr{   )r  )r,  r-  r  r6   r6   r7   get_extend_input_len_swa_limit  s   
r.  seq_lensprefix_lensOptional[torch.Tensor]r  c           	      C  s   t d}| j|ksJ |du s|r"|sJ | | dk   S |j|ks)J | | d | }|| d | }|| }t |t j}| S )z
    Get the number of new pages for the given prefix and sequence lengths.
    We use cpu tensors to avoid blocking kernel launch.
    rN   Nr/   )r4   ro   rl   sumr%  r  int64)	r/  r  r0  r  
cpu_devicenum_pages_afternum_pages_beforenum_new_pagessum_num_new_pagesr6   r6   r7   get_num_new_pages	  s   

r9  c                   @  s2   e Zd ZdZdddZdd Zdd Zd	d
 ZdS )CachedKernelz
    Wrapper that allows kernel[grid](...) syntax with caching based on a key function.

    This wrapper caches compiled Triton kernels based on keys extracted by a
    user-provided key function to avoid redundant compilations.
    Nc                 C  s   || _ t|tjjjsJ |j }t|| _t| jj	
 | _t| j| _| jj	 D ]\}}|jtjju s>J d| dq+t| | i | _|| _d S )NzParameter 'zN' has a default value. Default parameters are not supported in cached kernels.)r  r   r  r  jitJITFunctioninspect	signaturerj   
parameterskeysparam_namesr  num_argsr  r   rV  r  r  update_wrapperkernel_cachekey_fn)r   r  rE  original_fnr   paramr6   r6   r7   r   +  s   

zCachedKernel.__init__c                   sP   t  trt dksJ dt dk r ddt      fdd}|S )z
        Index with grid to get a launcher function.
        Returns a launcher that will handle caching based on the key function.
        r   z/Grid must be a tuple with at most 3 dimensions.)r/   c                    s^    | |}j|}|d u r!j  | i |}|j|< |S | |}|  |  |S r3   )rE  rD  r   r  _build_args)r  r  	cache_keycached_kernelall_argsgridr   r6   r7   launcherM  s   
z*CachedKernel.__getitem__.<locals>.launcher)r   rj   r  )r   rM  rN  r6   rL  r7   __getitem__@  s   zCachedKernel.__getitem__c                 C  s`   t |}tt|| jD ]!}| j| }||tjj}|tjjur'|	| qt
d| |S )zI
        Build the complete argument list for kernel invocation.
        zMissing argument: )r	  rY  r  rB  rA  r   r=  rV  r  r  rt   )r   r  r  complete_argsr}  r   r   r6   r6   r7   rH  _  s   
zCachedKernel._build_argsc                 C  s   | j   dS )z>
        Clear the kernel cache for testing purposes.
        N)rD  r  r   r6   r6   r7   _clear_cacheo  s   zCachedKernel._clear_cacher3   )r   r   r   r   r   rO  rH  rQ  r6   r6   r6   r7   r:  #  s    
r:  c                   r:  )a  
    Decorator that enables key-based caching for Triton kernels using a key function.

    It essentially bypasses Triton's built-in caching mechanism, allowing users to
    define their own caching strategy based on kernel parameters. This helps reduce
    the heavy overheads of Triton kernel launch when the kernel specialization dispatch
    is simple.

    Usage:
        @cached_triton_kernel(key_fn=lambda args, kwargs: kwargs.get('BLOCK_SIZE', 1024))
        @triton.jit
        def my_kernel(x_ptr, y_ptr, BLOCK_SIZE: tl.constexpr):
            ...

        # Invoke normally
        my_kernel[grid](x, y, BLOCK_SIZE=1024)

    Args:
        key_fn: A function that takes (args, kwargs) and returns the cache key(s).
                The key can be a single value or a tuple of values.

    Returns:
        A decorator that wraps the kernel with caching functionality.

    Note: Kernels with default parameter values are not supported and will raise an assertion error.
    c                   sd   t  rtj std t|  S tj r&ttjj d t|  S ttjj d | S )Nz9Detected platform CUDA, using custom triton kernel cache.z* = True. Using custom triton kernel cache.z+ = False. Using native triton kernel cache.)	r=   r+   %SGLANG_USE_CUSTOM_TRITON_KERNEL_CACHEis_setr   r  r:  r   r   )r  rE  r6   r7   r    s   



z'cached_triton_kernel.<locals>.decoratorr6   )rE  r  r6   rT  r7   cached_triton_kernelv  s   rU  c                   s   ddl m} |j }|j }|j }t|ddp/t|ddp/t|ddp/t|jddp/d}tt|d	dp8d}	tt|d
dpBd}
||	|
 |  |   | d | |   fdd|  dS )zBPre-expand RoPE cache for long sequences and speculative decoding.r   r*   context_lengthNcontext_lenmax_model_lenmax_position_embeddingsi   speculative_num_stepsspeculative_num_draft_tokensr/   c                   s>   |   D ]}t|drt|dr| d  q| qd S )N_ensure_cos_sin_cache_lengthcos_sin_cacher/   )rS  rA   r\  )ru   r\  reservereserve_rope_cache_recursiver6   r7   r`    s   
zKreserve_rope_cache_for_long_sequences.<locals>.reserve_rope_cache_recursive)	sglang.srt.environr+   #SGLANG_SPEC_EXPANSION_SAFETY_FACTORr   SGLANG_ROPE_CACHE_SAFETY_MARGINSGLANG_ROPE_CACHE_ALIGNrd   hf_text_configrl   )r  r  model_configr   r+   SAFETY_FACTORMARGINALIGNbase_ctxstepsdraftr6   r^  r7   %reserve_rope_cache_for_long_sequences  s&   




		rm  c                 C  sB   |   |  } }| |  ||   }d| |   | }d| S )Nr{   r/   )doubler2  )r  r  denominatorsimr6   r6   r7   	calc_diff  s   rq  c              
   c  sR    | d u r
d V  d S t | |}t| || zd V  W t| || d S t| || w r3   )rd   ri  )r  attrr   original_valuer6   r6   r7   temp_attr_context  s   
rt  c                   C  s"   t dkr
t  a t t S )Nr  )cached_device_indexr4   rr   r  current_streamr6   r6   r6   r7   get_current_device_stream_fast  s   rw  r   c                 C  sB   |rt |t| |d}|| dkrt| t| ||d  d S r  )rt   rd   r   r   ri  )r  strictcounter_namer;  log_intervalr  r6   r6   r7   raise_error_or_warn  s   
r{  c                  C  s4   zt  W S  ty   t  } t |  |  Y S w )zEGets the running event loop or creates a new one if it doesn't exist.)r  get_running_looprG   new_event_loopset_event_loop)r  r6   r6   r7   get_or_create_event_loop  s   

r  c                  C  s   t  } |  d S )z
    Get the number of NUMA nodes available on the system.
    Must be called after is_numa_available() is True.
    Returns:
        int: The number of NUMA nodes.
    r/   )r  numa_max_noder  r6   r6   r7   get_numa_node_count  s   r  c                  C  s*   z
t  } |  dkW S  ty   Y dS w )Nr   F)r  r   r  r  r6   r6   r7   is_numa_available  s   r  c                  C  s8   t jddgdddd} dd | j dD }t|S )z
    Get the total number of GPUs in the system (not affected by CUDA_VISIBLE_DEVICES).

    Returns:
        int: The total number of physical GPUs.
    rj  z--list-gpusTr  c                 S  s   g | ]}|  d r|qS )GPU)r   r  )rJ  r~  r6   r6   r7   rL  $  s    z*get_system_nvgpu_count.<locals>.<listcomp>r\  )r_  r`  rV  r   rm   r  )r  	gpu_linesr6   r6   r7   get_system_nvgpu_count  s   r  c                  C  s   ddl } | j }t|}tjddddt|gdddd}|j }d	}|	|r;|t
|d  }| r;t|S t }t }||krO|| }	||	 }
|
S td
| d| d d}
|
S )a9  
    Retrieve the NUMA node ID of the CPU socket closest to the currently active CUDA device.

    First tries to query nvidia-smi topology. If it returns a single NUMA ID, uses that directly.
    If it returns multiple NUMA IDs (comma/dash separated), falls back to distributing GPUs
    evenly across NUMA nodes based on GPU ID intervals.

    For example, with 8 GPUs and 2 NUMA nodes: GPUs 0-3 -> node 0, GPUs 4-7 -> node 1.

    Returns:
        int: The NUMA node ID (e.g., 0, 1).

    Raises:
        RuntimeError: If device information cannot be retrieved.
    r   Nrj  topoz-Cz-iTr  zNUMA IDs of closest CPU:z
GPU count z is less than NUMA count z. Using first NUMA node.)r4   r;   r  r  r_  r`  r   rV  r   r  r  isdigitrl   r  r  r   r   )r4   logical_device_idphysical_device_idr  output_liner5  numa_id_str
numa_count	gpu_countgpus_per_numa	numa_noder6   r6   r7   !get_current_device_numa_node_cuda,  s4   


r  c                   C  s"   t j sdS t jjd u rdS dS )NFTr:   r6   r6   r6   r7   nvgpu_availableb  s
   
r  c                  C  s&   t  rt rt } t|  dS dS dS )z
    Bind the current process to the NUMA node closest to the active CUDA device.

    Uses `numa` library calls via ctypes to set the CPU affinity of the process.
    N)r  r  r  r$  )node_idr6   r6   r7   bind_to_closest_numa_node_cudaj  s   r  )r1   r2   )ro   rp   )rw   rx   ry   rz   r  )r   )r   r   r   r   r1   r2   r  )r   r   r   rl   r1   rl   )r   )r   r   r   r   r1   r   )r   r   r1   r2   r   )Fr   )FTN)NNr   FN)r:  rl   r;  r3  r<  r=  r>  r=  r5  r   r?  r2   r@  rA  r1   rB  )r   )r:  rl   r;  r3  r5  r   r1   r_  )rb  rl   r1   r   )rh  rl   r1   ri  )rs  T)
rh  rl   rt  r   ru  rl   rv  r2   r1   r2   r   )r  r   r  r=  r  r2   r1   r  )r  r  r1   r  )r  r  r   )r  r  r  r2   )r  r  r  rl   r  rl   r1   r  r3   )r   r   rI  r   r;  r   )r   r   rI  r   r1   r2   )TN)rO  r2   rP  rl   )rm  )r{  r   )r5  r   )r  r  r  r   r  r  r1   r  )r  r  r  rA  )Nr   T)
r  r  r  rl   r  r  r  rl   r  r2   )Nr   r/   F)r  r  r  rl   r  r  r  rl   r  rl   r  r2   )
r  r  r  r  r  r  r  r2   r1   r  )r  r  r  r  r  r  r1   r  )r  r  )r1   rI  )ro   r   )	NNNr  r  NNNN)r{  r   r1   r   )r  rl   r1   r   )r  r=  r1   r   r  )r  rl   r1   rl   )r  rl   r1   rz   r  )
r  r   r  r   r  r  r  r  r  r  )r>  rl   r  rl   r  rl   r&  rl   )r  r  r1   r  )rI  r   )rk  rl  )rz  r   r1   r2   )rz  r   r1   r   )r  r   rh  rl   r1   r   )r  rl   r  rl   r1   rl   )r(  rl   )r  rl   r  rl   r1   rl   )r   r   r5  r   r1   r   )r  r  r1   r2   )r  r   r1   r   )r  rl   r  r   r  r   r  r  )r  r   r1   r2   )r1   r  )r  r   r1   r  )r1   r  )r  r   r1   r   )r  r   )r  r  )r  r.   )r7  r   r8  r  r1   r  )rG  r   r1   r   r   )rd  r   )r  r   )r  )r  rl   r1   rl   )rv  rl   )r&  rl   )r,  rl   r-  rl   r  rl   r1   rl   r   )
r/  r  r  rl   r0  r1  r  r2   r1   r  )r   (Z  r   
__future__r   r%  r  r+  rP  r  r   r=  r  r{  r  r  rC  r  r[   r  rP   rc  r"  ro  r  rX  r  r_  rU  r  r  r   r   r$  r   r<  collectionsr   r   
contextlibr   dataclassesr   r   r   importlib.metadatar	   r
   importlib.utilr   r   r   multiprocessing.reductionr   pathlibr   typingr   r   r   r   r   r   r   r   r   r   r   r   unittestr   urllib.parser   r   r  rd  r  r  r  r  r4   torch.distributedr!  r  r  r  	packagingrK  r  r!   rK  r"   r#   r  r$   torch.profilerr%   r&   r'   torch.utils._contextlibr(   typing_extensionsr)   ra  r+   sglang.srt.metrics.func_timerr,   r   r-   sglang.srt.server_argsr.   rE  r   r   rL  r  releaser  r8   HIP_FP8_E4M3_FNUZ_MAXFP8_E4M3_MAXfinfofloat8_e4m3fnr  FP8_E4M3_MIN
BAR_FORMATr=   r>   rB   rE   rH   rT   rW   r^   rb   rg   rn   rv   r~   is_ampere_with_cuda_12_3is_hopper_with_cuda_12_3is_blackwell_supportedis_blackwellis_sm120_supportedis_sm100_supportedis_sm90_supportedrK  rA   r  r   r   _cpu_is_amx_tile_supportedr   r   r   r   r   r   r   r   r  r   r   r   r   r   r   r   r   r   r   r   r   r   r  r0  r1  r3  r^  ra  rr   rg  rr  r  rx  r  r  r  r  r  r  r  r#  r8  r@  rH  rN  r   r]  rl  rz  r~  r  r  r  r  r  r  r  r  r  r  r
  r  r  r  r  r  r'  r(  rH  r:  rS  rf  ri  ro  rr  rt  r  r  r  r  r  r  r  r  r  r  r  r  r  r|   r  r  r  r  r  r  	Unpicklerr  rH  rJ  rN  rV  FilterrW  rj  ri  ry  r~  r  r  r  r  r  r  r  r  ri  r  r  rP  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r!  r"  r2  r3  r4  r6  rF  rJ  rM  rN  r]  r^  r`  rh  rn  r{  r  r  r  r  r  r  r  r  r  r  r  r  r  SUPPORTED_LORA_TARGET_MODULESLORA_TARGET_ALL_MODULESr  r	  r  r  r  r  r$  r'  r+  r.  r9  r:  rU  rm  rq  rt  ru  rw  r{  r  r  r  r  r  r  r  r6   r6   r6   r7   <module>   s
  8


		


4

h/
	P-%=#

4D+
<	!'		)G.N$*G
9< ;	
!		-
.		#	-LV

	S3)

5