o
    پi                     @   sv  d Z ddlZddlZddlZddlmZmZmZm	Z	m
Z
mZmZmZ ddlZddlZddlmZ ddlmZ ddlmZmZ ddlmZ eeZejdejfd	d
Zdedee fddZ dedeeej! ef fddZ"dedefddZ#dej!defddZ$dd Z%dej&de'fddZ(ddddej)j*de
ej)j+ d e'd!ed"ef d#e	e d$eeeef  ddfd%d&Z,dS )'z+Utilities for selecting and loading models.    N)AnyCallableDictIterableListOptionalTupleType)nn)get_class_from_dynamic_module)ModelConfig	ModelImpl)deep_gemm_wrapperdtypec                 c   s(    t  }t |  dV  t | dS )z0Sets the default torch dtype to the given dtype.N)torchget_default_dtypeset_default_dtype)r   	old_dtype r   Q/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/model_loader/utils.pyset_default_torch_dtype   s
   
r   model_configarchitecturesc                    s   t |D ]y\}}|dkrqt jdd pt } fddt| dd dD }tt|d }|d u rBd|vr>td	| d
|d } jt	j
kr]t|drY| sYtd| dd||<  jt	jkr}t|drs| sst| dtd| d||< q|S )NTransformersForCausalLMauto_mapc                    s$   i | ]\}}|t | j jd qS ))revision)r   
model_pathr   ).0namemoduler   r   r   
<dictcomp>-   s    
z-resolve_transformers_arch.<locals>.<dictcomp>c                 S   s   | d S )Nr   r   )xr   r   r   <lambda>1   s    z+resolve_transformers_arch.<locals>.<lambda>)key	AutoModelzCannot find model module. 'z' is not a registered model in the Transformers library (only relevant if the model is meant to be in Transformers) and 'AutoModel' is not present in the model config's 'auto_map' (relevant if the model is custom).is_backend_compatiblez#The Transformers implementation of z is not compatible with SGLang.z` has no SGlang implementation and the Transformers implementation is not compatible with SGLang.z%s has no SGLang implementation, falling back to Transformers implementation. Some features may not be supported and performance may not be optimal.)	enumerategetattr	hf_configdictsorteditemstransformers
ValueError
model_implr   TRANSFORMERShasattrr&   AUTOloggerwarning)r   r   iarchr   auto_modulesmodel_moduler   r    r   resolve_transformers_arch   sJ   




r9   returnc                    s   ddl m} t| jdg }g d}| jd ur"| j|vr"d|v r"dg}|  t fdd|D }| jtj	kr;d	g}n|rC| jtj
krHt| |}||S )
Nr   )ModelRegistryr   )fp8zcompressed-tensorsgptq_marlin
awq_marlinquark_int4fp8_moeMixtralForCausalLMQuantMixtralForCausalLMc                 3   s    | ]}| v V  qd S )Nr   )r   r6   supported_archsr   r   	<genexpr>o   s    z)get_model_architecture.<locals>.<genexpr>MindSporeForCausalLM)sglang.srt.models.registryr;   r(   r)   quantizationget_supported_archsanyr/   r   	MINDSPOREr0   r9   resolve_model_cls)r   r;   r   mixtral_supportedis_native_supportedr   rB   r   get_model_architectureY   s   
	


rN   c                 C   s   t | d S )N   )rN   r    r   r   r   get_architecture_class_namex   s   rP   modelc                 C   s:   t | dr|jjd dkr| jdd d S |   d S d S )Npost_load_weightsr   DeepseekV3ForCausalLMNextNT)is_nextn)r1   r)   r   rR   )rQ   r   r   r   r   rR   |   s
   
rR   c                 C   s   t jo	t jo	| duS )zFShould we requant fp8 weights into UE8M0 format when loading the modelN)r   ENABLE_JIT_DEEPGEMMDEEPGEMM_SCALE_UE8M0)weight_block_sizer   r   r   $should_deepgemm_weight_requant_ue8m0   s
   rX   weightc                 C   s"   t | dd}|du rdS |jdkS )a  Return True if we should load the given weight asynchronously.

    For host (CPU) tensors, using a threadpool can overlap H2D copies
    and improve throughput. For device tensors, threading often adds overhead
    (e.g., GIL contention) without benefit, so we do it synchronously.
    deviceNFcpu)r(   type)rY   rZ   r   r   r   should_async_load   s   
r]   r   )	func_argsfunc_kwargsexecutorfutures	use_asyncfunc.r^   r_   c                 C   sD   |du ri }|r| | j|g|R i | dS ||i | dS )a  Submit a task to the executor if async loading is enabled.

    Parameters (keyword-only):
    - executor: ThreadPoolExecutor used to submit background tasks
    - futures: a list collecting the submitted Future objects
    - use_async: whether to submit to executor or run inline
    - func: the callable to run
    - func_args: positional args for the callable (defaults to empty tuple)
    - func_kwargs: keyword args for the callable (defaults to empty dict)
    N)appendsubmit)r`   ra   rb   rc   r^   r_   r   r   r   maybe_executor_submit   s
   "rf   )-__doc__concurrent.futures
concurrent
contextlibloggingtypingr   r   r   r   r   r   r   r	   r   r-   r
   !transformers.dynamic_module_utilsr   sglang.srt.configs.model_configr   r   sglang.srt.layersr   	getLogger__name__r3   contextmanagerr   r   liststrr9   ModulerN   rP   rR   rX   Tensorboolr]   ra   ThreadPoolExecutorFuturerf   r   r   r   r   <module>   sJ   (
 ;	

