o
    پi                    @  s  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlmZmZ d dlmZmZ d dlmZmZmZmZmZmZmZmZmZmZ d dl Z d dl!Z"d dl#Z#d dl$m%Z%m&Z&m'Z' d dl(m)Z) zd dl*m+Z+m,Z, d d	l-m.Z. d
Z/W n e0y   dZ/dZ+dZ,dZ.Y nw d dl m1Z1m2Z2 d dl#m3Z3 d dl4m5Z5m6Z6m7Z7 d dl8m9Z9 d dl:m;Z;m<Z< d dl=m>Z>m?Z?m@Z@ d dlAmBZB d dlCmDZDmEZEmFZF d dlGmHZH d dlImJZJ d dl$mKZK d dlLmMZMmNZNmOZO dZPd dlQmRZR d dlSmTZTmUZUmVZVmWZWmXZXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z`maZambZbmcZc d dldmeZemfZfmgZgmhZhmiZimjZj erOd dlkmlZl d dlmmnZn d dlImJZJ eg ZoepeqZredMd"d#ZsepeqZrdNd*d+Zt	dOdPd.d/ZuG d0d1 d1eZvG d2d3 d3evZwG d4d5 d5ewZxG d6d7 d7ewZyG d8d9 d9evZzG d:d; d;evZ{G d<d= d=evZ|G d>d? d?evZ}G d@dA dAevZ~G dBdC dCevZdQdFdGZG dHdI dIewZ	dOdRdKdLZdS )S    )annotationsN)ABCabstractmethod)contextmanagersuppress)
TYPE_CHECKINGAnyDict	GeneratorIterableListOptionalTupleUnioncast)!RemoteInstanceWeightLoaderBackend1get_remote_instance_transfer_engine_info_per_rankregister_memory_regionget_global_server_args)infer_auto_device_mapinit_empty_weights)get_max_memoryTF)HfApihf_hub_download)nn)
AutoConfigAutoModelForCausalLMAutoTokenizer)SAFE_WEIGHTS_INDEX_NAME)
LoadConfig
LoadFormat)ConnectorTypecreate_remote_connectorget_connector_type)parse_model_name)get_tensor_model_parallel_rank$get_tensor_model_parallel_world_sizemodel_parallel_is_initialized)QUANT_CFG_CHOICES)QuantizationConfig)$trigger_transferring_weights_request)get_model_architecturepost_load_weightsset_default_torch_dtypeg?)envs)2buffered_multi_thread_safetensors_weights_iterator'download_safetensors_index_file_from_hfdownload_weights_from_hf fastsafetensors_weights_iterator"filter_duplicate_safetensors_files%filter_files_not_needed_for_inferenceget_gguf_extra_tensor_namesget_quant_configgguf_quant_weights_iteratorinitialize_dummy_weightsmaybe_add_mtp_safetensors multi_thread_pt_weights_iteratornp_cache_weights_iteratorpt_weights_iteratorsafetensors_weights_iteratorset_runai_streamer_env)get_bool_env_varget_device_capabilityis_npuis_pin_memory_available	rank0_logset_weight_attrs)DeviceConfig)ModelConfigmoduletorch.nn.Moduletarget_devicetorch.devicec                 c  sT   |j dkr| V  d S i }|  D ] \}}|jj dkr1|j}|j|}t|j||d||< ||_qz}| V  W t }|  D ]o\}}||v r|| }|d }|d }|d }	|j|jjkr| |j kr|j|jjkr|j	|jj	kr|
|j|j ||_q>|	j dkrtj|j |j |jj	|jjd|d}
|

|j |
|_q>|j|	|_q>d S t }|  D ]q\}}||v r(|| }|d }|d }|d }	|j|jjkr| |j kr|j|jjkr|j	|jj	kr|
|j|j ||_q|	j dkr!tj|j |j |jj	|jjd|d}
|

|j |
|_q|j|	|_qw )Ncpu)deviceoriginal_datadevice_datarO   rN   rM   )sizestridedtypelayoutrM   
pin_memory)typenamed_parametersrM   datatodictrC   data_ptrshaperR   copy_torchempty_stridedrP   rQ   rS   )rH   rJ   original_infosnameprN   rO   rT   original_infooriginal_devicecpu_data re   R/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/model_loader/loader.pydevice_loading_context   s   



rg   model_configrG   load_configr    returnOptional[QuantizationConfig]c                 C  sZ  t | \}}t|di }t|dd}tr3|dgddgdg dd	gd
g dddgddgdd | jdurt| |||}|du rEdS tst \}}|dur|durd|  kr_dk sbJ  J |d | }	|	| k rtd| j d|  d|	 d|	 }
| j
|
vrt| j
 d| j d|
 t|dd}|dur|dur|| |S dS )zGet the quantization config.packed_modules_mappingremap_prefixNqkv	gate_projup_proj)qkv_projgate_up_projq_projk_projv_projout_proj)rq   projq_a_projkv_a_proj_with_mqa)rq   rr   fused_qkv_a_proj_with_mqa)visualvision_modelmodelr   
   zThe quantization method z; is not supported for the current GPU. Minimum capability: z. Current capability: .z* is not supported for quantization method z. Supported dtypes: hf_to_sglang_mapper)r,   getattr_is_npuupdatequantizationr7   rA   get_min_capability
ValueErrorget_supported_act_dtypesrR   apply_weight_name_mapper)rh   ri   model_class_rl   rm   quant_configmajorminor
capabilitysupported_dtypesr   re   re   rf   _get_quantization_config   sh   




r   r   	nn.Modulec                 C  s\   t | \}}| j|d}tj rtj |d< | j|d< |jdur'|j|d< |di |S )z1Initialize a model with the given configurations.)configr   sparse_head
model_pathNdraft_model_idxre   )r,   	hf_configr/   SGLANG_EMBEDDINGS_SPARSE_HEADis_setgetr   r   )rh   ri   r   r   r   kwargsre   re   rf   _initialize_model  s   



r   c                   @  s6   e Zd ZdZdddZedd
dZedddZdS )BaseModelLoaderzBase class for model loaders.ri   r    c                 C  s
   || _ d S N)ri   selfri   re   re   rf   __init__  s   
zBaseModelLoader.__init__rh   rG   rj   Nonec                 C     t )z6Download a model so that it can be immediately loaded.NotImplementedErrorr   rh   re   re   rf   download_model  s   zBaseModelLoader.download_modeldevice_configrF   r   c                C  r   )z+Load a model with the given configurations.r   r   rh   r   re   re   rf   
load_model#  s   zBaseModelLoader.load_modelNri   r    rh   rG   rj   r   rh   rG   r   rF   rj   r   )__name__
__module____qualname____doc__r   r   r   r   re   re   re   rf   r     s    
r   c                      s   e Zd ZU dZdZedZej	G dd dZ
dZded< dZded	< d6 fddZd7ddZd8ddZd9ddZed:d$d%Zd;d)d*Zd<d,d-Zd=d.d/Zd>d2d3Zed4d5 Z  ZS )?DefaultModelLoaderz:Model loader that can load different file types from disk.   zmodel\.mtp\.layers\.(\d+)\.c                   @  s^   e Zd ZU dZded< 	 ded< 	 dZded< 	 dZd	ed
< 	 dZded< 	 edddZ	dS )zDefaultModelLoader.SourcezA source for weights.strmodel_or_pathOptional[str]revision prefixTboolfall_back_to_ptNzOptional['ModelConfig']rh   rG   c                 C  s   | |j |jdt|dd|dS )Nr   fall_back_to_pt_during_loadT)r   r   rh   )r   r   r   )clsrh   r~   re   re   rf   init_newI  s   
z"DefaultModelLoader.Source.init_newrh   rG   )
r   r   r   r   __annotations__r   r   rh   classmethodr   re   re   re   rf   Source6  s   
 r           floatcounter_before_loading_weightscounter_after_loading_weightsri   r    c                   sH   t  | |j}ddh}t| | }|r"td|j d| d S )Nenable_multithread_loadnum_threads-Unexpected extra config keys for load format : )superr   model_loader_extra_configsetkeysr   load_format)r   ri   extra_configallowed_keysunexpected_keys	__class__re   rf   r   V  s   zDefaultModelLoader.__init__r~   r   r   r   rj   c                 C  sN   t dr%ddlm} tj|s!||| jjtj	j
|| jjd}|S |}|S |S )zDownload model from ModelScope hub if SGLANG_USE_MODELSCOPE is True.

        Returns the path to the downloaded model, or the original model path if
        not downloaded from ModelScope.SGLANG_USE_MODELSCOPEr   )snapshot_download)model_id	cache_dirlocal_files_onlyr   ignore_file_pattern)r@    modelscope.hub.snapshot_downloadr   ospathexistsri   download_dirhuggingface_hub	constantsHF_HUB_OFFLINEignore_patterns)r   r~   r   r   r   re   re   rf   _maybe_download_from_modelscopec  s   	z2DefaultModelLoader._maybe_download_from_modelscopemodel_name_or_pathr   r   Tuple[str, List[str], bool]c                 C  s  |  ||}tj|}| jj}d}t}|tjkrddg}n?|tj	ks(|tj
kr.d}dg}n/|tjkr;d}dg}d}n"|tjkrDdg}n|tjkrMdg}n|tjkrVtdtd	| |rd|dg7 }|stt|| jj||| jjd
}	n|}	t }
|
r|
jdurddlm} |
jp|}||	|d g }|D ]}|ttj|	|7 }t|dkr|dkrd} nq|r|st||| jj| t||	|}nt|}t|dkrtd| d|	||fS )zYPrepare weights for the model.

        If the model is not local, it will be downloaded.F*.safetensors*.binTzconsolidated*.safetensorsz#consolidated.safetensors.index.json*.ptzKDUMMY load_format should use DummyModelLoader and not call _prepare_weightszUnknown load_format: r   Nr   )verify)r   checksums_source$Cannot find any model weights with ``)r   r   r   isdirri   r   r   r!   AUTOSAFETENSORSFASTSAFETENSORSMISTRALPTNPCACHEDUMMYr   r2   r   r   r   model_checksum$sglang.srt.utils.model_file_verifierr   globjoinlenr1   r4   r5   RuntimeError)r   r   r   r   is_localr   use_safetensors
index_fileallow_patterns	hf_folderserver_argsr   r   hf_weights_filespatternre   re   rf   _prepare_weights}  s   











z#DefaultModelLoader._prepare_weightssource'Source'/Generator[Tuple[str, torch.Tensor], None, None]c           	        s8  | j j}|dd}|  j j j\}}}|r' jdur't||d jj	}| j j
tjkr?|du s4J t j| j j||}n:|rgt j}| j j
tjkrQt|}n(|r`t||d| j|d}nt||d}n|rut||d| jd}nt|}| j jdur| | j| j jS | jd	krt | _ fd
d|D S )z?Get an iterator for the model weights based on the load format.r   FNzmodel.safetensors.index.jsonr   )max_workersdisable_mmap)r
  )r	  r   c                 3  s"    | ]\}} j | |fV  qd S r   )r   ).0r`   tensorr  re   rf   	<genexpr>  s     z;DefaultModelLoader._get_weights_iterator.<locals>.<genexpr>)ri   r   r   r  r   r   r   rh   r:   r   r   r!   r   r<   r   r   weight_loader_disable_mmapr   r3   r0   DEFAULT_NUM_THREADSr>   r;   r=   r   _filter_mtp_weightsr   r   timeperf_counter)	r   r  r   use_multithreadr  r  r   weights_iteratorr  re   r  rf   _get_weights_iterator  sj   


z(DefaultModelLoader._get_weights_iteratorr   r   int$Tuple[Tuple[str, torch.Tensor], ...]c           
      C  sn   g }|D ].\}}| j |}|dur't|d}||krq|| d}	n|}	|||	 |f qt|S )zyFilter MTP (Multi-Token Prediction) weights to keep only the
        specified draft model layer and remap it to layer 0.N   zmodel.mtp.layers.0.)_MTP_PATTERNmatchr  groupreplaceappendtuple)
r   r  r   r   filtered_weightsr`   r  r  idxnew_namere   re   rf   r     s   z&DefaultModelLoader._filter_mtp_weightsrh   rG   r   c                 c  sV    t j||}| |E d H  ttt j t|dd}|D ]
}| |E d H  qd S )Nsecondary_weightsre   )r   r   r   r  r   r   r   )r   rh   r~   primary_weightsr#  r  re   re   rf   _get_all_weights3  s   z#DefaultModelLoader._get_all_weightsr   c                 C  s   | j |j|jdd d S )NT)r   r  r   r   r   re   re   rf   r   B  s   

z!DefaultModelLoader.download_modelc                 C  sf  t stdtj|jdtjjd}t  t	|dt
j}tj||dd}W d   n1 s.w   Y  t }t||d}d| v }dd	i}d	}	|rn| D ]}
t|
tr]||
  t9  < qNtd
| dt d ||d< tj|jfd|	i|dtjjd}t|dr|jr|j}td|  n| }td|j d|  t|tstdt| |S )zLoad and prepare the base model for ModelOpt quantization.

        This method handles the common model loading logic shared between
        DefaultModelLoader (conditional) and ModelOptModelLoader (dedicated).
        z`accelerate is required for ModelOpt quantization. Please install it with: pip install accelerateT)trust_remote_coder   torch_dtype)r(  r'  N)
max_memoryrL   autozYModel does not fit to the GPU mem. We apply the following memory limit for calibration: 
zH
If you hit GPU OOM issue, please adjust the memory fraction (currently z2) or reduce the calibration `batch_size` manually.r)  
device_mapmodelopt_quantz*ModelOpt quantization requested (legacy): z+ModelOpt quantization requested (unified): z -> z6Quantization type must be a string (e.g., 'fp8'), got )HAS_ACCELERATEImportErrorr   from_pretrainedr   r   r   r   r   r   r]   float16r   from_configr   r   valuesr   
isinstancer  +DEFAULT_GPU_MEMORY_FRACTION_FOR_CALIBRATIONloggerwarninghasattrr,  rD   _get_modelopt_quant_typer   r   	TypeErrorrU   )r   rh   r   r(  r~   r)  inferred_device_mapon_cpumodel_kwargsr+  rM   quant_choice_strre   re   rf   _load_modelopt_base_modelG  sr   


z,DefaultModelLoader._load_modelopt_base_modelr   rF   c             	   C  s   t |dr|jr| |}| S t|j}t|| j}t|j	, | t
|| j|}W d    n1 s7w   Y  | || ||| W d    n1 sQw   Y  t | _| S )Nr,  )r7  r,  r>  evalr]   rM   r   ri   r.   rR   r   load_weights_and_postprocessr%  r  r  r   )r   rh   r   r~   rJ   r   re   re   rf   r     s&   

zDefaultModelLoader.load_modelc              	   C  sv   |  | |  D ]/\}}t|dd }|d ur8t|| || W d    n1 s,w   Y  tr8tj  q	d S Nquant_method)	load_weightsnamed_modulesr   rg   process_weights_after_loadingr   r]   npuempty_cache)r~   weightsrJ   r   rH   rB  re   re   rf   r@    s   

z/DefaultModelLoader.load_weights_and_postprocessr   )r~   r   r   r   rj   r   )r   r   r   r   r   r   rj   r   )r  r  rj   r  )r   r   r   r  rj   r  )rh   rG   r~   r   rj   r  r   )rh   rG   rj   r   r   )r   r   r   r   r  recompiler  dataclasses	dataclassr   r   r   r   r   r   r  r  r   r  r%  r   r>  r   staticmethodr@  __classcell__re   re   r   rf   r   .  s(   
 



]F



Fr   c                      s,   e Zd ZdZd fddZdddZ  ZS )LayeredModelLoaderzModel loader that loads weights layer by layer so that one can quantize a
    layer before loading another to make the peak memory envelope smaller.ri   r    c                   s   t j|_t | d S r   )r!   r   r   r   r   r   r   re   rf   r     s   zLayeredModelLoader.__init__rh   rG   r   rF   rj   r   c             	     s   ddl m  ddlm} | jt|jt|| j}t	|j
I td t|| j|W d    n1 s9w   Y  tdsLtd|j d| |}d fd
dg | W d    n1 smw   Y  rwd_ S )Nr   )apply_torchao_config_to_modelr   metaload_weights_to_modulezQLayeredModelLoader requires the model to have a `load_weights_to_module` method. z does not support it.fqn	List[str]c                   sl   |   D ]\}}|||g | q| jdd d|}|| r2d|v r4 | d dS dS dS )zi
                fqn: list of strings representing the fully qualified name of `module`.
                F)rM   recurser   rx   N)named_childrento_emptyr   rR  )rH   rS  rH  r`   submodfqn_pathrP  fill_moduler~   rJ   torchao_configre   rf   r[    s   
z2LayeredModelLoader.load_model.<locals>.fill_moduleT)rS  rT  )sglang.srt.layers.torchao_utilsrP  sglang.srt.server_argsr   r\  r]   rM   r   ri   r.   rR   r   r7  r   r   r%  torchao_appliedr?  )r   rh   r   r   r   rH  re   rZ  rf   r     s4   
.zLayeredModelLoader.load_modelr   r   )r   r   r   r   r   r   rN  re   re   r   rf   rO    s    rO  c                      s   e Zd ZdZg dZg dZdg dfdddgfgZd	d
ddZd7 fddZd8ddZ	e
dd Zdd Ze
dd Ze
dd Ze
d9d#d$Ze
d:d*d+Ze
d;d1d2Ze
d3d4 Ze
d5d6 Z  ZS )<QuantizedRLModelLoaderu  
    Model loader for RL training with FP8 quantization (profile-free, native SGLang).

    Workflow:
      1. Initial load: Load base model → Record state → Apply FP8 quantization
      2. Training Actor in full precision
      3. Reload: Trainer sends full precision weights → Quantize to FP8 → Copy to original memory
      4. Use torch.as_strided to preserve memory locations across reloads

    Usage:
      --model-path Qwen/Qwen2.5-7B --quantization fp8 --load-format flash_rl
    )weight_loaderload_qkv_weightload_column_parallel_weightload_row_parallel_weightload_merged_column_weight
output_dim	input_dim_assert_and_load)weight_scaleinput_scaleoutput_scale.biaslm_head.weightzmodel.norm.weightembed_tokenszrotary_emb.inv_freqzrotary_emb.cos_cachedzrotary_emb.sin_cached	projectorzinput_layernorm.weightzpost_attention_layernorm.weightrq   rs   rr   ro   rp   qkvri   r    c                   s    t  | td d| _d S )Nz3[QuantizedRL] Profile-free FP8 quantization enabledF)r   r   r5  info_initial_load_completer   r   re   rf   r   >  s   

zQuantizedRLModelLoader.__init__r   r   r   r   r   r   c                 C  s2   t d|  ttjd}t|}||||S )z2Standard weight preparation using base model path.z'[QuantizedRL] Loading from base model: )r   )r5  rs  r    r!   r   r   r  )r   r   r   r   temp_configtemp_loaderre   re   rf   r  C  s   z'QuantizedRLModelLoader._prepare_weightsc                 C  s*   ddl }t| dst| s| S || |S )z=Bind function to object instance (for weight_loader methods).r   N__self__)typesr7  callable
MethodType)funcobjrx  re   re   rf   _bind_method_to_clsN  s   z*QuantizedRLModelLoader._bind_method_to_clsc              	     sx  t d  j fdd}| _ | t  }i  _| D ]\}}|j| |j	|
  d j|< q$dd tjD }| D ]7\}}tjD ]/}	t||	r}t||	}
t|
se|
||	 |< qNt|
drw||
ju rw|
j||	 |< qN|
||	 |< qNqG| _  D ](\}}t|dd	}|d	urt|| || W d	   n1 sw   Y  qd
 _d
| _t d d	S )u   
        Initial load: Load BF16 → Record state → Apply FP8 quantization.
        Called ONCE during model initialization.
        z0[QuantizedRL] Initial load with FP8 quantizationc                   s2   t  rtd t  |  d S |  d S )Nz4[QuantizedRL] Using fast path reload in load_weights)r`  is_reload_scenarior5  rs  rebinding_and_load_weights)rH  r~   original_load_weightsre   rf   load_weights_proxy`  s   

zOQuantizedRLModelLoader.load_weights_and_postprocess.<locals>.load_weights_proxy)r[   rQ   rR   nbytesc                 S  s   i | ]}|t  qS re   )rY   )r  rq  re   re   rf   
<dictcomp>z  s    zGQuantizedRLModelLoader.load_weights_and_postprocess.<locals>.<dictcomp>rw  rB  NTz#[QuantizedRL] Initial load complete)r5  rs  rC  rY   rV   original_weights_rebuild_keysitemsr[   rQ   rR   untyped_storager  r`  RECORDED_LOADER_KEYSr7  r   ry  rw  __func__recorded_loaderrD  rg   rE  flash_rl_initial_load_completert  )r   r~   rH  rJ   r  original_weightsr`   ra   r  keyattrr   rH   rB  re   r  rf   r@  W  sL   
	




	z3QuantizedRLModelLoader.load_weights_and_postprocessc                 C  s    t | dot | dot| ddS )z?Check if model is ready for reloading (initial load completed).r  r  r  F)r7  r   )r~   re   re   rf   r~    s
   

z)QuantizedRLModelLoader.is_reload_scenarioc                 C  s"   t jD ]\}}|| v r dS qdS )z7Check if parameter is stacked (qkv_proj, gate_up_proj).TF)r`  STACKED_PARAMS_MAPPING)r`   stacked_namer   re   re   rf   _is_stacked_param  s
   z(QuantizedRLModelLoader._is_stacked_paramr`   rj   (Tuple[str, Optional[str], Optional[Any]]c                 C  sh   t jD ]+\}}t|D ]"\}}|| v r-|dkrt j||n|}| ||||f    S qq| d d fS )Nrq   )r`  r  	enumerate_QKV_SHARD_ALIASESr   r  )r`   targetshard_namesr!  shardshard_idre   re   rf   _resolve_stacked_info  s   
z,QuantizedRLModelLoader._resolve_stacked_infoscale_store7Dict[str, Union[torch.Tensor, Dict[Any, torch.Tensor]]]scaletorch.Tensorr   c                 C  sJ   t |\}}}|d u r|| |< d S | |i }t|tsJ |||< d S r   )r`  r  
setdefaultr3  rY   )r  r`   r  
param_namestacked_keyr  
shard_dictre   re   rf   _store_quantized_scale  s   z-QuantizedRLModelLoader._store_quantized_scale
all_paramsDict[str, torch.nn.Parameter]r  
scale_info2Union[torch.Tensor, Dict[Any, torch.Tensor], None]c                   s  |d u rd S t  t fdd} dr" d d  d}n  d}| |}|d u r8td| d S t|tjr`|	 
 }|jj|jkrS|j| d S td||jj|j d S t fdd	tjD d tfd
d	tjD g }|jjd tt|d }|t| |jjd krtd|jjd  dt|  d}	t|D ]=\}
}dkrtj||n|
}||}||}|d u r|	|7 }	q|jd }|	}|| }|	 
 |jd||f< |}	qd S )Nc                   s:   dkr| S | j d }| } | }|| }| || S )z+Get tp sharded scale from full scale tensorr  r   )r[   )full_scale_tensorfull_dim	shard_dim	start_idxend_idx)tp_ranktp_sizere   rf   _get_tp_sharded_scale  s   
zIQuantizedRLModelLoader._apply_scale_update.<locals>._get_tp_sharded_scale.weightiz.weight_scalez+[QuantizedRL] Scale parameter not found: %sz>[QuantizedRL] Scale shape mismatch for %s: expected %s, got %sc                 3  s     | ]\}}| v r|V  qd S r   re   )r  r  r   )r  re   rf   r        z=QuantizedRLModelLoader._apply_scale_update.<locals>.<genexpr>c                 3  s     | ]\}}| kr|V  qd S r   re   )r  r  names)r  re   rf   r    r  r  zScale param shape z not divisible by r   rq   .)r&   r'   endswithr   r5  r6  r3  r]   Tensort
contiguousrW   r[   r\   nextr`  r  maxr   r  r  )r  r  r  r  scale_param_namescale_param	new_scaler  rows_per_shardoffsetr!  r  r  shard_scale
shard_rowsstartendre   )r  r  r  r  rf   _apply_scale_update  st   






z*QuantizedRLModelLoader._apply_scale_updatec              	     s  t d t|}t|| \}}t|  }i }|D ]  |v r(|  j| < q| j	 D ]\ } |v rM |v rMt
|  j |d |d |  _q.| j	 D ]?\}	}
|
	 D ]6\}}||v r||v r|| }t||	st|rt|drt||	| q[t||	t|| q[t||	| q[qS~i fdd}||t| t|  }|D ]}  |vs |vrqt fddtjD rq|  }|  }d	 v sd
 v r||j ||_q|jt
jkr
|jt
jkr
t
|j|j| }|| ||_t|   q|j|jkr||j ||_qtd  d|j d|j ~|r7t  t
j   t d ||fS )u   
        Reload: VERL sends BF16 → Quantize to FP8 → Copy to original memory.

        Flow: Reset params → Restore attributes → Quantize in iterator → Load → Copy back
        z<[QuantizedRL] Reload: Updating weights with FP8 quantizationr[   rQ   rw  c                 3  s    ddl m} | D ]d\ }t fddtjD r,td  d|j d  |fV  q	|jtj	tj
tjfv r[|||jd \}}td	  d
|j d t |  |fV  q	td  d|j d  |fV  q	dS )z<Quantize individual shards before weight_loader stacks them.r   )per_token_group_quant_fp8c                 3      | ]}| v V  qd S r   re   r  skipr`   re   rf   r  X  
    
zgQuantizedRLModelLoader.rebinding_and_load_weights.<locals>.quantize_weights_iterator.<locals>.<genexpr>z[QuantizedRL] Skip: z ()r  z[QuantizedRL] Quantize:  u   →FP8z[QuantizedRL] Keep: N))sglang.srt.layers.quantization.fp8_kernelr  anyr`  SKIP_QUANTIZATION_PARAMSr5  rs  rR   r]   bfloat16float32r0  r[   r  )weights_iterr  weightqweightr  )quantized_scalesr  rf   quantize_weights_iteratorQ  s$   zTQuantizedRLModelLoader.rebinding_and_load_weights.<locals>.quantize_weights_iteratorc                 3  r  r   re   r  r  re   rf   r  r      
zDQuantizedRLModelLoader.rebinding_and_load_weights.<locals>.<genexpr>rn  lm_headzUnexpected dtype mismatch for z: new=z, old=z[QuantizedRL] Reload complete)!r5  rs  listr`  _get_updated_paramsrY   rV   rW   r  r  r]   
as_stridedcloner  r7  ry  setattrr}  iterr  r  r\   rR   float8_e4m3fnr[   rQ   r  r   r   gccollectcudarG  )r~   first_time_load_weightsrH  weights_listupdated_param_namesis_last_updateexisting_paramscurrent_param_datarebuild_inforq  loader_dictr  loaderparamr  r  	new_paramold_fp8_datastrided_datare   )r`   r  rf   r    s   







z1QuantizedRLModelLoader.rebinding_and_load_weightsc                   sB  g d}t | }t }d}| D ]\ } dkrd}t fddtjD r(qddlm} | }|d	urFt|d
rF||j	k sE||j
krFqt|drT|jjrTd v rTq dr^ |vr^qd}	|D ]"\}
}}| v r ||
  dr{ |vr{qb|  d}	 nqb|	s dr |vrq |v r|  qt||fS )z>Identify which parameters need updating from incoming weights.))rq   rt   rp  )rq   ru   rq  )rq   rv   rr  )rr   ro   r   )rr   rp   r  Frm  Tc                 3  r  r   re   r  r  re   rf   r    r  z=QuantizedRLModelLoader._get_updated_params.<locals>.<genexpr>r   )get_layer_idNstart_layerr   zmodel.vision_towerrl  )rY   rV   r   r  r`  r  sglang.srt.layers.utilsr  r7  r  	end_layerr   tie_word_embeddings
startswithr  r  addr  )r  r~   stacked_params_mappingparams_dictupdated_paramsr  r   r  layer_idmappedr  weight_namer  re   r  rf   r    sV   

z*QuantizedRLModelLoader._get_updated_paramsr   )r   r   r   r   r   r   )r`   r   rj   r  )r  r  r`   r   r  r  rj   r   )r  r  r  r   r  r  rj   r   )r   r   r   r   r  r  r  r  r   r  rM  r}  r@  r~  r  r  r  r  r  r  rN  re   re   r   rf   r`    s>    



=

T
 	r`  c                      s6   e Zd ZdZd fddZdd
dZdddZ  ZS )DummyModelLoaderz:Model loader that will set model weights to random values.ri   r    c                   &   t  | |jrtd|j d S Nz;Model loader extra config is not supported for load format r   r   r   r   r   r   r   re   rf   r        zDummyModelLoader.__init__rh   rG   rj   r   c                 C     d S r   re   r   re   re   rf   r        zDummyModelLoader.download_modelr   rF   r   c             	   C  s   t drt| ||dS t|| j}t|jS t|j t|| j|}W d    n1 s/w   Y  |	 D ]\}}t
|dd }|d urUt|drP| rPq8|| q8t| t|| W d    | S 1 slw   Y  | S )NSGL_CPU_QUANTIZATIONrh   r   rB  is_weights_quantized)r@    load_model_with_cpu_quantizationr   ri   r.   rR   r]   rM   r   rD  r   r7  r  rE  r9   r-   r?  )r   rh   r   r   r~   r   rH   rB  re   re   rf   r     s:   

zDummyModelLoader.load_modelr   r   r   )r   r   r   r   r   r   r   rN  re   re   r   rf   r    s
    
r  c                      sf   e Zd ZdZdZd% fddZed&d
dZd'ddZd(ddZ	d)ddZ
e		d*d+d#d$Z  ZS ),ShardedStateLoaderaC  
    Model loader that directly loads each worker's model state dict, which
    enables a fast load path for large tensor-parallel models where each worker
    only needs to read its own shard rather than the entire checkpoint. See
    `examples/runtime/engine/save_sharded_state.py` for creating a sharded checkpoint.
    z)model-rank-{rank}-part-{part}.safetensorsri   r    c                   sX   t  | |jd u ri n|j }|d| j| _|r*td|j d|j	  d S )Nr  r   r   )
r   r   r   copypopDEFAULT_PATTERNr  r   r   r   )r   ri   r   r   re   rf   r   #  s   
zShardedStateLoader.__init__tensorsDict[str, torch.Tensor]rj   c                 C  s   t t}|  D ]\}}| r#|  }||j|f ||f q	ddd}i }|	 D ]H}|D ]C\}}	|	 ||	}
}|D ]/\}}|
 sKqB| ||}}|
|k s\||k r]qB||
k si||k si|	
 sk n||k rq nqB|	||< q3q/|S )	zx
        Filter out all tensors that share the same memory or a subset of the
        memory of another tensor.
        r  r  rj   r  c                 S  s   |  dd  |   S )Nr  )viewrZ   element_size)r  re   re   rf   get_end_ptr@  s   z:ShardedStateLoader._filter_subtensors.<locals>.get_end_ptrN)r  r  rj   r  )collectionsdefaultdictr  r  numelr  rZ   rM   r  r2  is_contiguous)r
  same_storage_groupsr  r  ptrr  resultr  rq  r  abk2t2a2b2re   re   rf   _filter_subtensors2  s6   
z%ShardedStateLoader._filter_subtensorsr   r   r   r   c                 C  s0   t j|r|S dg}t|| jj||| jjdS )Nr   r   )r   r   r   r2   ri   r   r   )r   r   r   r   re   re   rf   r  V  s   z#ShardedStateLoader._prepare_weightsrh   rG   r   c                 C     |  |j|j d S r   r&  r   re   re   rf   r   c     z!ShardedStateLoader.download_modelr   rF   r   c             
   C  s  ddl m} ddlm} | |j|j}t|| j}t	|j
 t|j' t|| j|}| D ]\}}	t|	dd }
|
d urE|
|	 q2W d    n1 sPw   Y  | }tj|| jj|dd}t|}|sttd| d| | }|D ]]}||d	d
M}| D ]@}||}|| j}|| j}t|jD ]\}}||| k r||d|}q|j|krt !d|j|| |"| |#| qW d    n1 sw   Y  q}|rtdt$| dt%|| W d    |& S 1 sw   Y  |& S )Nr   )	safe_openr&   rB  *rankpartz!Could not find checkpoint files 'z8', only pre-sharded checkpoints are currently supported!pt)	framework:loading tensor of shape %s into parameter '%s' of shape %sMissing keys  in loaded state!)'safetensors.torchr  sglang.srt.distributedr&   r  r   r   r   ri   r.   rR   r]   rM   r   rD  r   rE  r   r   r   r  formatr   r   r  
state_dictr   
get_tensorrW   r[   r  narrowr5  r6  r\   r  r  r-   r?  )r   rh   r   r  r&   local_model_pathr   r~   r   rH   rB  r#  r  	filepathsr-  r   fr  r  
param_dataparam_shapedimrP   re   re   rf   r   f  sp   








//zShardedStateLoader.load_modelNr~   rI   r   r  max_sizeOptional[int]c                 C  s   ddl m} ddlm} |d u rtj}| }d}d}t|  }	i }
|	 D ]7\}}|	 |
  }|d urV|| |krV|j||d}||
tj|| |d7 }d}i }
||
|< ||7 }q't|
dkrx|j||d}||
tj|| d S d S )Nr   )	save_filer   r"  r  )r*  r8  r+  r&   r  r	  r  r-  r  nelementr  r,  r   r   r   r   )r~   r   r  r6  r8  r&   r#  part_idx
total_sizer-  state_dict_partr  r  
param_sizefilenamere   re   rf   
save_model  s:   
zShardedStateLoader.save_modelr   )r
  r  rj   r  )r   r   r   r   r   r   )NN)
r~   rI   r   r   r  r   r6  r7  rj   r   )r   r   r   r   r	  r   rM  r  r  r   r   r?  rN  re   re   r   rf   r    s    
#

Ar  c                      s   e Zd ZdZdgZg dZd; fddZd<ddZ	d=d>ddZd?ddZ	d@ddZ
dAd d!ZdBd#d$ZdBd%d&ZdCd(d)ZdCd*d+ZdCd,d-ZdDd3d4ZdEd5d6ZdFd9d:Z  ZS )GBitsAndBytesModelLoaderzAModel loader to load model weights with BitAndBytes quantization.zadapter_config.json)z.gate_proj.z.down_proj.z	.up_proj.z.q_proj.z.k_proj.z.v_proj.z.o_proj.z.fc1.z.fc2.z.dense.z.query_key_value.z
.qkv_proj.z.dense_h_to_4h.z.dense_4h_to_h.z
.out_proj.ri   r    c                   s~   t  | |jrd|jvrg | _d S |jd }| |}t|d}t|}|d | _W d    d S 1 s8w   Y  d S )Nqlora_adapter_name_or_pathrtarget_modules)r   r   r   rC  _get_config_fileopenjsonload)r   ri   qlora_adapterconfig_file_pathr2  r   r   re   rf   r     s   


"z BitsAndBytesModelLoader.__init__rH  r   rj   c                 C  s   t j|}d }|r | jD ]}t j||}t j|r nqnt }|j|d}| jD ]}||v r:t||d} nq,|sDt	d| |S )Nrepo_id)rK  r>  z#Cannot find adapter config file in )
r   r   r   possible_config_file_namesr   r   r   list_repo_filesr   r   )r   rH  r   rI  filehf_api
repo_filesre   re   rf   rD    s*   

z(BitsAndBytesModelLoader._get_config_fileNr   allowed_patternsrT  r   r   Tuple[List[str], str]c                 C  s   t j|}|r |D ]}tt j||}|r||f  S q
n2t }|j|d}|D ]&}t||}	|	rQt	|| j
j|g|| j
jd}
tt j|
||f  S q+td| d)znRetrieve weight files. Download the files if necessary.

        Return the weight files and the file pattern.rJ  r   zNo model weights found in: `r   )r   r   r   r   r   r   rM  fnmatchfilterr2   ri   r   r   r   )r   r   rQ  r   r   r  weight_filesrO  rP  matching_filesr  re   re   rf   _get_weight_files  s.   	
z)BitsAndBytesModelLoader._get_weight_filesTuple[List[str], bool]c                 C  sR   g d}|  |||\}}|dkrt|}t|dkr#td| d||dkfS )z#Prepare weight files for the model.)r   r   r   r   r   r   r   )rW  r5   r   r   )r   r   r   rQ  r  matched_patternre   re   rf   r  4  s   
z(BitsAndBytesModelLoader._prepare_weightsr   r   c                 C  s   |rt |S t|S r   )r>   r=   )r   r  r   re   re   rf   _hf_weight_iterI  s   z'BitsAndBytesModelLoader._hf_weight_iter	pre_quant	load_8bitFTuple[Generator[Tuple[str, torch.Tensor], None, None], Dict[str, Any]]c           
   
   C  s   zddl }|jdk rtdW n ty  } ztd|d}~ww | ||\}}i }	|rA|r8| |||	|	fS | |||	|	fS | |||	|	fS )zzGet an iterator to the model weights with bitsandbytes quantization,
        as well as the quantization state dictionary.r   Nz0.44.0zCbitsandbytes version is wrong. Please install bitsandbytes>=0.44.0.ziPlease install bitsandbytes>=0.44.0 via `pip install bitsandbytes>=0.44.0` to use bitsandbytes quantizer.)bitsandbytes__version__r.  r  _quantized_8bit_generator_quantized_4bit_generator_unquantized_generator)
r   r   r   r[  r\  r^  errr  r   quant_state_dictre   re   rf   _get_quantized_weights_iteratorO  sH   
z7BitsAndBytesModelLoader._get_quantized_weights_iteratorr  c                   s   ddh}t  fdd|D S )N.scbz.weight_formatc                 3  s    | ]
}   |V  qd S r   )lowerr  )r  suffixr  re   rf   r    s    z?BitsAndBytesModelLoader._is_8bit_weight_name.<locals>.<genexpr>)r  r   r  quantized_suffixre   ri  rf   _is_8bit_weight_name  s   z,BitsAndBytesModelLoader._is_8bit_weight_namec                   s,   h d}| dd  t fdd|D S )N>   absmax	quant_mapr^  nested_absmaxnested_quant_mapr   r  c                 3  r  r   re   )r  q_suffixrh  re   rf   r    s    z?BitsAndBytesModelLoader._is_4bit_weight_name.<locals>.<genexpr>)splitr  rj  re   rr  rf   _is_4bit_weight_name  s   z,BitsAndBytesModelLoader._is_4bit_weight_namer
   c                 c  s    |  ||D ]\}}| dsq| dd}|||< q|  ||D ] \}}| |r0q&||v rAt|ddi ||fV  q&||fV  q&d S )Nrf  r  load_in_8bitT)rZ  rg  r  r  rl  rE   )r   r  r   rd  r  weight_tensor
weight_keyre   re   rf   r`    s$   

z1BitsAndBytesModelLoader._quantized_8bit_generatorc           
      #  s    ddl m  | ||}i }|D ]\}}| |sqd|v r'| j||< q|||< qd fd
d}| ||D ],\}}| |rCq9| d|v sQ| d|v r`|||}	|	||< ||fV  q9||fV  q9d S )Nr   
QuantStatezquant_state.bitsandbytesr  r   temp_state_dictr	   rj   ry  c                   s4   i }|D ]}| d |v r|| ||< q j |ddS )Nr   r  rM   )	from_dict)r  rz  quant_staterq  rx  re   rf   _parse_quant_state  s   zMBitsAndBytesModelLoader._quantized_4bit_generator.<locals>._parse_quant_statez.quant_state.bitsandbytes__nf4z.quant_state.bitsandbytes__fp4)r  r   rz  r	   rj   ry  )bitsandbytes.functionalry  rZ  rt  rL   rW   )
r   r  r   rd  weight_iteratorrz  r  rv  r~  r}  re   rx  rf   ra    s.   




z1BitsAndBytesModelLoader._quantized_4bit_generatorc              	   #  sd   ddl m} t }t }| ||D ]\ }t fdd| jD r dr dd t fdd| j	D rV|
d}|| | }	|| |d	  }
|d
|	|
f }n|
d}|| | }	|| |d	  }
||	|
d
f }|jrw|}n| }| du r| }ttj ||ddd\}}W d    n1 sw   Y  || < n|} |fV  qd S )Nr   )quantize_4bitc                 3  r  r   re   )r  target_moduleri  re   rf   r    r  zABitsAndBytesModelLoader._unquantized_generator.<locals>.<genexpr>r  z.qweightc                 3  r  r   re   )r  rH   ri  re   rf   r    r  r  r  .FTnf4)compress_statistics
quant_type)r  r  r'   r&   rZ  r  rC  r  r  column_parallel_weights_modulesrP   is_cudar  r  r  r.   r]   r  )r   r  r   rd  r  r  r  rv  r;  start_index	end_indexweight_sub_tensorloaded_weightprocessed_weightr}  re   ri  rf   rb    sL   


z.BitsAndBytesModelLoader._unquantized_generatorrh   rG   r~   r   r   c                 C  s  t |dstdt|j dt |ds tdt|j dt| jdkr5t |dr1|j| _n| j| _t |d	r?|j| _ng | _t|j| _	t
d
 t|jdd }d}|d urn|d}|dkrfd}ntd| d|ryt dkrytdd}|r|dd}| |j|j||\}}|| tj  t| }	i }
|jj	}|D ]c}|}|dkrd|v r|dd}d}|j D ]\}\}}|dv rd|v r n||v r|}|||} nq|dv rd|v r|dd}||	vrtd| d||
vri |
|< || |
| |< q|	 D ]k\}}||
v ry|
| }t|d|i t|d d!}|d!kr6td"| ddgt| }| D ]\}}t|j | ||< qAt!"dgt!#|f}t$|% }t|d#|i |ryt|d$d gt| i qd S )%NrC  z;The required method 'load_weights' is not defined in class r   #bitsandbytes_stacked_params_mappingzModel z0 does not support BitsAndBytes quantization yet.r   #default_bitsandbytes_target_modulesr  zELoading weights with BitsAndBytes quantization.  May take a while ...quantization_configFrB  r^  Tz%BitsAndBytes loader does not support z quantizationr  zIPrequant BitsAndBytes models with TP is not supported.Please try with PP.ru  mllamar}   zself_attn.o_projzself_attn.proj)qwen2_vl
qwen2_5_vlr|   z	attn.qkv.zattn.qkv_proj.z
Parameter z not found in the model.bnb_quant_statepack_factorr  z"pack_factor not set for parameter bnb_shard_offsetsmatmul_state)&r7  AttributeErrorrU   r   r   rC  r  default_target_modulesr  
model_typer5  rs  r   r   r   r   r'   re  r   r   rC  r]   r  rG  rY   rV   r  r  r  rE   mathprodr[   npconcatenatecumsumr  rL   )r   rh   r~   r   r[  rB  r\  qweight_iteratorrd  
param_dictstacked_quant_state_dictr  quant_param_namenon_stacked_param_nameshard_index
shard_namer  indexr  r  quant_states
pack_rationum_elementsseqr}  offsetsre   re   rf   _load_weights  s   














z%BitsAndBytesModelLoader._load_weightsc                 C  r  r   r&  r   re   re   rf   r     r  z&BitsAndBytesModelLoader.download_modelr   rF   c             	   C  s   t || j}t|j8 t|j t|| j|}| || W d    n1 s*w   Y  W d    | S W d    | S 1 sFw   Y  | S r   )	r   ri   r.   rR   r]   rM   r   r  r?  )r   rh   r   r   r~   re   re   rf   r     s"   




z"BitsAndBytesModelLoader.load_modelr   )rH  r   rj   r   r   )r   r   rQ  rT  r   r   rj   rR  )r   r   r   r   rj   rX  )r   r   )
r   r   r   r   r[  r   r\  r   rj   r]  )r  r   )rj   r
   )rh   rG   r~   r   rj   r   r   r   )r   r   r   r   rL  r  r   rD  rW  r  rZ  re  rl  rt  r`  ra  rb  r  r   r   rN  re   re   r   rf   r@    s(    

!


7



-
7 
r@  c                      sT   e Zd ZdZd fddZddd	ZdddZdddZd ddZd!ddZ	  Z
S )"GGUFModelLoaderz
    Model loader that can load GGUF files. This is useful for loading models
    that are quantized with GGUF and saved in the GGUF format. This loader
    supports loading both full models and sharded models.
    ri   r    c                   r  r  r  r   r   re   rf   r     r  zGGUFModelLoader.__init__r   r   c                 C  s   t j|r|S t| d)Nz is not a file.)r   r   isfiler   )r   r   re   re   rf   r    s   z GGUFModelLoader._prepare_weightsrh   rG   c              
   C  s  zddl }W n ty } ztd|d}~ww |j}|j}|dkr$d}d}|j D ]\}}||kr7|} nq+|du rCtd| |j}	|||	}
t	
d t|}W d   n1 saw   Y  | }i }|D ]}|dd	\}}|
|}||| d| < qn|S )
au  
        GGUF uses this naming convention for their tensors from HF checkpoint:
        `blk.N.BB.weight` and `blk.N.BB.bias`
        where N signifies the block number of a layer, and BB signifies the
        attention/mlp layer components.
        See "Standardized tensor names" in
        https://github.com/ggerganov/ggml/blob/master/docs/gguf.md for details.
        r   NzAPlease install gguf via `pip install gguf` to use gguf quantizer.coherez	command-rzUnknown gguf model_type: rQ  r   r  )ggufr.  r   r  MODEL_ARCH_NAMESr  r   num_hidden_layersget_tensor_name_mapr]   rM   r   r1  r-  rsplitget_name)r   rh   r  rc  r   r  archr  value
num_layersname_mapdummy_modelr-  gguf_to_hf_name_maphf_namer`   rh  	gguf_namere   re   rf   _get_gguf_weights_map  sD   
z%GGUFModelLoader._get_gguf_weights_mapr  Dict[str, str]rj   r  c                 C  s
   t ||S r   )r8   )r   r   r  re   re   rf   r    s   
z%GGUFModelLoader._get_weights_iteratorr   c                 C  s   |  |j d S r   )r  r   r   re   re   rf   r        zGGUFModelLoader.download_modelr   rF   r   c             
   C  s  |  |j}| |}dt||v r|jddi t|j}t|| j	}t
|jX | t|| j	|}W d    n1 s@w   Y  || || | D ](\}}	t|	dd }
|
d urzt|	| |
|	 W d    n1 suw   Y  qRW d    |S 1 sw   Y  |S )Nrm  r  TrB  )r  r   r  r6   r   r   r]   rM   r   ri   r.   rR   r   rC  r  rD  r   rg   rE  )r   rh   r   r0  gguf_weights_maprJ   r   r~   r   rH   rB  re   re   rf   r     s6   


zGGUFModelLoader.load_modelr   )r   r   r   )r   r   r  r  rj   r  r   r   )r   r   r   r   r   r  r  r  r   r   rN  re   re   r   rf   r    s    


-
r  c                      sJ   e Zd ZdZd fddZdd
dZdddZdddZdddZ  Z	S )RemoteInstanceModelLoaderz?Model loader that can load Tensors from remote sglang instance.ri   r    c                   s,   t  | |jrtd|j d | _d S r  )r   r   r   r   r   +remote_instance_transfer_engine_weight_infor   r   re   rf   r     s   
z"RemoteInstanceModelLoader.__init__rh   rG   rj   r   c                 C  r   r   r   r   re   re   rf   r   "  r  z(RemoteInstanceModelLoader.download_modelr   rF   r   c          
   	   C  s  t d | j}|jtjksJ d| jj d|j t|| j}t|j% t	
|j
 t|| j|}W d    n1 s?w   Y  W d    n1 sNw   Y  |jtjkrd|j d|j|j  }t||j
%}t|}|tjkr| |||| ntd| dW d    | S 1 sw   Y  | S |jtjkr|jd u rtdt d	 t||j| _t d
 | ||jd|j d|j |j}	|	std| S td)Nz(Loading weights from remote instance ...Model loader " is not supported for load format zinstance://:zUnsupported connector type z! for remote tensor model loading.zdTransfer engine is not initialized for remote instance model loader with `transfer_engine` backend. zJTransferEngine registering memory regions (this may take a few seconds)...z@TransferEngine memory regions have been successfully registered.zhttp://z@Failed to load weights from remote instance via transfer engine.z.Invalid remote instance weight loader backend.) r5  rs  ri   r   r!   REMOTE_INSTANCEr   r.   rR   r]   rM   r   %remote_instance_weight_loader_backendr   NCCL.remote_instance_weight_loader_seed_instance_ip6remote_instance_weight_loader_send_weights_group_portsr  r#   r$   r"   INSTANCE'load_model_from_remote_instance_by_ncclr   TRANSFER_ENGINE-remote_instance_weight_loader_transfer_enginer   r   r  2load_model_from_remote_instance_by_transfer_engine8remote_instance_weight_loader_seed_instance_service_portr?  )
r   rh   r   ri   r   r~   model_weightsclientconnector_typesuccessre   re   rf   r   %  sx   



--
z$RemoteInstanceModelLoader.load_modelc                 C  s<  | j }tt }t }|j|j|j|d tj	
  t }td|| dd |jdkrDtjt|j|j|j|fd}	|	  t }
t|j* | D ]\}}tjj|jd|jd qRtj	
  t|drp|  W d    n1 szw   Y  t }td	||
 dd tjj|j tj	  d S )
N)gpu_idr  instance_ipz6finish building group for remote instance, time used: z.4fsr   )r  args)srcr  r-   z<finish getting all weights from remote instance, time used: ) ri   socketgethostbynamegethostnamer  build_groupr  r  r]   r  synchronizer5  debug	threadingThreadr+   r  r  r  r  r.   rR   rV   distributed	broadcastrW   _model_update_groupr7  r-   distributed_c10ddestroy_process_grouprG  )r   r~   r  rh   r   ri   r  start_build_group_ticend_build_group_ticr  start_get_weights_ticr   r  end_get_weights_ticre   re   rf   r  l  sX   

	

zARemoteInstanceModelLoader.load_model_from_remote_instance_by_ncclr   c                 C  sH  t ||\}}|d u s|d u rtd dS g }g }g }	| D ]b\}
}||
d }|d u r:td|
 d  dS |\}}}|| ksK|| krgtd|
 d| d| d|  d|  d	  dS | }| |  }|| || |	| q |	||||	}|d
k rtd|  dS t
|dr|  dS )Nz2Cannot get transfer engine session or weight info.FzCannot find weight info for r   zWeight info does not match for z, expected (z, z), got (r  r   zbatch transfer failed, error: r-   T)r   r5  errorrV   r   r  r  rZ   r  batch_transfer_sync_readr7  r-   )r   r~   transfer_engineseed_urlr  seed_transfer_engine_session_id seed_transfer_engine_weight_infoseed_ptr_listclient_ptr_listclient_len_listr`   r  weight_infoseed_ptr
seed_numelseed_element_size
client_ptr
client_lenretre   re   rf   r    s\   




zLRemoteInstanceModelLoader.load_model_from_remote_instance_by_transfer_enginer   r   r   )rj   r   )
r   r   r   r   r   r   r   r  r  rN  re   re   r   rf   r    s    
	

G3r  c                      sl   e Zd ZdZd! fddZd"dd	Zd"d
dZd#ddZed$ddZ	d%ddZ
d&ddZd&dd Z  ZS )'RemoteModelLoaderz8Model loader that can load Tensors from remote database.ri   r    c                   s   t  | t| d S r   )r   r   r?   r   r   re   rf   r     s   zRemoteModelLoader.__init__rj   r  c                 C  s"   t |tjks	J t }||S z:Get an iterator for the model weights from remote storage.)r$   r"   KVr&   r  )r   r  r#  re   re   rf   _get_weights_iterator_kv  s   
z*RemoteModelLoader._get_weights_iterator_kvc                 C  s   t |tjks	J | S r  )r$   r"   FSr  )r   r  re   re   rf   _get_weights_iterator_fs  s   z*RemoteModelLoader._get_weights_iterator_fsrh   rG   r   c                 C  r   r   re   r   re   re   rf   r     r  z RemoteModelLoader.download_modelr~   rI   r   r   urlc                 C  s*  t |}t|tjksJ t|}t }t|  }|	 D ]\}}| d| d| }	|
|	| q t|D ]H\}
}}|D ]@}|drIqAtj|d dv rtj|
|}t|dd}| }| d| }||| W d    n1 s|w   Y  qAq:W d    d S 1 sw   Y  d S )	Nz/keys/rank_/r   r  )z.jsonz.pyzutf-8)encodingz/files/)r#   r$   r"   r  r%   r&   r  r  r-  r  r   r   walkr  r   splitextr   rE  readsetstr)r~   r   r  r  
model_namer#  r-  r  r  r_keyrootr   files	file_name	file_pathrN  file_contentf_keyre   re   rf   r?    s.   

"zRemoteModelLoader.save_modelr   c                 C  s   |  D ]\}}t|dd }|d ur|| q| |}t| }|D ]=\}	}
||	 j}||	 j}t	|
jD ]\}}||| k rJ|
|d|}q9|
j|krYtd|
j|	| ||
 ||	 q&|rptdt| dt|| d S )NrB  r   r'  r(  r)  )rD  r   rE  r  r  r  r-  rW   r[   r  r/  r5  r6  r\   r  r   r  r-   )r   r~   rh   r  r   rH   rB  r  r-  r  r  r3  r4  r5  rP   re   re   rf   _load_model_from_remote_kv	  s4   





z,RemoteModelLoader._load_model_from_remote_kvr   rF   c           	   
   C  s   t |j}t|j> || | | D ](\}}t|dd }|d ur@t|| |	| W d    n1 s;w   Y  qW d    d S 1 sLw   Y  d S rA  )
r]   rM   r.   rR   rC  r  rD  r   rg   rE  )	r   r~   r  rh   r   rJ   r   rH   rB  re   re   rf   _load_model_from_remote_fs0	  s   "z,RemoteModelLoader._load_model_from_remote_fsc             	   C  sN  t d t }| j}|jtjksJ d| jj d|j |j}t	|dr*|j
}t|| j}t|jZ t|j t|| j|}W d    n1 sNw   Y  t||jd&}t|}	|	tjkrl| ||| n|	tjkry| |||| W d    n1 sw   Y  W d    n1 sw   Y  t }
t d|
|  | S )Nz'Loading weights from remote storage ...r  r  r  r{  z3Loaded weights from remote storage in %.2f seconds.)r5  rs  r  r  ri   r   r!   REMOTEr   r7  r  r   r.   rR   r]   rM   r   r#   r$   r"   r  r  r   r  r?  )r   rh   r   r  ri   r  r   r~   r  r  r  re   re   rf   r   C	  sD   



zRemoteModelLoader.load_modelr   )rj   r  r   )r~   rI   r   r   r  r   rj   r   )r~   r   rh   rG   r   )r   r   r   r   r   r  r  r   rM  r?  r  r  r   rN  re   re   r   rf   r    s    

	


 r  r   rF   c          	   
   C  s   t |j}t|| j}t|jR t|| j|}t| ts'|	| 
|| | D ](\}}t|dd }|d urSt|| || W d    n1 sNw   Y  q+|| W d    | S 1 sfw   Y  | S rA  )r]   rM   r   ri   r.   rR   r   r3  r  rC  r%  rD  r   rg   rE  rX   r?  )	r   rh   r   rJ   r   r~   r   rH   rB  re   re   rf   r  l	  s,   

r  c                      sf   e Zd ZdZd! fddZ			d"d#ddZd$ddZ		d%d&ddZd' fddZd'dd Z	  Z
S )(ModelOptModelLoaderzG
    Model loader that applies NVIDIA Model Optimizer quantization
    ri   r    c                   s   t  | d S r   )r   r   r   r   re   rf   r   	  r  zModelOptModelLoader.__init__Nquantized_ckpt_restore_path
str | Nonequantized_ckpt_save_pathexport_pathrj   r   c              
   C  s  zddl m  m} ddlm  m} ddlm}	 W n ty+ }
 ztd|
d}
~
ww |	|r6td dS |rpz|	|| td|  | 
|| W dS  tyo }
 ztd| d|
  td	 W Y d}
~
nd}
~
ww z|tt d
|_W d   n1 sw   Y  ddlm}m} |d|dd|jdd}||d}|j|||d t rt dkr|| |rz||| td|  W n ty }
 ztd| d|
  W Y d}
~
nd}
~
ww | 
|| W dS  ty }
 ztd|
 |
d}
~
ww )aZ  
        Set up ModelOpt quantization for the given model.

        Args:
            model: The model to quantize
            tokenizer: The tokenizer associated with the model
            quant_cfg: The quantization configuration
            quantized_ckpt_restore_path: Path to restore quantized checkpoint from
            quantized_ckpt_save_path: Path to save quantized checkpoint to
            export_path: Path to export the quantized model in HuggingFace format

        Raises:
            ImportError: If ModelOpt is not available
            Exception: If quantization setup fails
        r   N)is_quantizedz3ModelOpt is not available. Please install modelopt.z8Model is already quantized, skipping quantization setup.zRestored quantized model from zFailed to restore from r   z1Proceeding with calibration-based quantization...left)create_forward_loopget_dataset_dataloadercnn_dailymail$   i   F)dataset_name	tokenizer
batch_sizenum_samplesrM   include_labels)
dataloader)forward_loopzQuantized model saved to z'Failed to save quantized checkpoint to z(Failed to set up ModelOpt quantization: )modelopt.torch.optr]   optmodelopt.torch.quantizationr   !modelopt.torch.quantization.utilsr  r.  rD   restore_maybe_export_modelopt	Exceptionr5  r6  r   padding_side"modelopt.torch.utils.dataset_utilsr  r  rM   quantizer(   r&   print_quant_summarysave)r   r~   r   	quant_cfgr  r  r  mtomtqr  er  r  calib_dataloadercalibrate_loopre   re   rf   _setup_modelopt_quantization	  s   

	

z0ModelOptModelLoader._setup_modelopt_quantizationc              
   C  sr   |r7zt | dd}| ||| td|  W dS  ty6 } ztd| d|  W Y d}~dS d}~ww dS )z>Export model to HuggingFace format if export_path is provided._original_model_pathNz2Quantized model exported to HuggingFace format at z-Warning: Failed to export quantized model to r   )r   _export_modelopt_checkpointrD   r,  )r   r~   r  original_model_pathr5  re   re   rf   r+  	  s    
z*ModelOptModelLoader._maybe_export_modeloptTr   r   r'  r   c           	   
   C  s   zddl m} ddlm} W n ty } ztd|d}~ww tj|dd |||d |r`z|j||d	}|| t	d
|  W dS  t
y_ } zt	d|  W Y d}~dS d}~ww dS )a  
        Export the quantized model to HuggingFace format using ModelOpt export API.

        Args:
            model: The quantized model to export
            export_path: Directory path to export the model to
            model_path: Path to the original model (for tokenizer export)
            trust_remote_code: Whether to trust remote code for tokenizer loading

        Raises:
            ImportError: If ModelOpt export functionality is not available
            Exception: If export fails
        r   )export_hf_checkpoint)r   zpModelOpt export functionality is not available. Please ensure you have the latest version of modelopt installed.NT)exist_ok)
export_dir)r'  zTokenizer exported to z%Warning: Failed to export tokenizer: )modelopt.torch.exportr<  transformersr   r.  r   makedirsr/  save_pretrainedrD   r,  )	r   r~   r  r   r'  r<  r   r5  r   re   re   rf   r:  
  s2   
z/ModelOptModelLoader._export_modelopt_checkpointrh   rG   r   rF   r   c                  sJ   t d |j| _| rt d t j||dS t d | ||S )Nz*ModelOptModelLoader: Loading base model...z/Model is already quantized, loading directly...r  z9Standard quantization mode: Will quantize and export/save)r5  rs  r   r9  _is_already_quantizedr   r   _standard_quantization_workflowr   r   re   rf   r   ;
  s   


zModelOptModelLoader.load_modelc              
   C  sj  |  |}zddlm  m} W n ty   td  w t|dr*|jr*|j}n|	 }t
|}|sCtd| dtt
  zt||}W n tyY   td| dw td	|  | jj}|rk|jnd}	|rr|jnd}
|ry|jnd}tj|jd
d}z| j||||	|
|d W | S  ty } ztd|  td W Y d}~| S d}~ww )zUStandard quantization workflow: quantize, save checkpoint, export, then return model.r   NzdNVIDIA Model Optimizer (modelopt) library not found. Please install it to use ModelOpt quantization.r,  zInvalid quantization choice: 'z'. Available choices: zModelOpt quantization config 'z=' not found. Please verify the ModelOpt library installation.z1Quantizing model with ModelOpt using config: mtq.T)use_fast)r  r  r  zModelOpt quantization failed: z"Proceeding without quantization...)r>  r(  r]   r   r.  r5  r  r7  r,  r8  r)   r   r   r  r   r   r  rs  ri   modelopt_configcheckpoint_restore_pathcheckpoint_save_pathr  r   r/  r   r8  r,  r6  rD   r?  )r   rh   r   r~   r4  r=  quant_cfg_namer2  rF  r  r  r  r   r5  re   re   rf   rD  T
  sl   



z3ModelOptModelLoader._standard_quantization_workflowr   )NNN)r  r  r  r  r  r  rj   r   )r  r  rj   r   )NT)r  r   r   r   r'  r   rj   r   r   )r   r   r   r   r   r8  r+  r:  r   rD  rN  re   re   r   rf   r  	  s    	
e.r  Optional[ModelConfig]c                 C  s  | j tjkr
t| S |r"t|dr|js|jdv r"td t	| S |rLt|drL|jdv rL|
 r?td|j  t	| S td|j  t	| S t| j trW|  | S | j tjkrat| S | j tjkrkt| S | j tjkrut| S | j tjkrt| S | j tjkrtd td	 |r|jstd
 d|_t| S | j tjkrt| S | j tjkrt| S | j tjkrddl}z|d}|| W S  ty   t dw t!| S )z,Get a model loader based on the load format.r,  )modelopt_fp8modelopt_fp4modeloptz>Using ModelOptModelLoader due to ModelOpt quantization config.r   )rK  rL  z3Using ModelOptModelLoader for pre-quantized model: z,Using ModelOptModelLoader for quantization: zJUsing QuantizedRLModelLoader for RL training with native FP8 quantization.zoFP8 approach: Model loads with native SGLang FP8 quantization. Same model path for both training and inference.znQuantizedRL: Setting quantization to fp8 (native SGLang support). Model will be loaded with FP8 infrastructurefp8r   Nz#sglang.private.private_model_loaderz4Failed to import sglang.private.private_model_loader)"r   r!   r   r  r7  r,  r   r5  rs  r  rC  r3  rU   SHARDED_STATEr  BITSANDBYTESr@  GGUFr  LAYEREDrO  FLASH_RLr`  r  r  r  r  PRIVATE	importlibimport_modulePrivateModelLoaderr.  r   r   )ri   rh   rU  rH   re   re   rf   get_model_loader
  sx   







rX  )rH   rI   rJ   rK   )rh   rG   ri   r    rj   rk   r   )rh   rG   ri   r    r   rk   rj   r   r   )ri   r    rh   rJ  rj   r   )
__future__r   r  rK  rS  r  r   rF  loggingr  r   rI  r  r  r  abcr   r   
contextlibr   r   typingr   r   r	   r
   r   r   r   r   r   r   r   numpyr  r]   ;sglang.srt.model_loader.remote_instance_weight_loader_utilsr   r   r   r^  r   
accelerater   r   accelerate.utilsr   r-  r.  r   r   r   r@  r   r   r   transformers.utilsr   sglang.srt.configs.load_configr    r!   sglang.srt.connectorr"   r#   r$   sglang.srt.connector.utilsr%   r+  r&   r'   r(    sglang.srt.layers.modelopt_utilsr)   *sglang.srt.layers.quantization.base_configr*   r+   sglang.srt.model_loader.utilsr,   r-   r.   r4  sglang.srt.environr/   $sglang.srt.model_loader.weight_utilsr0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   sglang.srt.utilsr@   rA   rB   rC   rD   rE    sglang.srt.configs.device_configrF   sglang.srt.configs.model_configrG   r   	getLoggerr   r5  rg   r   r   r   r   rO  r`  r  r  r@  r  r  r  r  r  rX  re   re   re   rf   <module>   s   0H 	

;
D   J   \8 6   aj D 
"  