o
    پi2                     @  s  d dl mZ d dlZd dlmZ d dlmZmZmZm	Z	m
Z
mZmZ d dlZd dlmZ d dlmZ d dlmZmZ d dlmZmZmZ d d	lmZ d d
lmZmZ d dlmZm Z m!Z!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z*m+Z+m,Z,m-Z-m.Z. d dl/m0Z0 erd dl1m2Z2 e, Z3e* Z4e+ Z5e3rd dl6m7Z7 e0d	dddZ8e9e:Z;G dd de!Z<G dd de Z=G dd deZ>dS )    )annotationsN)MappingProxyType)TYPE_CHECKINGAnyDictListMappingOptionalcast)	Parameter)$get_tensor_model_parallel_world_size)CPUQuantMethod!_amx_process_weight_after_loading)	MoeRunnerMoeRunnerBackendMoeRunnerConfig)TritonMoeQuantInfo)ChannelQuantScaleParameterModelWeightParameter)FusedMoEMethodBaseLinearMethodBaseQuantizationConfigQuantizeMethodBase)should_ignore_layer)per_token_quant_int8)UnquantizedLinearMethod)cpu_has_amx_supportis_cpuis_cudaset_weight_attrsuse_intel_amx_backend)register_fake_if_exists)StandardDispatchOutput)int8_scaled_mmzsgl_kernel::int8_scaled_mmc                 C  s&   | j d }|j d }| j||f|dS )Ndtype)shape	new_empty)mat_amat_bscales_ascales_b	out_dtypebiasMN r2   \/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/quantization/w8a8_int8.py_int8_scaled_mm_abstract.   s   
	
r4   c                      s   e Zd ZdZi fd" fddZed#dd	Zed$ddZed%ddZed&ddZ	ed'ddZ
d(ddZei fd)ddZd&d d!Z  ZS )*W8A8Int8ConfigzConfig class for W8A8 Quantization.

    - Weight: static, per-channel, symmetric
    - Activation: dynamic, per-token, symmetric
    quant_configDict[str, Any]c                   sn   t    || _|dd| _ttt |dg }|d ur |ng | _|di }|d ur2|| _	d S i | _	d S )N
is_dynamicFignorepacked_modules_mapping)
super__init__quant_descriptiongetr8   r
   r   strr9   r:   )selfr6   r9   r:   	__class__r2   r3   r<   F   s   

zW8A8Int8Config.__init__returnList[torch.dtype]c                 C  s   t jt jgS N)torchfloat16bfloat16clsr2   r2   r3   get_supported_act_dtypesQ   s   z'W8A8Int8Config.get_supported_act_dtypesintc                 C     dS )NK   r2   rI   r2   r2   r3   get_min_capabilityU      z!W8A8Int8Config.get_min_capabilityr?   c                 C  rM   )N	w8a8_int8r2   r@   r2   r2   r3   get_nameY   rP   zW8A8Int8Config.get_name	List[str]c                 C  s   g }|S rE   r2   )rJ   	filenamesr2   r2   r3   get_config_filenames]   s   z#W8A8Int8Config.get_config_filenamesconfigc                 C  s   | |S rE   r2   )rJ   rW   r2   r2   r3   from_configb   s   zW8A8Int8Config.from_configlayertorch.nn.ModuleprefixOptional[QuantizeMethodBase]c                 C  sX   ddl m} ddlm} t|| j| jdrt S t||r!t	| S t||r*t
| S d S )Nr   )
LinearBase)FusedMoE)r9   fused_mapping)sglang.srt.layers.linearr]   &sglang.srt.layers.moe.fused_moe_tritonr^   r   r9   r:   r   
isinstanceW8A8Int8LinearMethodW8A8Int8MoEMethod)r@   rY   r[   r]   r^   r2   r2   r3   get_quant_methodf   s   


zW8A8Int8Config.get_quant_methodr_   Mapping[str, List[str]]c                   s     dd |v r; fdd| D }d }|D ]}| j|d  dk}|d u r-|}q||kr9td  dqn	| j d  dk}|d usJJ |S )	N.r%   c                   s   g | ]}  |qS r2   )replace).0shard_proj_namer[   	proj_namer2   r3   
<listcomp>~   s    
z3W8A8Int8Config.is_layer_skipped.<locals>.<listcomp>z.weightFLOATz$Detected some but not all shards of zF are quantized. All shards of fused layers to have the same precision.)splitr=   
ValueError)r@   r[   r_   shard_prefixes
is_skippedshard_prefixis_shard_skippedr2   rk   r3   is_layer_skippedx   s(   
zW8A8Int8Config.is_layer_skippedc                 C  s   g S rE   r2   rR   r2   r2   r3   get_scaled_act_names   s   z#W8A8Int8Config.get_scaled_act_names)r6   r7   )rC   rD   )rC   rL   )rC   r?   )rC   rT   )rW   r7   rC   r5   )rY   rZ   r[   r?   rC   r\   )r[   r?   r_   rf   )__name__
__module____qualname____doc__r<   classmethodrK   rO   rS   rV   rX   re   r   ru   rv   __classcell__r2   r2   rA   r3   r5   ?   s"    
r5   c                   @  s8   e Zd ZdddZdd	d
ZdddZ	dd ddZdS )!rc   quantization_configr5   c                 C  
   || _ d S rE   )r}   )r@   r}   r2   r2   r3   r<         
zW8A8Int8LinearMethod.__init__rY   rZ   rC   Nonec                 C  sH   t rtsJ dt|dg n
t|j dd|_t|jjdd|_d S )Nz=W8A8Int8LinearMethod on CPU requires that CPU has AMX supportweightFrequires_grad)_is_cpu_is_cpu_amx_availabler   r   r   tweight_scaledatar@   rY   r2   r2   r3   process_weights_after_loading   s   z2W8A8Int8LinearMethod.process_weights_after_loadinginput_size_per_partitionrL   output_partition_sizes	List[int]
input_sizeoutput_sizeparams_dtypetorch.dtypec                 K  sp   | d}|| _ttjt||tjddd|d}	|d|	 ttjt|dftj	dd|d}
|d|
 d S )	Nweight_loaderr&      r   )r   	input_dim
output_dimr   r   )r   r   r   r   )
r>   logical_widthsr   rF   emptysumint8register_parameterr   float32)r@   rY   r   r   r   r   r   extra_weight_attrsr   r   r   r2   r2   r3   create_weights   s"   
z#W8A8Int8LinearMethod.create_weightsNxtorch.Tensorr/   Optional[torch.Tensor]c           
      C  s   t |rtjj||j|j||jdS t|\}}|	d|j
d }|	d|j
d }g |j
d d |jj
d }t||j||j|j|d}	|		|S )NTr%   r   )r.   r/   )r    rF   ops
sgl_kernelint8_scaled_mm_with_quantr   r   r'   r   viewr(   r#   )
r@   rY   r   r/   x_qx_scalex_q_2d
x_scale_2doutput_shapeoutputr2   r2   r3   apply   s,   
	zW8A8Int8LinearMethod.apply)r}   r5   rY   rZ   rC   r   )rY   rZ   r   rL   r   r   r   rL   r   rL   r   r   rE   )rY   rZ   r   r   r/   r   )rw   rx   ry   r<   r   r   r   r2   r2   r2   r3   rc      s    



#rc   c                   @  sB   e Zd ZdZdddZdddZd ddZd!ddZd"ddZdS )#rd   au  MoE method for INT8.
    Supports loading INT8 checkpoints with static weight scale and
    dynamic/static activation scale.
    Also supports loading quantized FP16/BF16 model checkpoints with dynamic
    activation scaling. The weight scaling factor will be initialized after
    the model weights are loaded.
    Args:
        quant_config: The quantization config.
    r6   r5   c                 C  r~   rE   )r6   )r@   r6   r2   r2   r3   r<      r   zW8A8Int8MoEMethod.__init__rY   rZ   num_expertsrL   hidden_sizeintermediate_size_per_partitionr   r   c                 K  s(  ddl m} t }tjjtj|d| |tjddd}	|d|	 t	|	| tjjtj|||tjddd}
|d|
 t	|
| tjjtj
|d| d	tjddd}tjjtj
||d	tjddd}|d
| |d| |d|jji t	|| t	|| d }|d| d }|d| d S )Nr   )FusedMoeWeightScaleSupported   r&   Fr   
w13_weight	w2_weightr   w13_weight_scalew2_weight_scalequant_methodw13_input_scalew2_input_scale)ra   r   r   rF   nnr   r   r   r   r   onesr   updateCHANNELvalue)r@   rY   r   r   r   r   r   r   tp_sizer   r   r   r   r   r   r2   r2   r3   r      sZ   		
	



z W8A8Int8MoEMethod.create_weightsrC   r   c                 C  sh   t rtsJ dt|ddg nt|jdd|_t|jdd|_t|jjdd|_t|jjdd|_d S )Nz:W8A8Int8MoEMethod on CPU requires that CPU has AMX supportr   r   Fr   )	r   r   r   r   r   r   r   r   r   r   r2   r2   r3   r   8  s   z/W8A8Int8MoEMethod.process_weights_after_loadingmoe_runner_configr   c                 C  s   || _ ttj|| _d S rE   )r   r   r   TRITONrunner)r@   rY   r   r2   r2   r3   create_moe_runnerH  s   z#W8A8Int8MoEMethod.create_moe_runnerdispatch_outputr"   r   c                 C  s   ddl m} |j}|j}t|rBddlm} |\}}}	|| jj||\}}t	j
j||j|j||dtj|j|jd d d d}
||
dS t|j|jdd|j|j|j|jd}| j||S )Nr   )StandardCombineInput)apply_topk_weights_cpuFT)hidden_states)r   r   use_int8_w8a8per_channel_quant	w13_scalew2_scale	a13_scalea2_scale)&sglang.srt.layers.moe.token_dispatcherr   r   topk_outputr    sglang.srt.layers.moe.topkr   r   apply_router_weight_on_inputrF   r   r   fused_experts_cpur   r   r   	INT8_W8A8r   r   r   r   r   r   run)r@   rY   r   r   r   r   r   topk_weightstopk_ids_r   
quant_infor2   r2   r3   r   N  sH   



zW8A8Int8MoEMethod.applyN)r6   r5   )
rY   rZ   r   rL   r   rL   r   rL   r   r   r   )rY   rZ   r   r   )rY   rZ   r   r"   rC   r   )	rw   rx   ry   rz   r<   r   r   r   r   r2   r2   r2   r3   rd      s    



@
rd   rE   )?
__future__r   loggingtypesr   typingr   r   r   r   r   r	   r
   rF   torch.nn.parameterr   sglang.srt.distributedr   sglang.srt.layers.amx_utilsr   r   sglang.srt.layers.moer   r   r   'sglang.srt.layers.moe.moe_runner.tritonr   sglang.srt.layers.parameterr   r   *sglang.srt.layers.quantization.base_configr   r   r   r   7sglang.srt.layers.quantization.compressed_tensors.utilsr   *sglang.srt.layers.quantization.int8_kernelr   &sglang.srt.layers.quantization.unquantr   sglang.srt.utilsr   r   r   r   r    sglang.srt.utils.patch_torchr!   r   r"   _is_cudar   r   r   r#   r4   	getLoggerrw   loggerr5   rc   rd   r2   r2   r2   r3   <module>   s>    $
\O