o
    پi(                     @  s
  d dl mZ d dlmZmZmZmZmZ d dlZd dl	m
Z
 d dlmZmZmZ d dlmZ d dlmZmZ d dlmZmZmZmZ d d	lmZmZmZ d d
lmZmZm Z m!Z! d dl"m#Z# erhd dl$m%Z%m&Z& e Z'G dd deZ(G dd deZ)G dd deZ*dS )    )annotations)TYPE_CHECKINGAnyDictListOptionalN)	Parameter)	MoeRunnerMoeRunnerBackendMoeRunnerConfig)TritonMoeQuantInfo)ChannelQuantScaleParameterModelWeightParameter)FusedMoEMethodBaseLinearMethodBaseQuantizationConfigQuantizeMethodBase)	fp8_dtypeis_fp8_fnuzper_token_group_quant_fp8)apply_fp8_linearcutlass_fp8_supportedinput_to_float8normalize_e4m3fn_to_e4m3fnuz)set_weight_attrs)CombineInputStandardDispatchOutputc                   @  sv   e Zd ZdZd!d"ddZed#d	d
Zed$ddZed%ddZed&ddZ	ed'ddZ
d(ddZd&ddZd S ))W8A8Fp8Configa  Config class for W8A8 FP8 Quantization.

    Weight Quantization:
    - Method: Static quantization
    - Granularity: Per-channel
    - Type: Symmetric

    Activation Quantization:
    - Method: Dynamic quantization
    - Granularity: Per-token
    - Type: Symmetric

    Note:
    - For models without offline quantization, weights will be quantized during model loading
    - If CUTLASS is supported: Per-channel weight quantization is used
    - If CUTLASS is not supported: Falls back to per-tensor weight quantization
    Fis_checkpoint_fp8_serializedboolc                 C  
   || _ d S Nr   )selfr    r$   [/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/quantization/w8a8_fp8.py__init__:      
zW8A8Fp8Config.__init__returnList[torch.dtype]c                 C  s   t jt jgS r!   )torchfloat16bfloat16clsr$   r$   r%   get_supported_act_dtypes=   s   z&W8A8Fp8Config.get_supported_act_dtypesintc                 C     dS )NY   r$   r-   r$   r$   r%   get_min_capabilityA      z W8A8Fp8Config.get_min_capabilitystrc                 C  r1   )Nw8a8_fp8r$   r#   r$   r$   r%   get_nameE   r4   zW8A8Fp8Config.get_name	List[str]c                 C     g S r!   r$   r-   r$   r$   r%   get_config_filenamesI   r4   z"W8A8Fp8Config.get_config_filenamesconfigDict[str, Any]c                 C  s(   |  |dg}d|v pd|v }| |dS )Nquant_methodzcompressed-tensorsr6   r"   )get_from_keys)r.   r<   r>   r   r$   r$   r%   from_configM   s   
zW8A8Fp8Config.from_configlayertorch.nn.ModuleprefixOptional[QuantizeMethodBase]c                 C  s@   ddl m} ddlm} t||rt| S t||rt| S d S )Nr   )
LinearBase)FusedMoE)sglang.srt.layers.linearrE   &sglang.srt.layers.moe.fused_moe_tritonrF   
isinstanceW8A8Fp8LinearMethodW8A8FP8MoEMethod)r#   rA   rC   rE   rF   r$   r$   r%   get_quant_methodU   s   

zW8A8Fp8Config.get_quant_methodc                 C  r:   r!   r$   r7   r$   r$   r%   get_scaled_act_namesc   s   z"W8A8Fp8Config.get_scaled_act_namesN)F)r   r   )r(   r)   )r(   r0   )r(   r5   )r(   r9   )r<   r=   r(   r   )rA   rB   rC   r5   r(   rD   )__name__
__module____qualname____doc__r&   classmethodr/   r3   r8   r;   r@   rL   rM   r$   r$   r$   r%   r   '   s    
r   c                   @  s8   e Zd ZdddZdd	d
ZdddZ	dd ddZdS )!rJ   quantization_configr   c                 C  s   t  | _ || _d S r!   )r   rS   )r#   rS   r$   r$   r%   r&   i   s   
zW8A8Fp8LinearMethod.__init__rA   rB   r(   Nonec                 C  s   |j }| jjr)|j }trt||d\}}}t| dd|_ t|dd|_d S | j	r?t
|j |j jd \}}|  }n	t|j td\}}t| dd|_ t|dd|_d |_d S )N)weightweight_scaleFrequires_graddtype)rU   rS   r   rV   detach_is_fp8_fnuzr   r   tr   r   shape
contiguousr   r   input_scale)r#   rA   rU   rV   _qweightr$   r$   r%   process_weights_after_loadingm   s$   

z1W8A8Fp8LinearMethod.process_weights_after_loadinginput_size_per_partitionr0   output_partition_sizes	List[int]
input_sizeoutput_sizeparams_dtypetorch.dtypec                 K  s   | j jrtjn|}|d}	|| _ttjt|||ddd|	d}
|	d|
 | j jrDt
tjt|dftjdd|	d}|	d| d S d |_d S )	Nweight_loaderrZ      r   )data	input_dim
output_dimrl   rU   )rn   rp   rl   rV   )rS   r   r*   float8_e4m3fngetlogical_widthsr   emptysumregister_parameterr   float32rV   )r#   rA   re   rf   rh   ri   rj   extra_weight_attrsweight_dtyperl   rU   rV   r$   r$   r%   create_weights   s2   


z"W8A8Fp8LinearMethod.create_weightsNxtorch.TensorbiasOptional[torch.Tensor]c                 C  s   t ||j|j|| jdS )N)r}   r   )r   rU   rV   r   )r#   rA   r{   r}   r$   r$   r%   apply   s   zW8A8Fp8LinearMethod.apply)rS   r   rA   rB   r(   rT   )rA   rB   re   r0   rf   rg   rh   r0   ri   r0   rj   rk   r!   )rA   rB   r{   r|   r}   r~   )rN   rO   rP   r&   rd   rz   r   r$   r$   r$   r%   rJ   g   s    


 -rJ   c                   @  sB   e Zd ZdZdddZdddZd ddZd!ddZd"ddZdS )#rK   as  MoE method for FP8.
    Supports loading FP8 checkpoints with static weight scale and
    dynamic/static activation scale.
    Also supports loading quantized FP16/BF16 model checkpoints with dynamic
    activation scaling. The weight scaling factor will be initialized after
    the model weights are loaded.
    Args:
        quant_config: The quantization config.
    quant_configr   c                 C  r    r!   )r   )r#   r   r$   r$   r%   r&      r'   zW8A8FP8MoEMethod.__init__rA   rB   num_expertsr0   hidden_sizeintermediate_size_per_partitionrj   rk   c                 K  s  ddl m} tjjtj|d| |tddd}|d| t|| tjjtj|||tddd}	|d|	 t|	| tjjtj	|d| d	tj
ddd}
tjjtj	||d	tj
ddd}|d
|
 |d| |d|jji t|
| t|| d }|d| d }|d| d S )Nr   )FusedMoeWeightScaleSupported   rZ   FrW   
w13_weight	w2_weightrm   w13_weight_scalew2_weight_scaler>   w13_input_scalew2_input_scale)rH   r   r*   nnr   rt   r   rv   r   onesrw   updateCHANNELvalue)r#   rA   r   r   r   rj   rx   r   r   r   r   r   r   r   r$   r$   r%   rz      sX   		
	



zW8A8FP8MoEMethod.create_weightsr(   rT   c                 C  sH   t |jdd|_t |jdd|_t |jjdd|_t |jjdd|_d S )NFrW   )r   r   r   r   rn   r   )r#   rA   r$   r$   r%   rd     s   z.W8A8FP8MoEMethod.process_weights_after_loadingmoe_runner_configr   c                 C  s   || _ ttj|| _d S r!   )r   r	   r
   TRITONrunner)r#   rA   r   r$   r$   r%   create_moe_runner  s   z"W8A8FP8MoEMethod.create_moe_runnerdispatch_outputr   r   c              
   C  s2   t |j|jdd|j|j|j|jd}| j||S )NT)r   r   use_fp8_w8a8per_channel_quant	w13_scalew2_scale	a13_scalea2_scale)	r   r   r   r   r   r   r   r   run)r#   rA   r   
quant_infor$   r$   r%   r   !  s   
zW8A8FP8MoEMethod.applyN)r   r   )
rA   rB   r   r0   r   r0   r   r0   rj   rk   r   )rA   rB   r   r   )rA   rB   r   r   r(   r   )	rN   rO   rP   rQ   r&   rz   rd   r   r   r$   r$   r$   r%   rK      s    



>

rK   )+
__future__r   typingr   r   r   r   r   r*   torch.nn.parameterr   sglang.srt.layers.moer	   r
   r   'sglang.srt.layers.moe.moe_runner.tritonr   sglang.srt.layers.parameterr   r   *sglang.srt.layers.quantization.base_configr   r   r   r   )sglang.srt.layers.quantization.fp8_kernelr   r   r   (sglang.srt.layers.quantization.fp8_utilsr   r   r   r   sglang.srt.utilsr   &sglang.srt.layers.moe.token_dispatcherr   r   r]   r   rJ   rK   r$   r$   r$   r%   <module>   s"    @^