o
    پi&                    @  s  d dl mZ d dlZd dlmZ d dlmZmZmZm	Z	m
Z
 d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZmZmZ d dlmZm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z&m'Z' d dl(m)Z)m*Z*m+Z+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2m3Z3m4Z4 d dl5m6Z6 d dl7m8Z8 d dl9m:Z:m;Z;m<Z<m=Z=m>Z> d dl?m@Z@ d dlAmBZBmCZCmDZDmEZE d dlFmGZG d dlHmIZI erd dlJmKZK d dlLmMZM d dlNmOZOmPZP d dlQmRZR dZSz%eD rzd dlTmSZS W n eUy   d dlVmWZS Y nw d dlVmWZS W n eUy   dZSY nw zd d lTmXZY d d!lTmZZZm[Z[ d"Z\W n eUyF   eC r<d d#lVm]Z^ d$Z\dZZdZ_dZ[Y nw zd d%l`maZb d d&lcmdZd W n eUyi   dZbG d'd( d(eZdY nw eeefZgdbd4d5ZheGehd6dbd7d8ZieC reD seSdureId9d:d; ZjeBd<d=Zkd>ZldcdAdBZmelelfdddFdGZndedJdKZodfdNdOZpejqr ZsdPgZtedjuedjvdQZwG dRdS dSe+ZxG dTdU dUexZyG dVdW dWe*ZzG dXdY dYe6Z{G dZd[ d[e)Z|G d\d] d]exZ}G d^d_ d_e*Z~G d`da dae)ZdS )g    )annotationsN)IntEnum)TYPE_CHECKINGAnyDictListOptional)	Parameter)get_tp_group)use_symmetric_memory)envs)is_allocation_symmetric)	MoeRunnerMoeRunnerBackendMoeRunnerConfigget_moe_a2a_backendget_moe_runner_backend)CutlassMoEParamsCutlassMoEType)TritonMoeQuantInfo)/should_use_flashinfer_cutlass_moe_fp4_allgather)ModelWeightParameterPerTensorScaleParameter)FusedMoEMethodBaseLinearMethodBaseQuantizationConfigQuantizeMethodBase)get_fp4_gemm_runner_backend)scaled_fp8_quant)apply_fp8_linearcutlass_fp8_supportedis_blackwell_supported)BaseKVCacheMethod)UnquantizedLinearMethod)convert_to_channelwiseis_layer_skippedper_tensor_dequantizerequantize_with_max_scaleswizzle_blockscale)RadixAttention)get_bool_env_varis_cudais_sm120_supportednext_power_of_2)register_custom_op)register_fake_if_exists)DownGemmOverlapArgsFusedMoE)CombineInputStandardDispatchOutput)WeightsMapper)fp4_quantize)scaled_fp4_quant)mm_fp4)reorder_rows_for_gated_act_gemmshuffle_matrix_sf_aT)cutlass_scaled_fp4_mmF)cutlass_fused_moe)ActivationTypec                   @  s   e Zd ZdZdZdS )r=         N)__name__
__module____qualname__SwigluRelu2 rE   rE   a/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/quantization/modelopt_quant.pyr=   f   s    r=   inputtorch.Tensorweightinput_sf	weight_sfalpha	out_dtypetorch.dtypeout_featuresintreturnc           	      C  s$   | j d }t|}| j||f|dS )Ndtype)shaperP   	new_empty)	rG   rI   rJ   rK   rL   rM   rO   MNrE   rE   rF   _sglang_fp4_gemm_fakeo   s   
	rY   )	fake_implc           	   	   C  s:   t  }tr| }t| ||||||dS t| |||||S )N)backend)r   enable_flashinfer_fp4_gemmget_flashinfer_backendflashinfer_fp4_gemmcutlass_fp4_gemm)	rG   rI   rJ   rK   rL   rM   rO   fp4_backendr[   rE   rE   rF   fp4_gemm}   s   
ra   zsgl_kernel::scaled_fp4_quantc                 C  s   d S NrE   )outputrG   output_scaleinput_global_scalerE   rE   rF   !_sgl_kernel_scaled_fp4_quant_fake   s   rf   %SGLANG_CUTEDSL_MOE_SCALAR_INPUT_SCALEtrue    xmc                 C  s   | | d | | S )z(Round up x to the nearest multiple of m.   rE   )rj   rk   rE   rE   rF   round_up_to_multiple   s   rm   n_alignmentk_alignmenttuple[torch.Tensor, int]c                 C  s   | j d }| j d }d}|dkr|| dkrt||}|| }|d }d}|dkr<|| dkr<t||}	|	| }
|
d }|dksD|dkrRtjj| d|d|f } | |fS )a  
    Pad packed NVFP4 weights to satisfy alignment constraints for FP4 GEMM kernels.

    Different backends have different alignment requirements:
    - CUTLASS/cuDNN: N % 32 == 0, K % 32 == 0
    - TRTLLM: N % 128 == 0 (for shuffle_matrix_sf_a), K padding handled separately

    Args:
        weight: Packed FP4 weight tensor of shape [N, K//2] (2 FP4 values per byte)
        n_alignment: Required alignment for N dimension (default 32, use 128 for TRTLLM)
        k_alignment: Required alignment for K dimension (default 32, use 0 to skip)

    Returns:
        Tuple of (padded_weight, weights_padding_cols) where weights_padding_cols
        is the number of columns added for K-dimension padding (in bytes).
    r   rl      )rU   rm   torchnn
functionalpad
contiguous)rI   rn   ro   weight_current_rowsweight_current_col_bytespad_rows
total_rowsweight_current_col_elementspad_cols_bytes
total_colspad_colsrE   rE   rF   pad_nvfp4_weight   s&   



r   x_fp4weights_padding_colsc                 C  s$   |dkrt jj| d|f S | S )a  
    Pad packed FP4 activations to match the K-dimension padding applied to weights.

    Args:
        x_fp4: Packed FP4 activation tensor
        weights_padding_cols: Number of padding columns (in bytes) from weight padding

    Returns:
        Padded activation tensor
    r   )rr   rs   rt   ru   rv   )r   r   rE   rE   rF    pad_nvfp4_activation_for_cutlass   s   r   outoutput_sizec                 C  s&   | j d |kr| dd|f  S | S )a  
    Slice the output tensor to remove padding in N dimension if weight was padded.

    Args:
        out: Output tensor from FP4 GEMM
        output_size: Original output size before padding

    Returns:
        Sliced output tensor with padding removed
    .N)rU   rv   )r   r   rE   rE   rF   slice_nvfp4_output   s   r   static)silurelu2c                      sT   e Zd Zd! fddZd"ddZed#ddZd#ddZd$ddZd%dd Z	  Z
S )&ModelOptQuantConfigkv_cache_quant_algoOptional[str]exclude_modulesOptional[List[str]]packed_modules_mappingOptional[Dict[str, List[str]]]c                   s$   t    || _|pg | _|| _d S rb   )super__init__r   r   r   )selfr   r   r   	__class__rE   rF   r     s   


zModelOptQuantConfig.__init__layertorch.nn.ModuleprefixstrLineartype[LinearMethodBase]Moetype[FusedMoEMethodBase]rQ   Optional[QuantizeMethodBase]c                C  s   ddl m} ddlm} t||r%t|| j| js| |r!t	 S || S | j
r1t|tr1t| S t||rA| |r=d S || S d S )Nr   )
LinearBaser1   )sglang.srt.layers.linearr   &sglang.srt.layers.moe.fused_moe_tritonr2   
isinstancer%   r   r   is_layer_excludedr#   r   r)   ModelOptFp8KVCacheMethod)r   r   r   r   r   r   r2   rE   rE   rF   _get_quant_method  s"   



z%ModelOptQuantConfig._get_quant_method	List[str]c                 C  s   dgS )Nzhf_quant_config.jsonrE   clsrE   rE   rF   get_config_filenames3  s   z(ModelOptQuantConfig.get_config_filenamesc                 C  s   g S rb   rE   r   rE   rE   rF   get_scaled_act_names7  s   z(ModelOptQuantConfig.get_scaled_act_nameshf_to_sglang_mapper'WeightsMapper'c                 C  s\   | j r,|| j }g }|D ]}|| |dr!||d qtt|| _ d S d S )Nlanguage_model.)r   
apply_listappend
startswithremoveprefixlistdictfromkeys)r   r   mappedexpandednamerE   rE   rF   apply_weight_name_mapper:  s   

z,ModelOptQuantConfig.apply_weight_name_mapperboolc           
      C  s   | j sdS |g}|dr||d h d}| j D ]Q}|dddd}|D ]!}t||r6  dS |d}|D ]}t||rJ   dS q=q*|jdd	d
d }	|	|v rm|D ]}|	|jdd	d
d v rl  dS q[qdS )a  Check if a layer should be excluded from quantization.

        Handles:
        - Exact matches (e.g., "lm_head" matching prefix "lm_head")
        - Glob-style wildcards (e.g., "mtp*" matching "mtp_layers")
        - Part-by-part matching (split prefix on "." and check each part)
        - language_model. prefix stripping for vision-language models
        - Fused module patterns (e.g., "q_a_proj" in "fused_qkv_a_proj_with_mqa")
        Fr   >   q_a_projq_b_proj	kv_b_projkv_a_proj_with_mqa.z\.*z.*Trl   )maxsplitr   )	r   r   r   r   replacere	fullmatchsplitrsplit)
r   r   prefixes_to_checkfused_patternspattern	regex_strpfx	pfx_partspartpattern_tailrE   rE   rF   r   J  s2   




z%ModelOptQuantConfig.is_layer_excluded)r   r   r   r   r   r   )
r   r   r   r   r   r   r   r   rQ   r   )rQ   r   )r   r   )r   r   rQ   r   )r@   rA   rB   r   r   classmethodr   r   r   r   __classcell__rE   rE   r   rF   r     s    


r   c                      sz   e Zd ZdZ				d%d& fddZedd Zed'ddZed(ddZed)ddZ	ed*ddZ
d+d#d$Z  ZS ),ModelOptFp8Configz^Configuration for ModelOpt FP8 quantization, including serialization and compatibility checks.FNis_checkpoint_fp8_serializedr   kv_cache_quant_methodr   r   r   r   r   rQ   Nonec                   s,   t  ||| || _|rtd dS dS )z
        Args:
            is_checkpoint_fp8_serialized (bool): Indicates if the checkpoint uses serialized FP8 format.
        zSDetected ModelOpt FP8 checkpoint. The format is experimental and subject to change.N)r   r   r   loggerwarning)r   r   r   r   r   r   rE   rF   r   ~  s   zModelOptFp8Config.__init__c                 C     |  ||S z9Override quantization method based on the model's config.&_modelopt_override_quantization_methodr   hf_quant_config
user_quantrE   rE   rF   override_quantization_method     z.ModelOptFp8Config.override_quantization_methodr   c                 C     dS )Nmodelopt_fp8rE   r   rE   rE   rF   get_name     zModelOptFp8Config.get_nameList[torch.dtype]c                 C  s   t jt jgS rb   )rr   bfloat16halfr   rE   rE   rF   get_supported_act_dtypes  s   z*ModelOptFp8Config.get_supported_act_dtypesrP   c                 C  r   )NY   rE   r   rE   rE   rF   get_min_capability  r   z$ModelOptFp8Config.get_min_capabilityconfigDict[str, Any]c                 C  s   d }d }| d}|d ur-| d}t|tr'| ddkr'| ddkr'd}| d}n$z| |d	g}| d}| d
}| d}W n tyP   tdw |d u rYtdd|vratd| d||| ddS )N
quant_algokv_cache_schemetypefloatnum_bits   FP8ignorequantizationr   r   Cannot find 'quant_algo' in the model's quantization config. Expected either flat format (config.json) or nested format (hf_quant_config.json).z=Cannot find 'quant_algo' in the model's quantization config. zModelOptFp8Config only supports static FP8 quantization in SGLang. For FP4 quantization, use ModelOptFp4Config. Check the quantization config for your model's configuration.Tr   )r   r   r   r   )getr   r   get_from_keys
ValueError)r   r   r   r   quant_methodr   quantization_sectionrE   rE   rF   from_config  sD   	




zModelOptFp8Config.from_configr   r   r   r   c                 C     | j ||ttdS N)r   r   )r   ModelOptFp8LinearMethodModelOptFp8MoEMethodr   r   r   rE   rE   rF   get_quant_method  s   z"ModelOptFp8Config.get_quant_method)FNNN)
r   r   r   r   r   r   r   r   rQ   r   rQ   r   rQ   r   rQ   rP   )r   r   rQ   r   )r   r   r   r   rQ   r   )r@   rA   rB   __doc__r   r   r   r   r   r   r   r  r   rE   rE   r   rF   r   {  s$    
8r   c                      sD   e Zd ZdZd fddZdddZd ddZ	d!d"ddZ  ZS )#r   a  Linear method for ModelOpt static FP8 quantization.

    Supports loading FP8 checkpoints with static weight and activation scales.
    Future support may include dynamic scales.

    **Limitations**:
    1. Only supports per-tensor quantization due to `torch._scaled_mm` limitations.
    2. Only supports the `float8_e4m3fn` data type.

    Args:
        quant_config (ModelOptFp8Config): The ModelOpt quantization configuration.
    quant_configr   c                   s   t    || _t | _d S rb   )r   r   r  r    r   r  r   rE   rF   r     s   
z ModelOptFp8LinearMethod.__init__r   r   input_size_per_partitionrP   output_partition_sizes	List[int]
input_sizeOptional[int]r   params_dtyperN   rQ   r   c              
   K  s   t |}|d}	| jjrtjn|}
||_||_||_|	dt
tj|||
ddd|	d | jjrOdD ]}|	|ttjt|fttjjtjd|	d q3d	S d	S )
zTCreates and registers weights, weight scales, and input scales for FP8 quantization.weight_loaderrI   rS   rl   r   data	input_dim
output_dimr  )weight_scaleinput_scaler  r  N)sumr   r  r   rr   float8_e4m3fnlogical_widthsr  output_size_per_partitionregister_parameterr   emptyr   fulllenfinfofloat32min)r   r   r  r	  r  r   r  extra_weight_attrsr  r  weight_dtype
scale_namerE   rE   rF   create_weights  sJ   
z&ModelOptFp8LinearMethod.create_weightsc                 C  s`   t |j|j|j\}}t| dd|_| jrt||j}t|dd|_t|j	 dd|_dS )z:Requantizes weights after loading using the maximum scale.Frequires_gradN)
r'   rI   r  r  r	   tr    r$   r  max)r   r   max_w_scalequantized_weightrE   rE   rF   process_weights_after_loading+  s   z5ModelOptFp8LinearMethod.process_weights_after_loadingNrj   rH   biasOptional[torch.Tensor]c                 C  s   t ||j|j|j|| jdS )z"Applies FP8 linear transformation.)rG   rI   r  r  r,  r    )r   rI   r  r  r    )r   r   rj   r,  rE   rE   rF   apply7  s   zModelOptFp8LinearMethod.applyr  r   )r   r   r  rP   r	  r
  r  r  r   r  r  rN   rQ   r   r   r   rQ   r   rb   r   r   rj   rH   r,  r-  rQ   rH   )	r@   rA   rB   r  r   r$  r+  r.  r   rE   rE   r   rF   r     s    

6r   c                      s"   e Zd ZdZd fddZ  ZS )r   z[
    Handles loading FP8 kv-cache scaling factors from modelopt quantized checkpoints.
    r  r   c                   s   t  | d S rb   )r   r   r  r   rE   rF   r   M  s   z!ModelOptFp8KVCacheMethod.__init__r/  )r@   rA   rB   r  r   r   rE   rE   r   rF   r   H  s    r   c                   @  sB   e Zd ZdZdddZdddZd ddZd!ddZd"ddZdS )#r   zMoE method for ModelOpt FP8.
    Supports loading FP8 checkpoints with static weight scale and activation scale.

    Args:
        quant_config: The ModelOpt quantization config.
    r  r   c                 C  s   || _ t | _d S rb   )r  r    r  rE   rE   rF   r   Y  s   zModelOptFp8MoEMethod.__init__r   r   num_expertsrP   hidden_sizeintermediate_size_per_partitionr  rN   c                 K  sp  ddl m} | jjrtjn|}|d}	|jjrdnd}
|
| }t	tj
||||ddd|	d}|d| t	tj
||||ddd|	d}|d	| | jjr||
f}ttj|ttjjtjd|	d
}ttj|fttjjtjd|	d
}|d| |d| |d|jji ttj|fdtjd|	d
}ttj|fdtjd|	d
}|d| |d| d S d S )Nr   FusedMoeWeightScaleSupportedr  rq   rl   rS   r  
w13_weight	w2_weightr  w13_weight_scalew2_weight_scaler   g      ?w13_input_scalew2_input_scale)r   r6  r  r   rr   r  r   moe_runner_configis_gatedr   r  r  r   r  r  r  r   updateTENSORvalue)r   r   r2  r3  r4  r  r!  r6  r"  r  
num_shardsintermediate_sizer7  r8  w13_scale_shaper9  r:  r;  r<  rE   rE   rF   r$  ]  s   	

z#ModelOptFp8MoEMethod.create_weightsrQ   r   c                 C  s  t |jjdd|_t |jjdd|_t|dr|jdur|j dkr|jjddj}|j	j
r1dnd}|jjd | }t|jjd D ]<}d}t|D ]3}t|j| ||| ddf |j| | }t||| \|j| ||| ddf< }	||7 }qKqCt |dd|_n	t |jjdd|_t|d	r|jdurt |jjdd|_t|d
r|jdurt |j dd|_t|dr|jdurt |j dd|_t  rddlm}
 |
|dd dS t  rYt|d
r|jdusJ t|dr|jdusJ t|dr	|jdusJ t|d	r|jdusJ |jtj}|jtj}|jtj}|jtj}t || dd|_t | dd|_t || dd|_t |dd|_dS dS )zProcess FP8 MoE weights after loading from serialized checkpoint.

        Only supports pre-quantized checkpoints with FP8 weights and scales.
        Fr%  r9  Nrq   rl   dimr   r:  r;  r<  )+align_fp8_moe_weights_for_flashinfer_trtllmT)swap_w13_halves)r	   r7  r  r8  hasattrr9  rF  r(  valuesr=  r>  rU   ranger&   r   r:  r;  r<  r   is_flashinfer_trtllm2sglang.srt.layers.moe.moe_runner.flashinfer_trtllmrG  is_flashinfer_cutlasstorr   r  fc1_dequant
reciprocal	fc2_quantfc2_dequantfc1_input_dequant)r   r   max_w13_scalesrB  r4  	expert_idstartshard_id	dq_weight_rG  r  activation_scaler9  r:  rE   rE   rF   r+    s   



z2ModelOptFp8MoEMethod.process_weights_after_loadingr=  r   c                 C  s   || _ ttj|| _d S rb   )r=  r   r   TRITONrunnerr   r   r=  rE   rE   rF   create_moe_runner  s   z&ModelOptFp8MoEMethod.create_moe_runnerdispatch_outputr4   r3   c                 C  sD  |j }|j}ddlm} t  rq||rqddlm}m	} ddl
m} |j}	| jjdks1J d|	jdks:J d|	jrAJ d	|	jrHJ d
||j|j|j|j|j |j|jjd |jd|j|j|j|jdd}
|||
| jS t  r	t| jj }|tj u r| jj!r|tj"u r| jj!sJ d|j#|j$}}t%||j\}}|j&}|jd }d }t't( t)  d t*j+|jd |||j,d}W d    n1 sw   Y  t-|||.t*j/||j|j|||j0|j1|j2|j3g|j4|j|j5|j6t7|jd |dd }ddl8m9} ||dS t:|j|jdd|j;|j<|j|j=d}
| j>?||
S )Nr   )TopKOutputChecker)FlashInferTrtllmFp8MoeQuantInfo+fused_experts_none_to_flashinfer_trtllm_fp8RoutingMethodTyper   z-Only silu is supported for flashinfer fp8 moerl   z"ModelOpt FP8 MoE requires top_k==1z1ModelOpt FP8 MoE does not support expert groupingz/ModelOpt FP8 MoE does not support grouped top-krq   FT)r7  r8  global_num_expertslocal_expert_offsetlocal_num_expertsrC  routing_method_typeblock_quantr;  output1_scales_scalaroutput1_scales_gate_scalaroutput2_scales_scalaruse_routing_scales_on_inputzQOnly Relu2 non-gated or Swiglu gated are supported for flashinfer cutlass fp8 moedisabledrT   device)rc   rG   token_selected_expertstoken_final_scalesfc1_expert_weightsfc2_expert_weightsoutput_dtyperJ   quant_scalesep_sizeep_ranktp_sizetp_ranktune_max_num_tokensactivation_typeStandardCombineInputhidden_states)r7  r8  use_fp8_w8a8per_channel_quant	w13_scalew2_scale	a13_scalea2_scale)@r  topk_outputsglang.srt.layers.moe.topkra  r   rL  format_is_bypassedrM  rb  rc  sglang.srt.layers.moe.utilsre  topk_configr=  
activationtop_knum_expert_group
topk_groupr7  r8  r2  moe_ep_ranknum_local_expertsrU   Llama4r;  rk  rl  rm  rN  ACT_STR_TO_TYPE_MAPr=   rD   r>  rC   topk_weightstopk_idsr   rT   r   r
   r   rr   r  rr  flashinfer_cutlass_fused_moerO  rP   rP  rR  rS  rT  moe_ep_sizemoe_tp_sizemoe_tp_rankr-   &sglang.srt.layers.moe.token_dispatcherr  r   r9  r:  r<  r]  run)r   r   r`  rj   r  ra  rb  rc  re  r  
quant_infor  r  r  x_fp8rZ  rw  original_colx_sfsymm_outputrc   r  rE   rE   rF   r.    s   







zModelOptFp8MoEMethod.applyNr/  
r   r   r2  rP   r3  rP   r4  rP   r  rN   r0  r   r   r=  r   )r   r   r`  r4   rQ   r3   )	r@   rA   rB   r  r   r$  r+  r_  r.  rE   rE   rE   rF   r   Q  s    


U
er   c                      s   e Zd ZdZ					d(d) fddZedd Zed*ddZed+ddZed,ddZ	e
d-ddZed.d!d"Zd/d&d'Z  ZS )0ModelOptFp4ConfigzConfig class for FP4.FNis_checkpoint_nvfp4_serializedr   r   r   
group_sizerP   r   r   r   r   rQ   r   c                   s.   t  ||| || _|rtd || _d S )Nz]Detected nvfp4 checkpoint. Please note that the format is experimental and subject to change.)r   r   r  r   r   r  )r   r  r   r  r   r   r   rE   rF   r     s   
zModelOptFp4Config.__init__c                 C  r   r   r   r   rE   rE   rF   r     r   z.ModelOptFp4Config.override_quantization_methodc                 C  r   )Nmodelopt_fp4rE   r   rE   rE   rF   r     r   zModelOptFp4Config.get_namer   c                 C  s   t jt jt jgS rb   )rr   r   r   r  r   rE   rE   rF   r     s   z*ModelOptFp4Config.get_supported_act_dtypesc                 C  r   )Nd   rE   r   rE   rE   rF   r     r   z$ModelOptFp4Config.get_min_capabilitycfgr   c                 C  s  t  }| d}t|tr|| | d}t|tr+|d}t|tr+|| | dp1i  D ]1}t|tre|d}t|trJ|| | D ]}t|trd|d}t|trd|| qNq4|sltdt|dkr{tdt	| t
t|S )zLReturn the unique group_size across the config; raise if missing/mismatched.r  r   config_groupszNo group_size found in config.rl   z Inconsistent group_size values: )setr   r   rP   addr   rJ  r   r  sortednextiter)r  sizesvqgsubrE   rE   rF   common_group_size  s2   















z#ModelOptFp4Config.common_group_sizer   r   c              	   C  s  d }d }g }| d}|d urw| d}t|tr-| ddkr*| ddkr*d}n!d}nt|trI|  }|d	v r?d}n|d
v rFd}nd}nd}| d}|d u rp| di }|rptt| i }	|	 di }
|
 d}| dg }n/z!| 	|dg}|d }| d}|sd}t
|}| dg }W n ttfy   tdw |dvrtdd|v }|d u s|d u rtd| d| d|  td| ||||| dS )Nr   r   r   r   r   r   r   auto)r   FLOAT8)FP4FLOAT4NVFP4r  r  r  weightsr   r   r   r   r   )r   r  zModelOpt currently only supports: FP8, NVFP4 quantizations in sglang. Please check the quantization config for your model's configuration.zgroup_size: z,kv_cache_quant_algo: z,exclude_modules: z_NVFP4 quantization requires group_size and exclude_modules specified in the quantization configr   )r   r   r   r   stripupperr  r  rJ  r   r  r  r   KeyErrorr   r   )r   r   r   r  r   r   r   scheme_namer  first_groupweights_configr  r  rE   rE   rF   r     s~   	







zModelOptFp4Config.from_configr   r   r   c                 C  r   r   )r   ModelOptFp4LinearMethodModelOptNvFp4FusedMoEMethodr   rE   rE   rF   r  7  s   z"ModelOptFp4Config.get_quant_method)FNNNN)r  r   r   r   r  rP   r   r   r   r   rQ   r   r  r  r  )r  r   rQ   rP   )r   r   rQ   r  )r   r   r   r   )r@   rA   rB   r  r   r   r   r   r   r   staticmethodr  r   r  r   rE   rE   r   rF   r    s*    
 Zr  c                   @  s<   e Zd ZdZdddZdddZdddZ	d d!ddZdS )"r  a8  Linear method for NVFP4.
    Supports loading NVFP4 checkpoints with the following structure:

    |Tensor Name           | datatype      |  shape      |
    |----------------------------------------------------|
    |input_scale           | torch.float32 | scalar      |
    |weight                | NVFP4(SE2M1)  | [1, X, y/2] |
    |weight_scale          | FP8-E4M3      | [X, Y]      |
    |weight_scale_2        | torch.float32 | scalar      |

    The weights are quantized per block of 16 elements.
    Args: quant_config: The ModelOpt quantization config.
    r  r  c                 C  s
   || _ d S rb   )r  r  rE   rE   rF   r   O  s   
z ModelOptFp4LinearMethod.__init__r   r   r  rP   r	  r
  r  r   r  rN   c                 K  s  ~~| j js
tdt|}|d}	||_||_||_|d dkr&td| j jr-tj	n|}
t
tj||d tjddd|	d	}|d
| ttjt|tjd|	d}|d| ttjt|tjd|	d}|d| t
tj||| j j |
ddd|	d	}|d| d S )NHNVFP4 quantization was selected,  dynamic quantization is not supported.r     r   z=Unsupported model when in features size is not multiple of 16rq   rS   rl   r  rI   r  r  weight_scale_2r  )r  r  r   r  r   r  r  r  rr   r  r   r  uint8r  r   r  r  r  )r   r   r  r	  r  r   r  r!  r  r  r"  rI   r  r  r  rE   rE   rF   r$  R  sd   


z&ModelOptFp4LinearMethod.create_weightsrQ   r   c                 C  s  |j  tj}|j tj}t|dd|_ t|dd|_t|j |j dd|_td| tjdd|_|j	j
d |_t  rddlm}m} t|j	jddd\}}|j}|j
d |j
d krz|j
d |j
d  }	tjj|ddd|	f}|j
d }
d}|
d dkrt|
d}||
 }tjj|d|ddf}|d	 }tjj|d|ddf}|}d}|j
}||tj|}||tj||tj}t|dd|_t|dd|_	||_d S t|j	j\}}||_t|dd|_	|j}|j}|d
kr|d}|jdks	J |j
\}}}t|d}t|d}tj |||f|j!d}||d |d |d |f< |j
\}}}|d dks@J |d dksIJ |||d dd|d d}|"d}|# $ }|d
krm|||n||||}t|dd|_d S )NFr%  rl   r   )shuffle_matrix_ar:      )rn   ro      r   rq   r>   rS   ri   )r   rl   r  r>   rq      )%r  r(  rO  rr   r  r  r	   rL   input_scale_invrI   rU   r  r   rL  
flashinferr  r:   r   r  r  rs   rt   ru   rm   viewr  reshaper  weight_scale_interleavedr   ndim	unsqueezezerosrT   permuterv   cuda)r   r   input_scale_2r  r  r:   rI   rZ  scalepad_nscale_kr   padded_scale_kpad_scale_kpad_weight_kepilogue_tile_mshuffled_scale_shapescales
scale_ndimBrW   KM_paddedK_paddedpadded_scalesbatchesrowscolsrE   rE   rF   r+    s   











z5ModelOptFp4LinearMethod.process_weights_after_loadingNrj   rH   r,  r-  c                 C  s   |j }|j\}}|j}|jj\}}||g}	t||j\}
}|
j tjks%J |jj tjks.J |jj tj	ks7J |j
j tjks@J t|dd}t|
|}
|j}|j}tr[|jj}|jj}t|
||||j
||}t||}|d urs|| }|j|	 S )Nr   r   )rT   rU   r  rI   r6   r  rr   r  r  r  rL   r  getattrr   r\   Tra   r   r  )r   r   rj   r,  rw  x_mrZ  r   w_noutput_shaper   x_scale_interleavedr   ww_scale_interleavedr   rE   rE   rF   r.    s<   



zModelOptFp4LinearMethod.applyr  r  )r   r   r  rP   r	  r
  r  rP   r   rP   r  rN   r0  rb   r1  )r@   rA   rB   r  r   r$  r+  r.  rE   rE   rE   rF   r  @  s    


Jcr  c                   @  sv   e Zd ZdZd,ddZed-dd	Zed-d
dZd.ddZd/ddZ	ed-ddZ
d0ddZd1d#d$Zd2d)d*Zd+S )3r  z
       MoE Method for FP4 Quantization with Blockscales and PerTensorScales
    Args:
        quant_config: NVFP4 Quant Config
    r  r  c                 C  s*   || _ t s
tdt  | _i | _d S )NzUCurrent platform does not support NVFP4 quantization. Please use Blackwell and above.)r  r!   r   r   rL  enable_flashinfer_trtllm_moe_cache_permute_indicesr  rE   rE   rF   r   4  s   
z$ModelOptNvFp4FusedMoEMethod.__init__rQ   r   c                 C     ddl m} 	 |  S Nr   )r   )sglang.srt.layers.moer   rN  r   r   rE   rE   rF   enable_flashinfer_cutlass_moeA     
z9ModelOptNvFp4FusedMoEMethod.enable_flashinfer_cutlass_moec                 C  r  r  )r  r   is_flashinfer_cutedslr  rE   rE   rF   enable_flashinfer_cutedsl_moeH  r  z9ModelOptNvFp4FusedMoEMethod.enable_flashinfer_cutedsl_moer   r   r2  rP   r3  r4  r  rN   c                 K  s:  | j jstd||_||_| j |_ tj}tj}|d}	|j	j
r#dnd}
ttj|j|
| |d |ddd|	d}|d| ttj|j||d |ddd|	d}|d| ttj|j|
| || j j |ddd|	d}|d	| tt|jd
d|_ttj|j||| j j |ddd|	d}|d| tt|jd
d|_ddlm} |d|jji |j	j
r|jdfn|jf}ttj|tjd|	d}|d| ttj|jtjd|	d}|d| |d|jji |j|
f}ttj|tjd|	d}d|_|d| ttj|jtjd|	d}d|_|d| d S )Nr  r  rq   rl   rS   r  r7  r8  r9  Fr%  r:  r   r5  r   r  w13_weight_scale_2w2_weight_scale_2Tr;  r<  ) r  r  r   r4  r  rr   r  r  r   r=  r>  r   r  r  r  r  r	   r(   r9  w13_blockscale_swizzledr:  w2_blockscale_swizzledr   r6  r?  BLOCKrA  r   r  r@  r2  _sglang_require_global_experts)r   r   r2  r3  r4  r  r!  r"  weight_scale_dtyper  rB  r7  r8  r9  r:  r6  w13_weight_scale_shaper  r  w13_input_scale_shaper;  r<  rE   rE   rF   r$  O  s   	







z*ModelOptNvFp4FusedMoEMethod.create_weightsr   c              
     s.   j jr't jdddf  jdddf std  jdddf }n jdd }t|dd _| js;| j	rN j
 tj} j tj}nT| jrtrd j
 tj j
jd }n j
jddjtj} j} fdd	}||}||}trt||d ksJ |d }n j
jd
djtj} j}t|| tjdd _t| j tjdd _td| tjdd _td| tjdd _ jdtst r jndi d} j jrdnd}d jfd jffD ]d\}}	t  ! r7 j"jd d |  j#jd d | d}
|	jd
 |
| ks6J d| d|
|  d|	jd
  n|	j| d dkrQt$d|t%|	jt&| j'dd |	j(tj)ks_J | dq| j	rzt*durzt+durzddl,m-} |  dS t. j} ` j/j01|  j"}|2d|2d }|r j jrJ dttj3j45|ddd|fdd _"ttj3j45 j#d|d ddfdd _#ttj3j45 jd|d fdd _tt. jdd _6t j"j0dd _"t. j} ` j6j01|  j"j7}t8t9j:| j; j#jd d  j"jd d d _<dS )zProcess FP4 MoE weights after loading from serialized checkpoint.

        Only supports pre-quantized checkpoints with FP8 weights and scales.
        Nr   rl   zIw1_weight_scale_2 must match w3_weight_scale_2. Accuracy may be affected.Fr%  rE  c                   sH   | j  jfks	J  j j  jksJ |  j j  jd  j  S )Nrl   )rU   r2  r  r  r  )r  r   rE   rF   _slice_scale  s   zOModelOptNvFp4FusedMoEMethod.process_weights_after_loading.<locals>._slice_scaler   re   r  rq   w13w2)r	  r
  z	Expected z_weight_scale.dim(2) == z, got r  zCNVFP4 %s_weight_scale K' not multiple of 4: shape=%s, group_size=%sr  z2 Weight Blockscale must be represented as FP8-E4M3)+align_fp4_moe_weights_for_flashinfer_trtllmz]The intermediate size required padding, but padding is also implemented for gated activations)r2  r4  r3  )=r=  r>  rr   allcloser  r   warning_oncer	   r  r  r;  r(  rO  r  r<  r  CUTEDSL_MOE_SCALAR_INPUT_SCALErepeatrU   rJ  MOE_NVFP4_DISPATCHall	g1_alphasr  	g2_alphasw13_input_scale_quantw2_input_scale_quant
dispatcherset_quant_configr   r9  r:  r   rL  r7  r8  r   tupler  r  rT   r  r9   r:   rM  r  r(   r   r  copy_sizers   rt   ru   r  rr  r   r   BlockscaledFP4r2  cutlass_moe_params)r   r   r  r;  r<  r  
block_size
assert_dimr   r  expected_blocksr  r   r7  intermediate_size_padr  rr  rE   r  rF   r+    s    	





z9ModelOptNvFp4FusedMoEMethod.process_weights_after_loadingc                 C  s   | j o| jjS rb   )r  r=  r>  r   rE   rE   rF   load_up_proj_weight_first  s   z5ModelOptNvFp4FusedMoEMethod.load_up_proj_weight_firstr=  r   c                 C  s&   || _ t  rttj|| _d S d S rb   )r=  r   rL  r   r   FLASHINFER_TRTLLMr]  r^  rE   rE   rF   r_    s   

z-ModelOptNvFp4FusedMoEMethod.create_moe_runnerr2   r`  r4   r3   c                 C  s  |j }|j}|j}| jj}|tv sJ d|dt | j}t|drfddlm	} ddl
m}	 t|d|	j}
||jj|jj|jj|jj|jj|jj|jj|j|j|j|j |j|j|
d}| j||S | jr3dd	lm} |j rwJ d
|j!|j"}}t#j$}|%|r|j&}n7|j'd }|d ur|jj(r|d9 }t)t* t+  d t#j,|j'd |||j-d}W d    n1 sw   Y  t.d#i d|d|d|/t#j0d|d|j12t#j3d|j42t#j3d|d|d|j|j52t#j6|j|j7|j82t#j6|jgd|j9d|jd|j:d|j;dt<|j'd dt| dt= > d }ddlm?} ||d S dd!l@mA} |j!|j"}}|||j|j1|j5|j|j7|j4|j8|j|||jB|j d"/|jC}ddlm?} ||d S )$Nzactivation=z) missing from ACT_STR_TO_TYPE_MAP.keys()=gemm1_weights_fp4_shuffledr   )FlashInferTrtllmFp4MoeQuantInford  ri  )r#  gemm2_weights_fp4_shuffledgemm1_scales_fp4_shuffledgemm2_scales_fp4_shuffled
g1_scale_cr  r  r  rf  rg  rh  r4  ri  )DispatchOutputChecker<apply_router_weight_on_input is not supported for Flashinferrl   rq   ro  rq  rc   rG   rs  rt  ru  rv  rw  rJ   rx  ry  rz  r{  r|  r}  r~  enable_alltoallr  r  )cutlass_moe_fp4)a	a1_gscalew1_fp4w1_blockscale	w1_alphas	a2_gscalew2_fp4w2_blockscale	w2_alphasr  r  paramsapply_router_weight_on_inputrE   )Dr  hidden_states_scaler  r=  r  r  keysrI  rM  r$  r  re  r  Defaultr#  r  r%  r&  r'  r(  r  r  r  r2  r  r  r4  r]  r  r  r  r)  r7  r  r  rr   r   format_is_flashinfer
moe_outputrU   r>  r   r
   r   r  rr  r  rO  rP   r7  r  longr8  r   int32r  r  r  r  r  r-   r   is_flashinferr  !sglang.srt.layers.moe.cutlass_moer,  r  rT   )r   r   r`  rj   r  r  r  r=  r$  re  ri  r  r)  r  r  rw  r  
output_colrc   r  r,  rE   rE   rF   r.    s   









z!ModelOptNvFp4FusedMoEMethod.applyrj   +tuple[torch.Tensor, Optional[torch.Tensor]]masked_mrH   c                 C  s   |j dks	J d| jsJ d|jrJ dddlm} t|dd }|d
|tr*d n|j|j|j	|j
|j|j|j|j|d
|d urKt|j|j|jd	ni }|S )Nr   z"Only SiLU activation is supported.z#only support flashinfer cutedsl moer*  r   )flashinfer_cutedsl_moe_maskeddown_gemm_overlap_args)
r  re   w1r0  w1_alphar
  a2_global_scaler4  w2_alpharC  )down_sm_countdown_signalsdown_start_eventrE   )r  r  r7  ,sglang.srt.layers.moe.flashinfer_cutedsl_moerD  r  r  r  r7  r   r  r8  r  r  r  r   num_smssignalstart_event)r   r   rj   rC  r=  rD  rE  r   rE   rE   rF   apply_without_routing_weights   sB   z9ModelOptNvFp4FusedMoEMethod.apply_without_routing_weightsNr  )rQ   r   r  r0  r  )r   r2   r`  r4   rQ   r3   )
r   r2   rj   rB  rC  rH   r=  r   rQ   rH   )r@   rA   rB   r  r   propertyr  r  r$  r+  r!  r_  r.  rQ  rE   rE   rE   rF   r  -  s"    

 
 <

	 r  )rG   rH   rI   rH   rJ   rH   rK   rH   rL   rH   rM   rN   rO   rP   rQ   rH   )rj   rP   rk   rP   rQ   rP   )rI   rH   rn   rP   ro   rP   rQ   rp   )r   rH   r   rP   rQ   rH   )r   rH   r   rP   rQ   rH   )
__future__r   loggingenumr   typingr   r   r   r   r   regexr   rr   torch.nn.parameterr	   sglang.srt.distributedr
   <sglang.srt.distributed.device_communicators.pynccl_allocatorr   sglang.srt.environr   sglang.srt.layers.dp_attentionr   r  r   r   r   r   r   (sglang.srt.layers.moe.cutlass_moe_paramsr   r   'sglang.srt.layers.moe.moe_runner.tritonr   r  r   sglang.srt.layers.parameterr   r   *sglang.srt.layers.quantization.base_configr   r   r   r   (sglang.srt.layers.quantization.fp4_utilsr   )sglang.srt.layers.quantization.fp8_kernelr   (sglang.srt.layers.quantization.fp8_utilsr   r    r!   'sglang.srt.layers.quantization.kv_cacher"   &sglang.srt.layers.quantization.unquantr#   $sglang.srt.layers.quantization.utilsr$   r%   r&   r'   r(   !sglang.srt.layers.radix_attentionr)   sglang.srt.utils.commonr*   r+   r,   r-   sglang.srt.utils.custom_opr.   sglang.srt.utils.patch_torchr/   -sglang.srt.batch_overlap.single_batch_overlapr0   ,sglang.srt.layers.moe.fused_moe_triton.layerr2   r  r3   r4   sglang.srt.models.utilsr5   r6   r  ImportError
sgl_kernelr7   r8   r^   r9   r:   r\   r;   r_   r  flashinfer.fused_moer<   r  flashinfer.fused_moe.corer=   	getLoggerr@   r   rY   ra   rf   r  FP4_GEMM_ALIGNMENTrm   r   r   r   SGLANG_MOE_NVFP4_DISPATCHr   r  ACTIVATION_SCHEMESrC   rD   r  r   r   r   r   r   r  r  r  rE   rE   rE   rF   <module>   s   





2

pgf	  G + n