o
    پig                  
   @  s  U d dl mZ d dlmZmZmZ d dlZd dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZmZmZ d d	lmZ d d
lmZ d dlmZmZmZ d dlmZ d dlmZ d dlm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* d dl+m,Z, d dl-m.Z. e  oe% Z/e  oe$ Z0e& Z1e! rd dl2m3Z3m4Z4m5Z5 d dl6m7Z7 i Z8de9d< i Z:de9d< 	d=d>ddZ;erd d l<m=Z=m>Z> e# Z?e,d!oe?Z@e" ZAe?rz&d d"lBmCZCmDZD d d#lEmFZF d d$lGmHZHmIZImJZJ d d%lKmLZL d d&lMmNZN W n eOy ZP zeP ZC ZD ZF ZLZNW Y dZP[PndZP[Pww d'd( ZQd?d,d-ZRe.eRd.d?d/d0ZSe.dd1	2d@dAd5d6ZTG d7d8 d8eZUG d9d: d:eZVG d;d< d<eZWdS )B    )annotations)TYPE_CHECKINGListOptionalN)	Parameter)get_tp_group)use_symmetric_memory)is_allocation_symmetric)	MoeRunnerMoeRunnerBackendMoeRunnerConfig)TritonMoeQuantInfo)get_moe_runner_backend)FusedMoEMethodBaseQuantizationConfigQuantizeMethodBase)is_layer_skipped)get_global_server_args)is_cudais_flashinfer_availableis_gfx95_supportedis_hipis_sm90_supportedis_sm100_supportedis_triton_kernels_availablemxfp_supportednext_power_of_2round_upset_weight_attrs)get_bool_env_var)register_custom_op)mxfp8_quantizenvfp4_block_scale_interleavetrtllm_fp4_block_scale_moe)!get_w2_permute_indices_with_cachezdict[torch.Size, torch.Tensor]'_flashinfer_mxfp4_permute_indices_cachez>dict[tuple[tuple[int, ...], int, int, str, int], torch.Tensor]._flashinfer_mxfp4_permute_indices_device_cachextorch.Tensorepilogue_tile_mintnum_elts_per_sfOptional[int]returnc           	      C  s   |d u ri nd|i}t t| |fi |}| jjd u rdn| jj}|d u r&dn|}t| j||| jj|f}t|}|d u rG|	| j}|t|< |S )Nr+   )
r$   r%   deviceindextupleshapetyper&   getto)	r'   r)   r+   
extra_argspermute_indicesdevice_indexnum_elts_per_sf_key	cache_keycached_device_indices r<   X/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/quantization/mxfp4.py,_get_flashinfer_mxfp4_device_permute_indicesJ   s0   r>   )CombineInputStandardDispatchOutputSGLANG_USE_AITER)ActivationType	QuantType)	fused_moe)shuffle_scale_a16w4shuffle_weightshuffle_weight_a16w4)dynamic_mxfp4_quant)e8m0_shufflec                 C  s   ddl m  m} ddlm} ddlm}m}m} ddl	m
} |jdd\}	}
|jd|d\}}tr=d	dd
}|| ntrHddi}|| | dd} |dd}||| |d|	fi |
} ||||fi |}| | |fS )z7weight swizzle for mxfp4 moe, used for OAI mxfp4 kernelr   N)
InFlexData)FP4convert_layoutwrap_torch_tensor)layout   )mx_axis)rP   	num_warpsT)is_persistentepilogue_subtilesplit_kr.   dtype)+triton_kernels.matmul_ogs_details.opt_flagsmatmul_ogs_details	opt_flagstriton_kernels.numericsrJ   triton_kernels.tensorrK   rL   rM   triton_kernels.tensor_detailsrN   "make_default_matmul_mxfp4_w_layout(make_default_matmul_mxfp4_w_scale_layout_is_sm100_supportedupdate_opt_flags_constraints_is_sm90_supported	transpose)quant_tensorscalerQ   rZ   rJ   rK   rL   rM   rN   value_layoutvalue_layout_optsscale_layoutscale_layout_optsconstraintsr<   r<   r=   _swizzle_mxfp4   s8   


rk   re   float_dtypetorch.dtypec                 C  s0   t jg | jd d | jd d R || jdS )Nr.      rW   r/   )torchemptyr2   r/   )r'   re   rl   r<   r<   r=   _dequant_mxfp4_fake   s   &rr   )	fake_implc              
   C  sB   zddl m} W n ty } ztd|d }~ww || ||S Nr   )mxziThe package `amd-quark` is required to use MX-FP4 models. Please install it with `pip install amd-quark`.)quark.torch.kernelru   ImportErrordq_mxfp4)r'   re   rl   ru   errr<   r<   r=   dequant_mxfp4   s   rz   )	out_shapeevenscale_calculation_modestrc              
   C  s@   zddl m} W n ty } ztd|d }~ww || |S rt   )rv   ru   rw   	qdq_mxfp4)r'   r}   ru   ry   r<   r<   r=   quant_dequant_mxfp4   s   r   c                      s   e Zd Z		d#d$ fddZed	d
 Zed%ddZed&ddZed'ddZed(ddZ	dd Z
d)ddZd*d!d"Z  ZS )+Mxfp4ConfigNFignored_layersOptional[list[str]]is_checkpoint_mxfp4_serializedboolc                   s   t    || _|| _d S N)super__init__r   r   )selfr   r   	__class__r<   r=   r      s   

zMxfp4Config.__init__c                 C  sR   |  |dg}d|v }tr$t r| |dS tjdj}td| d| |dS )Nquant_methodmxfp4r   r   zCurrent platform z not support mxfp4 computation)get_from_keys_is_hipr   rp   cudaget_device_propertiesgcnArchName
ValueError)clsconfigr   r   platformr<   r<   r=   from_config   s   

zMxfp4Config.from_configr-   r*   c                 C     dS )NP   r<   r   r<   r<   r=   get_min_capability      zMxfp4Config.get_min_capabilityr~   c                 C  r   )Nr   r<   r   r<   r<   r=   get_name   r   zMxfp4Config.get_namelist[torch.dtype]c                 C  s   t jt jgS r   )rp   bfloat16float16r   r<   r<   r=   get_supported_act_dtypes   s   z$Mxfp4Config.get_supported_act_dtypes	list[str]c                 C     g S r   r<   r   r<   r<   r=   get_config_filenames   r   z Mxfp4Config.get_config_filenamesc                 C  s   | j S r   r   r   r<   r<   r=   is_static_cfg  s   zMxfp4Config.is_static_cfglayertorch.nn.ModuleprefixOptional['QuantizeMethodBase']c                 C  s   ddl m} ddlm} ddlm} t||r-| jr&t|| j| j	dr&| S t
r+| S d S t||r=| jr:t|dS t S | jrDtdd S )Nr   )
LinearBase)FusedMoE)UnquantizedLinearMethod)r   r   fused_mapping)r   z(Mxfp4 attention layer is not implemented)sglang.srt.layers.linearr   &sglang.srt.layers.moe.fused_moe_tritonr   &sglang.srt.layers.quantization.unquantr   
isinstancer   r   packed_modules_mappingr   r   Mxfp4MoEMethodMxfp4DynamicQuantMoEMethodNotImplementedError)r   r   r   r   r   r   r<   r<   r=   get_quant_method  s(   
	

zMxfp4Config.get_quant_method	List[str]c                 C  r   r   r<   r   r<   r<   r=   get_scaled_act_names  s   z Mxfp4Config.get_scaled_act_namesNF)r   r   r   r   )r-   r*   )r-   r~   )r-   r   )r-   r   )r   r   r   r~   r-   r   )r-   r   )__name__
__module____qualname__r   classmethodr   r   r   r   r   r   r   r   __classcell__r<   r<   r   r=   r      s"    	

r   c                      sH   e Zd Zd fddZ	dd ddZdd Zd!ddZd"ddZ  ZS )#r   r   r~   c                   sB   t    || _d | _t  | _d| _t  | _	t
 j| _d S r   )r   r   r   topk_indices_dtyper   is_triton_kernelsuse_triton_kernels	with_biasis_flashinfer_mxfp4use_flashinferr   flashinfer_mxfp4_moe_precision)r   r   r   r<   r=   r   %  s   
zMxfp4MoEMethod.__init__Fr   r   num_expertsr*   hidden_sizeintermediate_size_per_partitionparams_dtyperm   r   r   c                 K  s  || _ tj}tj}	|| _d}
|}tr&| jr t|d}t|d}n&t|d}n tr?t|d}t|d}||j | _	||j
 | _ntrFt||
}|| _
|| _tjjtj|jd| |d |ddd}|d| t|| tjjtj|jd| ||
 |	ddd}|d	| t|| tjjtj|jd| tjddd}|d
| t|| tjjtj|j||d |ddd}|d| t|| tjjtj|j|||
 |	ddd}|d| t|| tjjtj|j|tjddd}|d| t|| d S )N       @   rn   rV   Frequires_grad
w13_weightw13_weight_scalew13_weight_bias	w2_weightw2_weight_scalew2_weight_bias)r   rp   uint8r   r`   r   r   
_use_aiterr   
hidden_padr   intermediate_padhas_triton_kernelsnnr   zerosnum_local_expertsregister_parameterr   r   )r   r   r   r   r   r   r   extra_weight_attrsweight_dtypescale_dtypemxfp4_block)intermediate_size_per_partition_after_padr   r   r   r   r   r   r<   r<   r=   create_weights4  s   

	
	

	
	
zMxfp4MoEMethod.create_weightsc           '      C  s  | j rjttjdg| j tjd dd|_ttjdg| j tjd dd|_ttjdg| j tjd dd|_	d}|j
 dkre|j
jd	 | jkre|j
jd
 | jd kre|j
jd | jd ksgJ |j dkr|jjd	 | jkr|jjd
 | jd kr|jjd | j| ksJ |j dkr|jjd	 | jkr|jjd
 | jkr|jjd | jd ksJ |j dkr|jjd
 | jkr|jjd | j| ksJ |j dkr|jjd	 | jkr|jjd
 | jd ksJ |j dkr|jjd	 | jkr|jjd
 | jksJ |jj}|jj}|j
j}|jj}|jjtj}|jjtj}ddd}	|	|d}|	|d}|	|d}g }
g }g }g }g }g }d}t|d	 tj|}t|d	 tj|dd}t|d	 dd
|}t|d	 tj|}t|d	 tj|dd}t|d	 dd
|}t| jD ]a}|
|| tj|   |t|| tj|   ||| dd
|   ||| tj|   |t|| tj|   ||| dd
|   qt|
}t|| jd| j | j| tj}t|}t|| j| j| j| tj}t|dd|_
t|dd|_t|dd|_t|dd|_tt|| jddd|_tt|| jddd|_d S t r2|jd ur}|jjtj|j_|jd ur|jjtj|j_|j
j\}}}|j
tj!|j
jtj||d d|"d	dd
d ||| |jj||d dd"d	dd
d ||d|j_t#|j
dd|j
_t$|jd|jjd | jd}t#|jdd|j_t$|jd|jjd | jd}|jjd|d d"d	dd
 d||j_tj%j|dd|_tj%j|dd|_d S | j&rd	dl'm(}m)} |jtj}|jtj} t|dd|_t| dd|_d}!t*|j
|j|!\}}"}#t*|j|j|!\}}$}%||#||"dd| _+||%||$dd| _,|| _-|| _.|`
|`n4d	dl/m0}& |&|j
|jtj1dd}|&|j|jtj1dd}|`
|`|`|`t|jdd|_
t|jdd|_tj2  d S )NgZd;?rV   Fr   g      ?g      @r      r   rO   rn   r.   c                 S  sl   | j }|dk rt|| }t|}|| d ||< ||d d | j| } | |d } t|}| j| S )Nr   rn   rO   )r2   lenlistinsertreshapeflip)r'   axisr2   	new_shaper<   r<   r=   swap_every_two_rows  s   

zIMxfp4MoEMethod.process_weights_after_loading.<locals>.swap_every_two_rowsrU         )r+   T)FlexCtxPrecisionConfig   )rhs_data)weight_scaleflex_ctx)upcast_from_mxfp)target_dtyper   )r.   )3r   r   rp   tensorr   float32r   gemm1_alpha
gemm1_betagemm1_clamp_limitr   dimr2   r   r   r   r   r   r   r   datar5   r>   viewr   r   rangeappend
contiguousr"   stackfloat8_e4m3fnr   copy_permuterG   rE   r   r   triton_kernels.matmul_ogsr   r   rk   w13_precision_configw2_precision_configw13_weight_triton_tensorw2_weight_triton_tensor$triton_kernels.numerics_details.mxfpr   r   empty_cache)'r   r   sf_block_sizer   r   r   r   w13_biasw2_biasr   gemm1_weights_mxfp4_shuffledgemm1_scales_mxfp4_shuffledgemm2_weights_mxfp4_shuffledgemm2_scales_mxfp4_shuffledgemm1_bias_shuffledgemm2_bias_shuffledr)   w13_weight_permute_indicesw13_scale_permute_indicesw13_bias_permute_indicesw2_weight_permute_indicesw2_scale_permute_indicesw2_bias_permute_indicesienkshuffled_w13_scaleshuffled_w2_scaler   r   r   r   rQ   w13_flex	w13_scalew2_flexw2_scaler   r<   r<   r=   process_weights_after_loading  s  

























z,Mxfp4MoEMethod.process_weights_after_loadingmoe_runner_configr   c                 C  s(   || _ | jr	tjntj}t||| _d S r   )r)  r   r   TRITON_KERNELSTRITONr
   runner)r   r   r)  backendr<   r<   r=   create_moe_runner  s   z Mxfp4MoEMethod.create_moe_runnerdispatch_outputr@   r-   r?   c                 C  sj  ddl m} ddlm} |j}|j}| jr| jdkrA|jt	j
ks"J |}d }|jd }	| j|	kr@t	jjj|d| j|	 fddd}n&| jd	krdt|d
| jd\}}|t	jjg |jd d dR  }nt |jd | jksqJ ||sxJ |jj}
|j}tt t  d* |jd }|jt	jkr|jd d n|jd }t	j||t	j
|jd}W d    n1 sw   Y  t| t	j
d |||j!|j"|j#|j$|j%|j&|j'|j(|j)d d d |j*|
d d | j+|j,|j- |j-d ddt.|jd |dd }||dS t/rT|\}}}t0t	dr|j!t	j1}|j't	j1}n|j!}|j'}| j| j2 }t	jjj|d| j2fddd}t3||||||j4t5j6t7j8|j"|j(| j9j:| j2| j;|j#|j)d}||dS | j<j=}|> rddl?m@} |jAdksmJ d|| jBd urw| jBn|j!| jCd ur| jCn|j'tD|dd tD|dd tD| dd tD| dd d}ntE|j!|j'tD|dd tD|dd d}| j<F||S )Nr   StandardCombineInput)TopKOutputCheckerbf16r.   constantg        )modevaluedefaultF)	alignment)disabledrn   ro   rO   T)tune_max_num_tokensoutputhidden_statesfloat4_e2m1fn_x2)
expert_mask
activation
quant_typew1_scaler'  doweight_stage1r   r   bias1bias2)TritonKernelsQuantInfoz:Expert parallel is not supported when using triton kernelsr   r   r	  r
  )r   r   r  r  r	  r
  )r   r   b13b2)G&sglang.srt.layers.moe.token_dispatcherr1  sglang.srt.layers.moe.topkr2  r=  topk_outputr   r   rW   rp   r   r2   r   r   
functionalpadr!   r   r  r   r   format_is_bypassedtopk_configtop_krouter_logitsr   r   r	   r   rq   r/   r#   r5   r   r   r   r   r   r   r   r   r   r   r   moe_ep_rankr   r   r   hasattrr>  r   rD   expert_mask_gpurB   SwiglurC   per_1x32r)  apply_router_weight_on_inputr   r,  runner_backendr   /sglang.srt.layers.moe.moe_runner.triton_kernelsrF  moe_ep_sizer  r  getattrr   run)r   r   r/  r1  r2  r'   rK  x_quantx_scaleorigin_hidden_states_dimrP  rQ  
num_tokensr   symm_outputtrtllm_gen_outputtopk_weightstopk_ids_r   r   origi_hidden_sizer;  r-  rF  
quant_infor<   r<   r=   apply  s   



(













zMxfp4MoEMethod.apply)r   r~   )F)r   r   r   r*   r   r*   r   r*   r   rm   r   r   r   r   r)  r   r   r   r/  r@   r-   r?   )	r   r   r   r   r   r(  r.  rh  r   r<   r<   r   r=   r   #  s    x  
-r   c                   @  s<   e Zd Zdd	d
Zdd ZdddZdddZdddZdS )r   r   r   r   r*   r   r   r   rm   c                 K  s   ddl m} tjjtj|d| ||ddd}tjjtj||||ddd}	|d| t|| |d|	 t|	| tjjtj|dtj	ddd}
tjjtj|tj	ddd}|d	|
 |d
| |
d|jji d |_d |_d S )Nr   )FusedMoeWeightScaleSupportedrn   rV   Fr   r   r   r   r   r   )r   rk  rp   r   r   rq   r   r   onesr   updateTENSORr6  w13_input_scalew2_input_scale)r   r   r   r   r   r   r   rk  r   r   r   r   r<   r<   r=   r     sH   
	




z)Mxfp4DynamicQuantMoEMethod.create_weightsc                 C  st   |j }| dkrdnd}|r|d }|d|}t|\}}|r2|d d |j d f }||}t|}||fS )Nrn   TFr.   )r2   r   r   rH   rI   )r   ww_shapew_need_reshapew_last_dim_size	mx_scalesw_new_shaper<   r<   r=   mxfp4_quantize  s   
z)Mxfp4DynamicQuantMoEMethod.mxfp4_quantizer-   Nonec                 C  s   |  |jj\}}|  |jj\}}t}|r$t| d}t| d}tjj	|dd|_||j_
tjj	|dd|_tjj	|dd|_||j_
tjj	|dd|_d S )N)r   r   Fr   )rw  r   r   r   _is_shuffle_moe_mxfp4rF   r  rp   r   r   is_shuffledr   r   )r   r   w13w13_mx_scalesw2w2_mx_scalesrz  r<   r<   r=   r(    s   z8Mxfp4DynamicQuantMoEMethod.process_weights_after_loadingr)  r   c                 C  s
   || _ d S r   )r)  )r   r   r)  r<   r<   r=   r.    s   
z,Mxfp4DynamicQuantMoEMethod.create_moe_runnerr/  r@   r?   c                 C  s   ddl m} |j}|j}|\}}}tr|tj}ttdr-|j	
tj}	|j
tj}
n|j	}	|j}
t|j	dr?d|	_d|
_t||	|
||tj|j|j| jjdkrTtjntjd|jd}||d	S )
Nr   r0  r>  rz  TsiluF)rA  rB  r'  r@  rC  r?  r<  )rI  r1  r=  rK  r   r5   rp   r   rS  r   r   r>  r   rz  rD   rC   rV  r   r   r)  r@  rB   SiluGelurT  )r   r   r/  r1  r'   rK  rc  rd  re  r   r   r;  r<   r<   r=   rh    s@   


z Mxfp4DynamicQuantMoEMethod.applyN)
r   r   r   r*   r   r*   r   r*   r   rm   )r   r   r-   rx  ri  rj  )r   r   r   r   rw  r(  r.  rh  r<   r<   r<   r=   r     s    
9

r   r   )r'   r(   r)   r*   r+   r,   r-   r(   )r'   r(   re   r(   rl   rm   r-   r(   )r|   )r'   r(   r}   r~   r-   r(   )X
__future__r   typingr   r   r   rp   torch.nn.parameterr   sglang.srt.distributedr   <sglang.srt.distributed.device_communicators.pynccl_allocatorr   sglang.srt.layers.dp_attentionr	   sglang.srt.layers.moer
   r   r   'sglang.srt.layers.moe.moe_runner.tritonr   sglang.srt.layers.moe.utilsr   *sglang.srt.layers.quantization.base_configr   r   r   $sglang.srt.layers.quantization.utilsr   sglang.srt.server_argsr   sglang.srt.utilsr   r   r   r   r   r   r   r   r   r   r   sglang.srt.utils.commonr   sglang.srt.utils.custom_opr    r`   rb   r   
flashinferr!   r"   r#   flashinfer.fused_moe.corer$   r%   __annotations__r&   r>   rI  r?   r@   r   r   ry  aiterrB   rC   aiter.fused_moerD   aiter.ops.shufflerE   rF   rG   aiter.ops.triton.quantrH   aiter.utility.fp4_utilsrI   rw   ry   rk   rr   rz   r   r   r   r   r<   r<   r<   r=   <module>   st   4
"
"Q    a