o
    
۾in                     @   s  d dl mZ d dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZmZmZ d d	lmZ d d
lmZmZmZmZ d dlmZmZmZ d dlmZmZ d dlm Z  d dl!m"Z"m#Z# d dl$m%Z% d dl&m'Z'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z.m/Z/m0Z0 d dl1m2Z2 d dl3m4Z4 d dl5m6Z6 d dl7m8Z8 d dl9m:Z: d dl;m<Z< d dl=m>Z> e
e?Z@G dd deZAdeAfddZBd eCdeAfd!d"ZDG d#d$ d$e'ZEG d%d& d&eZFG d'd( d(eFZGdS ))    )EnumN)	Parameter)envs)get_current_vllm_config)init_logger)	Attention)FusedMoEFusedMoEConfigFusedMoEMethodBase)modular_kernel)FusedMoEQuantConfigmxfp4_mxfp8_moe_quant_configmxfp4_w4a16_moe_quant_configocp_mx_moe_quant_config)BatchedMarlinExpertsMarlinExpertsfused_marlin_moe)OAITritonExpertsUnfusedOAITritonExperts)TrtLlmGenExperts)
LinearBaseUnquantizedLinearMethod)QuantizationMethods)QuantizationConfigQuantizeMethodBase)get_marlin_input_dtype) prepare_moe_fp4_layer_for_marlin)_can_support_mxfp4_swizzle_mxfp4get_padding_alignment)is_layer_skipped)set_weight_attrs)current_platform)scalar_types)has_flashinfer)has_triton_kernels)round_upc                   @   s(   e Zd ZdZdZdZdZdZdZdZ	dS )	Mxfp4Backendr                     N)
__name__
__module____qualname__NONESM100_FI_MXFP4_MXFP8_TRTLLMSM100_FI_MXFP4_MXFP8_CUTLASSSM100_FI_MXFP4_BF16SM90_FI_MXFP4_BF16MARLINTRITON r8   r8   a/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/layers/quantization/mxfp4.pyr'   ?   s    r'   returnc                  C   s^   t  stjS t odt    kodk n  } tjdu r'| r't	d tj
S t	d tjS )zg
    Not all MXFP4 backends support LoRA. Select backends that are known to
    have LoRA support.
    	   r      r   Fz2[get_mxfp4_backend_with_lora] Using Triton backendz2[get_mxfp4_backend_with_lora] Using Marlin backend)r"   is_cudar'   r1   r%   get_device_capabilityr   VLLM_MXFP4_USE_MARLINlogger	info_oncer7   r6   )triton_kernels_supportedr8   r8   r9   get_mxfp4_backend_with_loraO   s   

rE   with_lora_supportc                 C   sF  | rt  S t rtdrt rtjrtd t	j
S tdr/t r/tjr/td t	jS tdr=t r=tjr=t	jS tdrMt rMtd t	jS tdsWtdr_t s_td t oodt   komdk n  }tjsu|s}td	 t	jS td
 t	jS t rtd t	jS t rt rtd
 t	jS t	jS )NZ   z,Using FlashInfer MXFP4 BF16 backend for SM90d   z6Using FlashInfer MXFP4 MXFP8 CUTLASS backend for SM100zUsing FlashInfer MXFP4 BF16 backend for SM100, For faster performance on SM100, consider setting VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1, though this may impact accuracy.zMXFP4 MoE is enabled on Hopper/Blackwell but FlashInfer is not available. This may result in degraded performance. Please `pip install vllm[flashinfer]` for best results.r;   r=   zUsing Marlin backendzUsing Triton backendzUsing xpu backend on XPU)rE   r"   r?   is_device_capabilityr$   r   "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16rB   rC   r'   r5   is_device_capability_family+VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASSr3   #VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8r2   r4   warning_oncer%   r@   rA   r6   r7   is_xpuis_rocmr1   )rF   rD   r8   r8   r9   get_mxfp4_backendg   sn   






rQ   c                       s   e Zd Zddee dB f fddZedd Zedefdd	Z	ede
fd
dZedeej fddZedee fddZdejjdeddfddZ  ZS )Mxfp4ConfigNignored_layersc                    s   t    || _d S N)super__init__rS   )selfrS   	__class__r8   r9   rV      s   

zMxfp4Config.__init__c                 C   s   |  S rT   r8   )clsconfigr8   r8   r9   from_config   s   zMxfp4Config.from_configr:   c                 C      dS )NP   r8   rZ   r8   r8   r9   get_min_capability      zMxfp4Config.get_min_capabilityc                 C   r]   )Nmxfp4r8   r_   r8   r8   r9   get_name   ra   zMxfp4Config.get_namec                 C   s   t jgS rT   )torchbfloat16r_   r8   r8   r9   get_supported_act_dtypes   s   z$Mxfp4Config.get_supported_act_dtypesc                 C   s   g S rT   r8   r_   r8   r8   r9   get_config_filenames   ra   z Mxfp4Config.get_config_filenameslayerprefixzQuantizeMethodBase | Nonec                 C   s   t |tr| jrt|| j| jdrt S tjddd t S t |tr8t	
 r,t|jS t|j}t||_|S t |trDtjddd d S )N)ri   rS   fused_mappingzPMXFP4 linear layer is not implemented - falling back to UnquantizedLinearMethod.local)scopezOMXFP4 attention layer is not implemented. Skipping quantization for this layer.)
isinstancer   rS   r    packed_modules_mappingr   rB   
debug_oncer   r"   rO   XpuMxfp4MoEMethod
moe_configMxfp4MoEMethodr   marlin_input_dtyper   )rW   rh   ri   quant_methodr8   r8   r9   get_quant_method   s0   





zMxfp4Config.get_quant_methodrT   )r.   r/   r0   liststrrV   classmethodr\   intr`   r   rc   rd   dtyperf   rg   nnModuleru   __classcell__r8   r8   rX   r9   rR      s&    
rR   c                       s   e Zd Zdef fddZdejjdedededej	f
d	d
Z
dd ZdejjdedB fddZdejdejjdejfddZedefddZdedejdejdejdejeejejf B f
ddZdedejdejdejeejejf B fddZ  ZS )rr   moec                    sR   t  | t|j| _d | _t jj| _	| jt
jks$J d|j di | _d S )Nz$get_mxfp4_backend(with_lora_support=zn) foundno compatible MXFP4 MoE backend (FlashInfer/Marlin/Triton).Please check your environment and try again.)rU   rV   rQ   is_lora_enabledmxfp4_backendrs   r   compilation_configmax_cudagraph_capture_sizemax_capture_sizer'   r1   _cache_permute_indices)rW   r~   rX   r8   r9   rV      s   
zMxfp4MoEMethod.__init__rh   num_expertshidden_sizeintermediate_size_per_partitionparams_dtypec                 K   sV  || _ tj}tj}d}	|}
| jtjkr4t|d}
t r"t|d}nt|d}||_	||_ ||_
|
|_nE| jtjks@| jtjkrKt|d}
t|d}n.| jtjksW| jtjkrbt|d}
t|d}nt rtt }t||}
t||}nt|d}
|
| _|| _
tjjtj|d|
 |d |ddd}|d	| t|| tjjtj|d|
 ||	 |ddd}|d
| t|| tjjtj|d|
 tjddd}|d| t|| tjjtj|||
d |ddd}|d| t|| tjjtj|||
|	 |ddd}|d| t|| tjjtj||tjddd}|d| t|| d S )N          @   r)   rz   Frequires_grad
w13_weightw13_weight_scalew13_bias	w2_weightw2_weight_scalew2_bias)r   rd   uint8r   r'   r6   r&   r"   rO   r   r   r   r2   r4   r3   r5   rP   r   intermediate_sizer{   r   zerosregister_parameterr!   re   )rW   rh   r   r   r   r   extra_weight_attrsweight_dtypescale_dtypemxfp4_block)intermediate_size_per_partition_after_pad	pad_alignr   r   r   r   r   r   r8   r8   r9   create_weights   s   	
	
	
	

	
	
zMxfp4MoEMethod.create_weightsc           >      C   s  | j tjkrt|| jd d S | j tjks| j tjkrddlm} ddl	m
} ttjdg| j tjd dd|_ttjd	g| j tjd dd|_ttjd
g| j tjd dd|_d}|j dkr|jjd | jkr|jjd | jd kr|jjd | jd ksJ |j dkr|jjd | jkr|jjd | jd kr|jjd | j| ksJ |j dkr|jjd | jkr|jjd | jkr|jjd | jd ksJ |j dkr|jjd | jkr|jjd | j| ksJ |j dkr|jjd | jkr|jjd | jd ksJ |j dkr2|jjd | jkr2|jjd | jks4J |jj}|jj}|jj}|jj}|jjtj}	|jjtj}
d#dd}||d}||d}||	d}	g }g }g }g }g }g }d}t | jD ]}|| j!|| "tj#|}|$|| "tj#||j% &  || j!|| "tj#|dd}|$||| "tj#||j% &  || j!|	| ' (dd|}|$|	| ' (dd||	j% &  || j!|| "tj#|}|$|| "tj#||j% &  || j!|| "tj#|dd}|$||| "tj#||j% &  || j!|
| ' (dd|}|$|
| ' (dd||
j% &  q{t)|}t)|(| jd| j | j| "tj*}t)|}t)|(| j| j| j| "tj*}t|dd|_t|dd|_t|dd|_t|dd|_tt)|(| jddd|_tt)|(| jddd|_d S | j tj+ks| j tj,krDttjdg| j tjd dd|_ttjd	g| j tjd dd|_ttjd
g| j tjd dd|_d}|j dkr0|jjd | jkr0|jjd | jd kr0|jjd | jd ks2J |j dkr\|jjd | jkr\|jjd | jd kr\|jjd | j| ks^J |j dkr|jjd | jkr|jjd | jkr|jjd | jd ksJ |j dkr|jjd | jkr|jjd | j| ksJ |j dkr|jjd | jkr|jjd | jd ksJ |j dkr|jjd | jkr|jjd | jksJ |jj}|d d d d dd d f |d d dd dd d f }}tj-||gdd}tj.|ddd\}}tj-||gdd}|jjtj}|d d d d df |d d dd df }} tj-|| gdd}!tj.|!ddd\}"}#tj-|#|"gddtj/}$|jj}%|%d d d d dd d f |%d d dd dd d f }&}'tj-|&|'gdd}(tj.|(ddd\})}*tj-|*|)gdd}+| j tj+krddl0m1}, |+j}-|,|+"tj#(|-}.|jj}/|/j}-|,|/"tj#(|-}0t|dd|_t|.dd|_t|$dd|_t|0dd|_d S | j tj,krBdd }1|+tj#"tj#}2|1|2}3|jj}|tj#"tj#}4|1|4}5tj2jtj-||gdddd|_tj2j|$dd|_tj2j|3dd|_tj2j|5dd|_d S d S | j tj3krddl4m5}6m6}7 |jtj}	|jtj}
t|	dd|_t|
dd|_| j7j8pw| j7j9}8|8rt:j;dkrdnd}9nd}9t<|j|j|9\}}:};t<|j|j|9\}}<}=|7|;|6|:dd| _=|7|=|6|<dd| _>|| _|| _|`|`||_||_d S t?d | j  d!t@t d")$N)input_dtyper   )nvfp4_block_scale_interleave)!get_w2_permute_indices_with_cachegZd;?r   Fr   g      ?g      @r   r*   r(   r)   c                 S   sl   | j }|dk rt|| }t|}|| d ||< ||d d | j| } | |d } t|}| j| S )Nr   r)   r(   )shapelenrv   insertreshapeflip)xaxisr   	new_shaper8   r8   r9   swap_every_two_rows  s   

zIMxfp4MoEMethod.process_weights_after_loading.<locals>.swap_every_two_rowsr      )num_elts_per_sf)dim)block_scale_interleavec                 S   s\   | j }| |d |d |d d d}|dddd}||d |d d |d d }|S )Nr   r(   r)   r+   r*   )r   r   permute)ww_shapew_interleavedr8   r8   r9   _interleave_mxfp4_cutlass_sm90  s   zTMxfp4MoEMethod.process_weights_after_loading.<locals>._interleave_mxfp4_cutlass_sm90)FlexCtxPrecisionConfigi   r+      )rhs_data)weight_scaleflex_ctxzUnsupported mxfp4_backend: z: should be one of: .)r   )Ar   r'   r6   r   rs   r2   r4   flashinfer.fp4_quantizationr   flashinfer.fused_moe.corer   r   rd   tensorr   float32cudagemm1_alpha
gemm1_betagemm1_clamp_limitr   r   r   r   r   r   r   r   r   r   datatoranger   viewr   appenddevice
contiguouscloner   stackfloat8_e4m3fnr3   r5   catchunkre   
flashinferr   r{   r7   triton_kernels.matmul_ogsr   r   r~   use_pplx_kernelsuse_deepep_ll_kernelsr   VLLM_MOE_DP_CHUNK_SIZEr   w13_precision_configw2_precision_config
ValueErrorrv   )>rW   rh   r   r   sf_block_sizer   r   r   r   r   r   r   gemm1_weights_mxfp4_shuffledgemm1_scales_mxfp4_shuffledgemm2_weights_mxfp4_shuffledgemm2_scales_mxfp4_shuffledgemm1_bias_shuffledgemm2_bias_shuffledepilogue_tile_mipermute_indicespermute_sf_indicespermute_bias_indicesw13_wgate_wup_wdeinterleaved_w13_ww1_ww3_ww13_weight_swappedw13_bgate_bup_bdeinterleaved_w13_bb1b3w13_bias_swappedw13_sgate_sup_sdeinterleaved_w13_ss1s3w13_scale_swappedr   
orig_shapew13_scale_interleavedw2_sw2_scale_interleavedr   
w31_scalesw31_scales_interleaved	w2_scalesw2_scales_interleavedr   r   is_batched_moe	num_warpsw13_flex	w13_scalew2_flexw2_scaler8   r8   r9   process_weights_after_loading  sN  














:.:

 







z,Mxfp4MoEMethod.process_weights_after_loadingr:   Nc                 C   s   | j tjkrt|j|j|j|jdS | j tjkr(| j	}| j
}t|j|j||dS | j tjtjfv r=t|j|j|j|jdS | j tjfv rPt|j|j|j|jdS |j}|j}td|j|j||dS )N)w1_biasr   w1_scaler	  rb   )quant_dtyper  r   r  r	  )r   r'   r6   r   r   r   r   r   r7   r   r   r2   r3   r   r4   r   )rW   rh   r  r	  r8   r8   r9   get_fused_moe_quant_config  sT   z)Mxfp4MoEMethod.get_fused_moe_quant_configprepare_finalizec                 C   s  |j tjjkr3| jtjkr*| }|d usJ | jd usJ t	||
 | j| jdS td| j d| jd us:J | jtjksF| jtjkr\|j|j|j| jd}t| j| jfi |S | jtjkrit| j| jS | jtjkr| jjrzt| j| jS t| j| jS td| j d)N)max_num_tokensnum_dispatchersquant_configrq   zIncompatible Mxfp4 backend (z) for EP batched experts format)r   r   r   r   z) for EP)activation_formatmkFusedMoEActivationFormatBatchedExpertsr   r'   r6   max_num_tokens_per_rankmoe_quant_configr   r  r~   NotImplementedErrorr2   r4   r   r   r   r   r   r   r7   r   r   r   )rW   r  rh   r  kwargsr8   r8   r9   select_gemm_implB  sD   zMxfp4MoEMethod.select_gemm_implc                 C   s$   | j tjkp| j tjkp| j tjkS rT   )r   r'   r2   r4   r7   rW   r8   r8   r9   is_monolithict  s
   

zMxfp4MoEMethod.is_monolithicr   topk_weightstopk_idsc                 C   s`  | j rJ |jrtd| jtjkr8t||j|j|j	|j
|j|j||d d tjj|j|j|j|j| j| jj dS t|j|j|j|j|j|j|j|j|j|jj|jj |jj!sZJ d| jtj"ksh| jtj#kshJ ddl$m%} | jtj"krddl&m'} ||dd\}}t(j)| j*|j+d	}	|j, -t(j.|	|j, -t(j.|	g}
|}t/d||j, -t(j0|j, -t(j0d
}n| jtj#kr|j1t(j2ksJ |j|jg}
|}t/d|j|jd}t(j3|t(j2d}|di d|d|4t(j5, d|dt(j2d|d|
d|j	d|j
d|j6d|j7d|j8d| jj9d| jj:d| jj;d| jj<dt=| j>d| |S )NEPLB is not supported for mxfp4)	global_scale1global_scale2quant_type_idapply_router_weight_on_inputglobal_num_experts
activation
expert_mapr   inplace0MXFP4 are not supported with this configuration.r   )flashinfer_cutlass_fused_moemxfp8_quantizeTr   )r   )use_mxfp8_act_scalinginput_sffc1_expert_weightsfc2_expert_weights)use_w4_group_scalingr/  r0  r   inputtoken_selected_expertstoken_final_scalesoutput_dtypeoutputquant_scalesfc1_expert_biasesfc2_expert_biasesswiglu_alphaswiglu_betaswiglu_limittp_sizetp_rankep_sizeep_ranktune_max_num_tokensr(   r8   )?r  enable_eplbr  r   r'   r6   r   r   r   r   r   r   r   r#   float4_e2m1fidr$  r%  r&  r'  rs   r~   disable_inplacer   use_grouped_topk
topk_groupnum_expert_groupcustom_routing_functione_score_correction_biasscoring_func
eplb_stateexpert_load_viewlogical_to_physical_maplogical_replica_countr3   r5   vllm.utils.flashinferr*  r   r,  rd   onesr   r   r   r   int32dictlongrz   re   
empty_liker   ry   r   r   r   r=  r>  r?  r@  maxr   )rW   rh   r   r  r  r*  r,  x_quantx_scalefake_input_scaler7  fi_inputextra_kwargsr6  r8   r8   r9   apply|  s   
	
zMxfp4MoEMethod.applyrouter_logitsc           
      C   s\  | j sJ |jrtdt|j|j|j|j|j|j	|j
|j|j|jj|jj|jjs.J d| jtjks;| jtjkrddlm} | jtjkrT|jtjksOJ |}d }n&| jtjkrzddlm} ||d\}}|tjjg |jd d dR  }|d(i d|tjd	d d
|d|d|j d|j!d|j"d|j#d|j$d|j%d|j&d|j'd|j(dd dd dd d|j)d|j*dd dd d| j+d|j,|j- d| j.dd d |j/rd!ndd"d#d$t0| j1d!d }|S d"d#d$t0| j1d!d }|S | jtj2kr&dd%l3m4}	 |	||j |j&||j*|j/|j)|j| j5|j
d&
S t6d'| j ))Nr   r)  r   )trtllm_fp4_block_scale_moer+  Fr   routing_logitsrouting_biashidden_stateshidden_states_scalegemm1_weightsgemm1_weights_scale
gemm1_biasr   r   r   gemm2_weightsgemm2_weights_scale
gemm2_biasoutput1_scale_scalaroutput1_scale_gate_scalaroutput2_scale_scalarr   top_kn_grouprG  r   local_expert_offsetlocal_num_expertsrouted_scaling_factorrouting_method_typer(   do_finalizeTrA  )triton_kernel_moe_forward)
ra  w1w2gating_outputtopkrenormalizer%  r'  r  r$  zUnsupported backend: r8   )7r  rB  r  r   rF  rG  rH  r'  rI  rJ  r$  rK  r&  rL  rM  rN  rO  r   r'   r2   r4   r   r^  rz   rd   re   r,  r   r   r   r   r   r   r   r   r   r   r   r   r   r   r%  rl  r   r@  ro  r   rx  rV  r   r7   ?vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moers  r  r   )
rW   rh   r   r]  r^  rW  rX  r,  trtllm_gen_outputrs  r8   r8   r9   apply_monolithic  s   
&	
zMxfp4MoEMethod.apply_monolithic)r.   r/   r0   r	   rV   rd   r{   r|   ry   rz   r   r
  r   r  r  FusedMoEPrepareAndFinalizeFusedMoEPermuteExpertsUnpermuter  propertyboolr  r   Tensortupler\  r{  r}   r8   r8   rX   r9   rr      sh    
    
/
2
rrr   c                
       s   e Zd Zdef fddZdejjdedededej	f
 fd	d
Z
dejjddfddZedefddZdedejdejdejfddZ  ZS )rp   rq   c                    s   t  | || _d S rT   )rU   rV   rq   )rW   rq   rX   r8   r9   rV   L  s   
zXpuMxfp4MoEMethod.__init__rh   r   r   r   r   c                    s&   t  j|||||fi | || _d S rT   )rU   r   original_hidden_size)rW   rh   r   r   r   r   r   rX   r8   r9   r   P  s   	
z XpuMxfp4MoEMethod.create_weightsr:   Nc                 C   s   d S rT   r8   )rW   rh   r8   r8   r9   r
  c  s   z/XpuMxfp4MoEMethod.process_weights_after_loadingc                 C   r]   )NTr8   r  r8   r8   r9   r  f  ra   zXpuMxfp4MoEMethod.is_monolithicr   r]  c           
      C   s  |j dks	J dddlm} | \}}tj||jtj|jd}tj||jtj	|jd}tj||jtj	|jd}	|j
rVtjjj|||j|j|j|j|j|j|jd	\}}ntjj|||	||j|j |||j| jjro|jnd |j|j| jjr{|jnd |j|||j|j |jddS )	N	swigluoaiz9Only swiglu_oai activation is supported for XPU MXFP4 MoEr   )xpu_fused_moe)rz   r   )n_expert_groupn_topk_grouprK  rp  biasT)ra  w13r   
w13_scalesru  r   r  r  r  n_experts_per_tokenr&  r   is_mxfp4)r&  $vllm_xpu_kernels.fused_moe_interfacer  sizerd   emptyrl  r   r   rR  rF  ops_moe_Cfused_grouped_topkrx  rH  rG  rK  rp  rJ  topk_softmaxr   r~   has_biasr   r   r   r   r   ro  )
rW   rh   r   r]  r  M_routing_weightsselected_expertstoken_expert_indicesr8   r8   r9   r{  j  sb   	z"XpuMxfp4MoEMethod.apply_monolithic)r.   r/   r0   r	   rV   rd   r{   r|   ry   rz   r   r
  r~  r  r  r   r  r{  r}   r8   r8   rX   r9   rp   K  s2    rp   )Henumr   rd   torch.nn.parameterr   vllmr   vllm.configr   vllm.loggerr   $vllm.model_executor.layers.attentionr   $vllm.model_executor.layers.fused_moer   r	   r
   r   r  +vllm.model_executor.layers.fused_moe.configr   r   r   r   5vllm.model_executor.layers.fused_moe.fused_marlin_moer   r   r   ry  r   r   /vllm.model_executor.layers.fused_moe.trtllm_moer   !vllm.model_executor.layers.linearr   r   'vllm.model_executor.layers.quantizationr   3vllm.model_executor.layers.quantization.base_configr   r   :vllm.model_executor.layers.quantization.utils.marlin_utilsr   >vllm.model_executor.layers.quantization.utils.marlin_utils_fp4r   9vllm.model_executor.layers.quantization.utils.mxfp4_utilsr   r   r   9vllm.model_executor.layers.quantization.utils.quant_utilsr    vllm.model_executor.utilsr!   vllm.platformsr"   vllm.scalar_typer#   rP  r$   vllm.utils.import_utilsr%   vllm.utils.math_utilsr&   r.   rB   r'   rE   r  rQ   rR   rr   rp   r8   r8   r8   r9   <module>   sN   E=      h