o
    
۾i                     @   s  d dl m Z  d dlmZmZ d dlZd dlmZ d dlm  m	  m
  mZ d dlmZ d dlmZ d dlmZmZ d dlmZmZmZ d d	lmZmZmZmZmZ d d
lmZm Z m!Z!m"Z"m#Z#m$Z$ d dl%m&Z&m'Z'm(Z( d dl)m*Z* d dl+m,Z,m-Z- d dl.m/Z/ d dl0m1Z1 d dl2m3Z3m4Z4 d dl5m6Z6 d dl7m8Z8m9Z9m:Z: d dl;m<Z< d dl=m>Z>m?Z?m@Z@mAZAmBZB d dlCmDZDmEZEmFZF d dlGmHZHmIZImJZJmKZKmLZLmMZMmNZN d dlOmPZPmQZQ d dlRmSZSmTZTmUZUmVZV d dlWmXZX erd dlYmZZZ ee[Z\g dZ]dgZ^G dd de1Z_G dd  d e,Z`G d!d" d"e`ZaG d#d$ d$e'ZbG d%d& d&e'ZcG d'd( d(e'ZdG d)d* d*eZeebea_feeea_ge_ea_hG d+d, d,e`ZiG d-d. d.e'ZjG d/d0 d0eZkejei_fekei_ge_ei_hG d1d2 d2e`ZlG d3d4 d4e'Zmemel_fe_el_hdS )5    )fnmatch)TYPE_CHECKINGAnyN)	Parameter)init_logger)	Attention)FusedMoEConfigFusedMoEQuantConfig)FusedMoEFusedMoEMethodBaseFusedMoeWeightScaleSupported)Fp8MoeBackend convert_to_fp8_moe_kernel_formatmake_fp8_moe_kernelmake_fp8_moe_quant_configselect_fp8_moe_backend)NvFp4MoeBackend"convert_to_nvfp4_moe_kernel_format(is_global_sf_supported_for_nvfp4_backendmake_nvfp4_moe_kernelmake_nvfp4_moe_quant_configselect_nvfp4_moe_backend)
LinearBaseLinearMethodBaseUnquantizedLinearMethod)QuantizationMethods)QuantizationConfigQuantizeMethodBase)init_fp8_linear_kernel)BaseKVCacheMethod)flashinfer_trtllm_fp4_moe flashinfer_trtllm_fp4_routed_moe)"apply_fi_trtllm_fp8_per_tensor_moe)W8A8BlockFp8LinearOp%process_fp8_input_tensor_strategy_moe&process_fp8_weight_tensor_strategy_moe)get_marlin_input_dtype)MXFP8_BLOCK_SIZEMXFP8_SCALE_DTYPEMXFP8_VALUE_DTYPEMxfp8LinearBackendMxfp8LinearOp)apply_nvfp4_linear%convert_to_nvfp4_linear_kernel_formatselect_nvfp4_linear_backend)
GroupShapeis_layer_skippedkFp8DynamicTokenSymkFp8StaticTensorSymkFp8StaticTokenSymkNvfp4DynamickNvfp4Static)cutlass_block_fp8_supportedrequantize_with_max_scale)BlockQuantScaleParameterChannelQuantScaleParameterModelWeightParameterPerTensorScaleParameter)replace_parameter)WeightsMapper)FP8FP8_PER_CHANNEL_PER_TOKEN	FP8_PB_WONVFP4MXFP8r>   c                       s"   e Zd ZdZd fddZ  ZS )ModelOptFp8KVCacheMethodzI
    Supports loading kv-cache scaling factors from FP8 checkpoints.
    quant_configModelOptQuantConfigBasec                    s   t  | d S N)super__init__selfrD   	__class__ d/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/layers/quantization/modelopt.pyrH   |      z!ModelOptFp8KVCacheMethod.__init__)rD   rE   )__name__
__module____qualname____doc__rH   __classcell__rM   rM   rK   rN   rC   w   s    rC   c                       s   e Zd ZU eZeed< eZeed< e	Z
eed< dee f fddZdedefd	d
ZdejjdeddfddZdddZedee fddZedededB dee deeef dedB dd fddZedeeef dd fddZ  ZS ) rE   LinearMethodClsFusedMoEMethodClsKVCacheMethodClsexclude_modulesc                    s   t    || _d S rF   )rG   rH   rX   )rJ   rX   rK   rM   rN   rH      s   

z ModelOptQuantConfigBase.__init__prefixreturnc                 C   s   t | jdkr	dS t|| j| jrdS | jD ]}||kr/||v s,|dr/||dv r/ dS q| jD ]
}t||r= dS q3dS )z
        Check if a layer should be excluded from quantization.

        Handles both exact matching (for fused layers) and ModelOpt wildcard matching.

        The ModelOpt exclude_modules list is a list of wildcards.
        r   FTzlanguage_model.)lenrX   r0   packed_modules_mapping
startswithremoveprefixr   )rJ   rY   exclude_modulewildcard_patternrM   rM   rN   is_layer_excluded   s"   


z)ModelOptQuantConfigBase.is_layer_excludedlayerQuantizeMethodBase | Nonec                 C   s   t |tr
| | S | |rt |trt S d S d|v s!d|v r$t S t |tr=| | }t|dddkr;t||_	|S t |t
rY| j| |jd}t|dddkrWt||_	|S d S )Nvision_towervision_modelbackend marlin)rD   
moe_config)
isinstancer   rW   ra   r   r   rU   getattrr&   marlin_input_dtyper
   rV   ri   )rJ   rb   rY   quant_methodrM   rM   rN   get_quant_method   s*   








z(ModelOptQuantConfigBase.get_quant_methodhf_to_vllm_mapperr=   c                 C   s   t | jdkrCg }| jD ].}t |dkr5|d dkr5|d dkr5||d d  ||d d d  q|| q||| _d S d S )Nr      *.z.*)r[   rX   append
apply_list)rJ   ro   new_exclude_modulesexcluderM   rM   rN   apply_vllm_mapper   s   

$z)ModelOptQuantConfigBase.apply_vllm_mapperc                   C   s   dgS )Nzhf_quant_config.jsonrM   rM   rM   rM   rN   get_config_filenames   s   z,ModelOptQuantConfigBase.get_config_filenamesrm   kv_cache_quant_methodNoriginal_config
group_sizec                C   s   t d)Nz-Please implement this function in sub classes)NotImplementedError)clsrm   r{   rX   r|   r}   rM   rM   rN   _from_config   s   
z$ModelOptQuantConfigBase._from_configconfigc              	   C   s^  d|v r*|  |dg}t|tstd|d}|d}|d}|dg }n|d}|d}|dg }|d}|sEtdt| }|d u rPnt|ts^td	t| | }t|tsptd
t| |d u rwd }n"t|t	r|}nzt	|}W n tt
fy   tdt| d w |tvrtdt d| j|||||dS )Nquantizationz4Expected 'quantization' to be a dictionary in config
quant_algokv_cache_quant_algor}   rX   ignorez+Missing 'quant_algo' in quantization configz*kv_cache_quant_algo must be a string, got z$exclude_modules must be a list, got z#group_size must be an integer, got z"ModelOpt currently only supports: zj quantizations in vLLM. Please check the `hf_quant_config.json` file for your model's quant configuration.)rm   r{   rX   r}   r|   )get_from_keysrj   dict
ValueErrorgetstruppertypelistint	TypeErrorQUANT_ALGOSr   )r   r   rD   rm   r{   group_size_rawrX   r}   rM   rM   rN   from_config   sh   










z#ModelOptQuantConfigBase.from_config)ro   r=   )rP   rQ   rR   r   rU   r   __annotations__r   rV   r   rW   r   r   rH   boolra   torchnnModulern   ry   staticmethodrz   classmethodr   r   r   r   r   rT   rM   rM   rK   rN   rE      sD   
 &

%
$rE   c                       s   e Zd ZdZdedededB dee ddf
 fdd	Zdefd
dZ	dee
j fddZedefddZededB fddZedededB dee deeef dedd fddZ  ZS )ModelOptFp8ConfigzConfig class for ModelOpt FP8.rm   is_checkpoint_fp8_serializedr{   NrX   rZ   c                    s|   t  | || _|| _|| _|rtd| | jdkr!t| _d S | jdkr+t	| _d S | jdkr5t
| _d S td| j d)NzoDetected ModelOpt fp8 checkpoint (quant_algo=%s). Please note that the format is experimental and could change.r>   r?   r@   z.Unsupported ModelOpt FP8 quant_algo for vLLM: z9. Supported: FP8 / FP8_PER_CHANNEL_PER_TOKEN / FP8_PB_WO.)rG   rH   rm   r   r{   loggerwarningModelOptFp8LinearMethodrU   ModelOptFp8PcPtLinearMethodModelOptFp8PbWoLinearMethodr   )rJ   rm   r   r{   rX   rK   rM   rN   rH   N  s(   





zModelOptFp8Config.__init__c                 C      dS )NmodeloptrM   rJ   rM   rM   rN   get_namen     zModelOptFp8Config.get_namec                 C   s   t jt jgS rF   )r   bfloat16halfr   rM   rM   rN   get_supported_act_dtypesq  s   z*ModelOptFp8Config.get_supported_act_dtypesc                 C   r   )NY   rM   r   rM   rM   rN   get_min_capabilityt     z$ModelOptFp8Config.get_min_capabilityc                 C   s   |du rdS | dd }|dkrdS d|v r3|d }t|tr1t| dd}| dkr1dS dS t| dd}| dkrCdS dS )zSDetect if this ModelOpt config should be used based on
        quantization config.Nrm   rg   r   r   r   r>   r   lowerrj   r   r   r   r   hf_quant_cfg
user_quantrm   rD   r   rM   rM   rN   override_quantization_methodx  s    
z.ModelOptFp8Config.override_quantization_methodr|   kwargsc                K   s   d|v }| ||||S )Nr>   rM   )r   rm   r{   rX   r|   r   r   rM   rM   rN   r     s   
zModelOptFp8Config._from_config)rP   rQ   rR   rS   r   r   r   rH   r   r   r   dtyper   r   r   r   r   r   r   r   rT   rM   rM   rK   rN   r   K  sF     
r   c                   @      e Zd ZdZdeddfddZdejjde	d	e
e	 d
e	de	dejfddZdejjddfddZ	ddejjdejdejdB dejfddZdS )r   a  Linear method for Model Optimizer static quantization.
    Supports loading FP8 checkpoints with static weight scale and
    activation scale. Future support might be added for dynamic
    scales.

    Limitations:
    1. Only support per-tensor quantization due to torch._scaled_mm support.
    2. Only support float8_e4m3fn datatype
        Args: quant_config: The ModelOpt quantization config.
    rD   rZ   Nc                 C   s$   || _ tttt | jjd| _d S N)activation_quant_keyweight_quant_key	out_dtypemodule_name)rD   r   r2   r   get_default_dtyperL   rP   
fp8_linearrI   rM   rM   rN   rH        z ModelOptFp8LinearMethod.__init__rb   input_size_per_partitionoutput_partition_sizes
input_sizeoutput_sizeparams_dtypec                 K   s   ~~t |}|d}	||_||_||_| jjrtjn|}
t	tj
|||
ddd|	d}|d| | jjruttj
t|tjd|	d}ttjj|d d < |d| ttj
t|tjd|	d}ttjj|d d < |d	| d S d S )
Nweight_loaderr      r   data	input_dim
output_dimr   weightr   r   weight_scaleinput_scale)sumr   logical_widthsr   output_size_per_partitionrD   r   r   float8_e4m3fnr:   emptyregister_parameterr;   r[   float32finfomin)rJ   rb   r   r   r   r   r   extra_weight_attrsr   r   weight_dtyper   r   scalerM   rM   rN   create_weights  sB   

z&ModelOptFp8LinearMethod.create_weightsc                 C   sr   |j }|j }|j|jd k st|j |j|j\}}t| dd|_ t|dd|_t|j dd|_d S )Nr   Frequires_grad)	r   r   maxallr7   r   r   tr   )rJ   rb   r   max_w_scalerM   rM   rN   process_weights_after_loading  s   
z5ModelOptFp8LinearMethod.process_weights_after_loadingxbiasc                 C      | j |||S rF   r   apply_weightsrJ   rb   r   r   rM   rM   rN   apply     zModelOptFp8LinearMethod.applyrF   rP   rQ   rR   rS   r   rH   r   r   r   r   r   r   r   r   Tensorr   rM   rM   rM   rN   r     s6    	
0r   c                   @   r   )r   a#  Linear method for ModelOpt FP8_PER_CHANNEL_PER_TOKEN checkpoints.

    Expected checkpoint structure (per Linear):
    - weight: fp8-e4m3fn, shape [out, in]
    - weight_scale: fp32, shape [out] (per-output-channel)
    - no input_scale (activations are dynamically quantized per-token)
    rD   rZ   Nc                 C   s$   || _ tttt | jjd| _d S r   )	rD   r   r1   r3   r   r   rL   rP   r   rI   rM   rM   rN   rH     r   z$ModelOptFp8PcPtLinearMethod.__init__rb   r   r   r   r   r   c                 K   s   ~~| j js
tdt|}|d}	||_||_||_tt	j
||t	jddd|	d}
|d|
 tt	j
|t	jdd|	d}t	t	jj|d d < |d	| d S )
NzMFP8_PER_CHANNEL_PER_TOKEN currently only supports FP8-serialized checkpoints.r   r   r   r   r   r   )r   r   r   r   )rD   r   r   r   r   r   r   r   r:   r   r   r   r   r9   r   r   r   rJ   rb   r   r   r   r   r   r   r   r   r   r   rM   rM   rN   r     s8   


z*ModelOptFp8PcPtLinearMethod.create_weightsc                 C   s*   t |j dd|_t |jjdd|_d S )NFr   )r   r   r   r   r   rJ   rb   rM   rM   rN   r   C  s   z9ModelOptFp8PcPtLinearMethod.process_weights_after_loadingr   r   c                 C   r   rF   r   r   rM   rM   rN   r   G  r   z!ModelOptFp8PcPtLinearMethod.applyrF   r   rM   rM   rM   rN   r     s6    	
,r   c                   @   s   e Zd ZU dZdZeeef ed< deddfddZ	d	e
jjd
edee dedede
jfddZd	e
jjddfddZ	dd	e
jjde
jde
jdB de
jfddZdS )r   a  Linear method for ModelOpt FP8_PB_WO checkpoints.

    ModelOpt exports `weight_scale` as a 4D tensor:
      [out_blk, 1, in_blk, 1]
    where block size is typically 128 for both dims.

    vLLM executes it as FP8 GEMM with *dynamic per-token* activation quant.
    )   r   _WEIGHT_BLOCK_SIZErD   rZ   Nc                 C   s@   || _ | j\}}t| j| _tt||td|t dd| _d S )Nr   F)weight_group_shapeact_quant_group_shaper6   use_aiter_and_is_supported)rD   r   r   weight_block_sizer#   r/   r6   w8a8_block_fp8_linear)rJ   rD   block_nblock_krM   rM   rN   rH   \  s   
z$ModelOptFp8PbWoLinearMethod.__init__rb   r   r   r   r   r   c                 K   s  ~~| j js
tdt|}|d}	||_||_||_| j|_t	t
j||t
jddd|	d}
|d|
 | j\}}|| dkrKtd| d	| d
|| dkr\td| d	| d
|| }|| }tt
j|d|dft
jddd|	d}t
t
jj|d d < |d| d S )Nz=FP8_PB_WO currently only supports FP8-serialized checkpoints.r   r   r   r   r   r   z6ModelOpt FP8_PB_WO requires out_features divisible by , got rt   z5ModelOpt FP8_PB_WO requires in_features divisible by rp   r   )rD   r   r   r   r   r   r   r   r   r:   r   r   r   r   r   r8   r   r   r   )rJ   rb   r   r   r   r   r   r   r   r   r   r   r   out_blksin_blksr   rM   rM   rN   r   g  sb   



z*ModelOptFp8PbWoLinearMethod.create_weightsc                 C   sn   t |jjdd|_|j}| dkr|dd}n| dkr,tdt|j dt |	 dd|_d S )	NFr      r   rq   rp   z2Unexpected ModelOpt FP8_PB_WO weight_scale shape: rt   )
r   r   r   r   dimsqueezer   tupleshape
contiguous)rJ   rb   r   rM   rM   rN   r     s   z9ModelOptFp8PbWoLinearMethod.process_weights_after_loadingr   r   c                 C   s   | j j||j|jd |dS )N)inputr   r   r   r   )r   r   r   r   r   rM   rM   rN   r     s   z!ModelOptFp8PbWoLinearMethod.applyrF   )rP   rQ   rR   rS   r   r   r   r   r   rH   r   r   r   r   r   r   r   r   r   rM   rM   rM   rN   r   P  s8   
 	
Ar   c                       sz  e Zd ZdZdededdf fddZ	d+deej	ej	ej	f dB de
jdB fd	d
Zde
jdejjde
jfddZdejjdedededejf
ddZdedej	dej	dej	dej	dej	dej	fddZdejjddfddZdejjdedB fdd Zedefd!d"Zded#ej	d$ej	dej	eej	ej	f B fd%d&Zded#ej	d'ej	d(ej	dej	eej	ej	f B f
d)d*Z  ZS ),ModelOptFp8MoEMethodzMoE method for ModelOpt FP8.
    Supports loading FP8 checkpoints with static weight scale and
    activation scale.
    Args:
        quant_config: The ModelOpt quantization config.
    rD   ri   rZ   Nc                    s:   t  | || _| jjsJ t| jttd\| _| _d S N)r   
weight_keyactivation_key)	rG   rH   rD   r   r   moer2   fp8_backendexperts_clsrJ   rD   ri   rK   rM   rN   rH     s   zModelOptFp8MoEMethod.__init__routing_tablesc                 C      t | jj dNzV uses the new modular kernel initialization logic. This function should not be called.r   rL   rP   rJ   r	  rM   rM   rN   maybe_make_prepare_finalize     z0ModelOptFp8MoEMethod.maybe_make_prepare_finalizeprepare_finalizerb   c                 C   r
  r  r  rJ   r  rb   rM   rM   rN   select_gemm_impl     z%ModelOptFp8MoEMethod.select_gemm_implnum_expertshidden_sizeintermediate_size_per_partitionr   c                 K   s6  ||_ ||_| jjrtjn|}|d}| jjrdnd}	t	tj
||	| ||ddd|d}
|d|
 t	tj
||||ddd|d}|d| ttj||	fdtjd|d	}ttj|fdtjd|d	}|d
| |d| ttj|fdtjd|d	}ttj|fdtjd|d	}|d| |d| d S )Nr   rp   r   r   r   
w13_weight	w2_weight      ?r   w13_weight_scalew2_weight_scalew13_input_scalew2_input_scale)
orig_dtyper  rD   r   r   r   r   r  is_act_and_mulr:   r   r   r;   fullr   )rJ   rb   r  r  r  r   r   r   r   w13_num_shardsr  r  r  r  r  r  rM   rM   rN   r     sp   	
z#ModelOptFp8MoEMethod.create_weightsw13w2	w13_scalew2_scaler  r  c              
   C   s   t | j|||||||d\}}}}t|d| t|d| t|d| t|d| | || _| jrM| jd us9J t| j| j| j| j| |j	d| _
d S d S )N)r  rb   r"  r#  r$  r%  r  r  r  r  r  r  )moe_quant_configri   r  r  r	  shared_experts)r   r  r<   get_fused_moe_quant_configr&  r  r   r  !_maybe_init_expert_routing_tablesr'  moe_mk)rJ   rb   r"  r#  r$  r%  r  r  rM   rM   rN   _setup_kernelA  s4   
z"ModelOptFp8MoEMethod._setup_kernelc           	   	   C   s   |j }|j}|j}|j}|j}|j}t||\}}t|d| t|d| |j}t	||||j j
d | jjd\}}| ||||||| d S )Nr  r  r   )r  r  )r  r  r  r  r  r  r$   r<   r  r%   r   r  r  r+  )	rJ   rb   r"  r#  r$  r%  r  r  
shard_sizerM   rM   rN   r   j  s,   

	z2ModelOptFp8MoEMethod.process_weights_after_loadingc                 C   s,   |j }|j}|j}|j}t| j||||dS )N)r  w1_scaler%  a1_scalea2_scale)r  r  r  r  r   r  )rJ   rb   r-  r%  r.  r/  rM   rM   rN   r(    s   z/ModelOptFp8MoEMethod.get_fused_moe_quant_configc                 C      | j tjkS rF   )r  r   FLASHINFER_TRTLLMr   rM   rM   rN   is_monolithic     z"ModelOptFp8MoEMethod.is_monolithicr   router_logitsc                 C   sr   | j sJ | jtjksJ |jrtd|jdks!J d|j |jr&J t||||j	|j
|j|j|j|jd	S )Nz9EPLB not supported for FlashInfer TRTLLM FP8 MoE Backend.siluz#Expected 'silu' activation but got )	rb   hidden_statesr4  routing_biasglobal_num_expertstop_knum_expert_group
topk_groupapply_router_weight_on_input)r2  r  r   r1  enable_eplbr~   
activationrenormalizer"   e_score_correction_biasr8  r9  r:  r;  r<  rJ   rb   r   r4  rM   rM   rN   apply_monolithic  s*   


z%ModelOptFp8MoEMethod.apply_monolithictopk_weightstopk_idsc                 C   sf   | j rJ | jtjkr|jdv sJ d|j | jd usJ | j||j|j|||j|j|j	|j
d	S )N)r5  relu2_no_mulz>Expected activation to be in ('silu', 'relu2_no_mul'),but got r>  r8  
expert_mapr<  )r2  r  r   FLASHINFER_CUTLASSr>  r*  r  r  r8  rG  r<  rJ   rb   r   rC  rD  rM   rM   rN   r     s&   
zModelOptFp8MoEMethod.applyrF   )rP   rQ   rR   rS   r   r   rH   r   r   r   mkFusedMoEPrepareAndFinalizer  r   r   FusedMoEPermuteExpertsUnpermuter  r   r   r   r
   r+  r   r	   r(  propertyr   r2  rB  r   rT   rM   rM   rK   rN   r    s    
	


O
)

r  c                       s   e Zd ZdZ	ddededB dee deddf
 fd	d
Zde	fddZ
deej fddZedefddZede	dB fddZedededB dee deeef dedB dedd fddZ  ZS )ModelOptNvFp4ConfigzConfig class for ModelOpt FP4.   is_checkpoint_nvfp4_serializedr   NrX   r}   rZ   c                    s4   t  | || _|rtd || _|| _d S d S )NzkDetected ModelOpt NVFP4 checkpoint. Please note that the format is experimental and could change in future.)rG   rH   rP  r   r   r}   r   )rJ   rP  r   rX   r}   rK   rM   rN   rH     s   
zModelOptNvFp4Config.__init__c                 C   r   )Nmodelopt_fp4rM   r   rM   rM   rN   r     r   zModelOptNvFp4Config.get_namec                 C   s   t jt jt jgS rF   )r   r   r   r   r   rM   rM   rN   r     rO   z,ModelOptNvFp4Config.get_supported_act_dtypesc                 C   r   )NK   rM   r   rM   rM   rN   r     r   z&ModelOptNvFp4Config.get_min_capabilityc                 C   s   |du rdS | dd }|dkrdS d|v r/|d }t|tr-| dd}d|v r-dS dS | dd}t|trBd	| v rBdS dS )
zWDetect if this ModelOpt FP4 config should be used based on
        quantization config.Nrm   rg   r   r   r   rA   rQ  FP4r   r   rM   rM   rN   r     s    
z0ModelOptNvFp4Config.override_quantization_methodrm   r{   r|   r   c          
         sb   d|v }|d u r
d}|r*d|v r*|d  g d} fdd|D }	|	r*t d|	 | ||||S )NrA   rO  r   )r}   r   rX   c                       g | ]}| vr|qS rM   rM   .0fieldrD   rM   rN   
<listcomp>3      z4ModelOptNvFp4Config._from_config.<locals>.<listcomp>zJNVFP4 quantization requires the following fields in hf_quant_config.json: )r   )
r   rm   r{   rX   r|   r}   r   rP  required_fieldsmissing_fieldsrM   rX  rN   r     s*   
z ModelOptNvFp4Config._from_config)rO  )rP   rQ   rR   rS   r   r   r   r   rH   r   r   r   r   r   r   r   r   r   r   r   rT   rM   rM   rK   rN   rN    sL    
	rN  c                   @   r   )ModelOptNvFp4LinearMethoda{  Linear method for Model Optimizer NVFP4.
    Supports loading NVFP4 checkpoints with the following structure:

    input_scale: torch.float32, scalar ,
    weight: NVFP4(represented as byte) Shape: [1, X, y/2]
    weight_scale: FP8-E4M3, Shape: [X, Y], aka per block scale,
    weight_scale_2: torch.float32, scalar,
    Args: quant_config: The ModelOpt quantization config.
    rD   rZ   Nc                 C   s   || _ d | _t | _d S rF   )rD   rl   r.   rf   rI   rM   rM   rN   rH   O  s   z"ModelOptNvFp4LinearMethod.__init__rb   r   r   r   r   r   c                 K   s  ~~| j js
tdt|}|d}	||_||_||_|d dkr&td| j jr-tj	n|}
t
tj|j|jd tjddd|	d	}|d
| ttjt|tjd|	d}|d| ttjt|tjd|	d}|d| t
tj||| j j |
ddd|	d	}|d| d S )NzHNVFP4 quantization was selected,  dynamic quantization is not supported.r   rO  r   z=Unsupported model when in features size is not multiple of 16rp   r   r   r   r   r   r   weight_scale_2r   )rD   rP  r   r   r   r   r   r   r   r   r:   r   uint8r   r;   r[   r   r}   )rJ   rb   r   r   r   r   r   r   r   r   r   r   input_global_scaleweight_global_scaler   rM   rM   rN   r   T  sd   


z(ModelOptNvFp4LinearMethod.create_weightsc                 C   s   |j  tj}t|dd|_|` |j tj}t|dd|_|`t|j|j dd|_	td|j tjdd|_
t| j| d S )NFr   r  )r   r   tor   r   r   r`  r^  ra  alphainput_global_scale_invr-   rf   )rJ   rb   r`  ra  rM   rM   rN   r     s   z7ModelOptNvFp4LinearMethod.process_weights_after_loadingr   r   c                 C   s   t | j|||dS )N)rf   rb   r   r   )r,   rf   r   rM   rM   rN   r     s   zModelOptNvFp4LinearMethod.applyrF   )rP   rQ   rR   rS   rN  rH   r   r   r   r   r   r   r   r   r   r   rM   rM   rM   rN   r]  D  s6    

Jr]  c                       s  e Zd ZdZdededdf fddZ	d,deej	ej	ej	f dB de
jdB fd	d
Zde
jdejjde
jfddZdefddZdejjdedededejf
ddZdeddfddZedd Zdedej	dej	deej	eej	 f fddZdejjdedB fdd Zedefd!d"Zedefd#d$Zded%ej	dej	dej	eej	ej	f B fd&d'Z ded%ej	d(ej	d)ej	dej	eej	ej	f B f
d*d+Z!  Z"S )-ModelOptNvFp4FusedMoEz]
    MoE Method for FP4 Quantization.
    Args:
        quant_config: NVFP4 Quant Config
    rD   ri   rZ   Nc                    s:   t  | || _t| jttd\| _| _t	| j| _
d S r  )rG   rH   rD   r   r  r5   r4   nvfp4_backendr  r   use_global_sfr  rK   rM   rN   rH     s   
zModelOptNvFp4FusedMoE.__init__r	  c                 C   r
  r  r  r  rM   rM   rN   r    r  z1ModelOptNvFp4FusedMoE.maybe_make_prepare_finalizer  rb   c                 C   r
  r  r  r  rM   rM   rN   r    r  z&ModelOptNvFp4FusedMoE.select_gemm_implc                 C   r   )zY
        FP4 variants use 'weight_scale_2' pattern for per-tensor weight scales.
        TrM   r   rM   rM   rN   uses_weight_scale_2_pattern  s   z1ModelOptNvFp4FusedMoE.uses_weight_scale_2_patternr  r  r  r   c                 K   s  | j jsJ ||_||_| j |_ tj}tj}|d}	|d}
| jj	r&dnd}t
tj||| |d |ddd|	d}|d| t
tj|||d |ddd|	d}|d| t
tj||| || j j |ddd|	d}|d	| t
tj|||| j j |ddd|	d}|d
| |dtjji ttj||tjd|	d}|d| ttj|tjd|	d}|d| |dtjji | jr|
n|}ttj||tjd|	d}|d| ttj|tjd|	d}|d| d S )Nr   r8  rp   r   r   r   r  r  r  r  rm   r   w13_weight_scale_2w2_weight_scale_2r  r  )rD   rP  r  r   r   r_  r   r   r  r  r:   r   r   r}   updater   BLOCKvaluer;   r   TENSORrg  )rJ   rb   r  r  r  r   r   r   weight_scale_dtyper   r8  r!  r  r  r  r  ri  rj  global_sf_num_expertsr  r  rM   rM   rN   r     s   	





z$ModelOptNvFp4FusedMoE.create_weightsc                 C   s:  | j jrt|jdddf |jdddf std |jdddf  }t| j	||j
|j||j|j|j|j|j| j jd\}}}}}}}	}
t|d| t|d| t|d| t|d	| t|d
| t|d| t|d|	 t|d|
 | || _| jr| jdusJ t| j| j | j|j| d| _dS dS )zT
        Convert NVFP4 MoE weights into kernel format and setup the kernel.
        Nr   r   zIw1_weight_scale_2 must match w3_weight_scale_2. Accuracy may be affected.)rf  rb   r"  r$  w13_scale_2	a13_scaler#  r%  
w2_scale_2r/  r  r  r  ri  r  r  r  rj  r  )r&  ri   r  r'  r	  )r  r  r   allcloseri  r   warning_oncer   r   rf  r  r  r  r  r  rj  r  r<   r(  r&  r  r   r'  r)  r*  )rJ   rb   ri  r"  r$  rq  rr  r#  r%  rs  r/  rM   rM   rN   r   f  s`    z3ModelOptNvFp4FusedMoE.process_weights_after_loadingc                 C   r0  rF   )rf  r   r1  r   rM   rM   rN   do_post_quant_allgather  r3  z-ModelOptNvFp4FusedMoE.do_post_quant_allgatherr6  r4  c                 C   s@   | j tjkr
tdddl}|j||jdd\}}|g}||fS )zBOptionally prepare extra tensors to carry through DP allgather/EP.zVprepare_dp_allgather_tensor is only supported for FlashInfer TRTLLM NVFP4 MoE backend.r   NF)is_sf_swizzled_layout)rf  r   r1  RuntimeError
flashinferfp4_quantize	a1_gscale)rJ   rb   r6  r4  ry  hidden_states_fp4hidden_states_sfextra_tensorsrM   rM   rN   prepare_dp_allgather_tensor  s   
z1ModelOptNvFp4FusedMoE.prepare_dp_allgather_tensorc              	   C   s$   t | j|j|j|j|j|j|jdS )N)rf   r$  r%  rq  rs  rr  r/  )r   rf  r  r  ri  rj  r  r  r   rM   rM   rN   r(    s   z0ModelOptNvFp4FusedMoE.get_fused_moe_quant_configc                 C   r   )NTrM   r   rM   rM   rN   supports_eplb  r   z#ModelOptNvFp4FusedMoE.supports_eplbc                 C   s   | j tjko| jjj S rF   )rf  r   r1  r  moe_parallel_configr=  r   rM   rM   rN   r2    s   
z#ModelOptNvFp4FusedMoE.is_monolithicr   c                 C   sJ   | j sJ | jtjkr|jrJ t||||j|j|j|j	|j
|j|jd
S )N)
rb   r   r4  r9  r>  r8  r:  r;  custom_routing_functionr@  )r2  rf  r   r1  r=  r    r9  r>  r8  r:  r;  r  r@  rA  rM   rM   rN   rB    s    
z&ModelOptNvFp4FusedMoE.apply_monolithicrC  rD  c                 C   sr   | j rJ | jtjkr|jsJ t|||||j|j|jdS | j	d us%J | j	||j
|j|||j|j|j|jd	S )N)rb   r   rD  rC  r9  r>  r8  rF  )r2  rf  r   r1  r=  r!   r9  r>  r8  r*  r  r  rG  r<  rI  rM   rM   rN   r     s0   


zModelOptNvFp4FusedMoE.applyrF   )#rP   rQ   rR   rS   rN  r   rH   r   r   r   rJ  rK  r  r   r   rL  r  r   rh  r   r   r   r
   r   rM  rv  r   r  r	   r(  r  r2  rB  r   rT   rM   rM   rK   rN   re    s    
	


t>



re  c                       s   e Zd ZdZdededB dee ddf fddZdefd	d
Z	dee
j fddZedefddZde
jjdeddf fddZededB fddZedededB dee deeef dedd fddZ  ZS )ModelOptMxFp8Configz Config class for ModelOpt MXFP8.is_checkpoint_mxfp8_serializedr   NrX   rZ   c                    s2   t  | || _|stdtd || _d S )Nz[MXFP8 quantization requires a serialized checkpoint. Dynamic quantization is not supported.zkDetected ModelOpt MXFP8 checkpoint. Please note that the format is experimental and could change in future.)rG   rH   r  r   r   r   r   )rJ   r  r   rX   rK   rM   rN   rH     s   
zModelOptMxFp8Config.__init__c                 C   r   )Nmodelopt_mxfp8rM   r   rM   rM   rN   r   2  r   zModelOptMxFp8Config.get_namec                 C   s   t jgS rF   )r   r   r   rM   rM   rN   r   5  s   z,ModelOptMxFp8Config.get_supported_act_dtypesc                 C   r   )Nd   rM   r   rM   rM   rN   r   8  s   z&ModelOptMxFp8Config.get_min_capabilityrb   rY   rc   c                    s    t |tr	tdt ||S )NzhMXFP8 quantization does not yet support MoE models. Please use FP8 or NVFP4 quantization for MoE models.)rj   r
   r~   rG   rn   )rJ   rb   rY   rK   rM   rN   rn   =  s
   
z$ModelOptMxFp8Config.get_quant_methodc                 C   s   |du rdS | dd }|dkrdS d|v r3|d }t|tr1t| dd }d|v r1dS dS t| dd }d|v rCdS dS )	zYDetect if this ModelOpt MXFP8 config should be used based on
        quantization config.Nrm   rg   r   r   r   rB   r  r   r   rM   rM   rN   r   H  s    
z0ModelOptMxFp8Config.override_quantization_methodrm   r{   r|   r   c          	         sX   d|  v }|r&d|v r&|d  ddg} fdd|D }|r&td| | |||S )NrB   r   r   rX   c                    rT  rM   rM   rU  rX  rM   rN   rY  w  rZ  z4ModelOptMxFp8Config._from_config.<locals>.<listcomp>zJMXFP8 quantization requires the following fields in hf_quant_config.json: )r   r   )	r   rm   r{   rX   r|   r   r  r[  r\  rM   rX  rN   r   g  s$   

z ModelOptMxFp8Config._from_config)rP   rQ   rR   rS   r   r   r   rH   r   r   r   r   r   r   r   r   r   r   rn   r   r   r   r   rT   rM   rM   rK   rN   r    sP    
r  c                   @   r   )ModelOptMxFp8LinearMethodz.Linear method for ModelOpt MXFP8 quantization.rD   rZ   Nc                 C   s:   || _ | j jstdtj}t|d| _td|j	 d S )Nz\MXFP8 currently only supports serialized checkpoints. Dynamic quantization is not supported.)rf   zUsing %s backend for MXFP8 GEMM)
rD   r  r   r*   	EMULATIONr+   mxfp8_linear_opr   	info_oncerm  )rJ   rD   rf   rM   rM   rN   rH     s   z"ModelOptMxFp8LinearMethod.__init__rb   r   r   r   r   r   c                 K   s   ~~| j js
tdt|}|d}	||_||_||_|t dkr,tdt d| t	t
j||tddd|	d}
|d	|
 t	t
j||t tddd|	d}|d
| d S )NzoMXFP8 quantization was selected, but checkpoint is not MXFP8 serialized. Dynamic quantization is not supported.r   r   z2MXFP8 requires input dimension to be divisible by r   r   r   r   r   r   )rD   r  r   r   r   r   r   r   r'   r:   r   r   r)   r   r(   r   rM   rM   rN   r     sP   



z(ModelOptMxFp8LinearMethod.create_weightsc                 C   s   |j jdkrtd|j j dt|j j |j jtkr)tdt d|j j d|j j}|j\}}|t }|j	jd |d |f 
 }t|
 dd|_ t|dd|_	d S )	Nrp   z+MXFP8 weight must be 2D tensor [N, K], got zD with shape zMXFP8 weight must be z (FP8 E4M3), got z:. The checkpoint may not be properly quantized with MXFP8.Fr   )r   ndimr   r   r   r   r)   r   r'   r   r   r   )rJ   rb   r   NKscale_kr   rM   rM   rN   r     s$   

z7ModelOptMxFp8LinearMethod.process_weights_after_loadingr   r   c                 C   sd   |j jtkrtd|j j dt |jjtkr$td|jj dt | jj||j |j|j|dS )NzWeight dtype z != expected zWeight scale dtype )r   r   r   r   r   )r   r   r)   r   r   r(   r  r   r   rM   rM   rN   r     s"   zModelOptMxFp8LinearMethod.applyrF   )rP   rQ   rR   rS   r  rH   r   r   r   r   r   r   r   r   r   r   rM   rM   rM   rN   r    s6    
8r  )nr   typingr   r   r   torch.nn.parameterr   3vllm.model_executor.layers.fused_moe.modular_kernelmodel_executorlayers	fused_moemodular_kernelrJ  vllm.loggerr   $vllm.model_executor.layers.attentionr   +vllm.model_executor.layers.fused_moe.configr   r	   *vllm.model_executor.layers.fused_moe.layerr
   r   r   /vllm.model_executor.layers.fused_moe.oracle.fp8r   r   r   r   r   1vllm.model_executor.layers.fused_moe.oracle.nvfp4r   r   r   r   r   r   !vllm.model_executor.layers.linearr   r   r   'vllm.model_executor.layers.quantizationr   3vllm.model_executor.layers.quantization.base_configr   r   9vllm.model_executor.layers.quantization.kernels.scaled_mmr   0vllm.model_executor.layers.quantization.kv_cacher   @vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moer    r!   >vllm.model_executor.layers.quantization.utils.flashinfer_utilsr"   7vllm.model_executor.layers.quantization.utils.fp8_utilsr#   r$   r%   :vllm.model_executor.layers.quantization.utils.marlin_utilsr&   9vllm.model_executor.layers.quantization.utils.mxfp8_utilsr'   r(   r)   r*   r+   9vllm.model_executor.layers.quantization.utils.nvfp4_utilsr,   r-   r.   9vllm.model_executor.layers.quantization.utils.quant_utilsr/   r0   r1   r2   r3   r4   r5   8vllm.model_executor.layers.quantization.utils.w8a8_utilsr6   r7   vllm.model_executor.parameterr8   r9   r:   r;   vllm.model_executor.utilsr<    vllm.model_executor.models.utilsr=   rP   r   r   KV_CACHE_QUANT_ALGOSrC   rE   r   r   r   r   r  rU   rV   rW   rN  r]  re  r  r  rM   rM   rM   rN   <module>   sp    $		 LaYKw  e|  Vnz
