o
    
۾i%                     @   s  d dl mZmZ d dlZd dlmZ d dlmZ d dlm	Z	 d dl
m  m  m  mZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZm Z m!Z!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z(m)Z)m*Z*m+Z+m,Z, d dl-m.Z.m/Z/m0Z0 d dl1m2Z2 d dl3m4Z4m5Z5 d dl6m7Z7 d dl8m9Z9 d dl:m;Z; d dl<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZEmFZF d dlGmHZH d dlImJZJmKZK d dlLmMZMmNZNmOZOmPZPmQZQmRZRmSZS d dlTmUZUmVZVmWZW d dlXmYZY d dlZm[Z[m\Z\m]Z] d dl^m_Z_m`Z` d dlambZb d dlcmdZd erd dlemfZf d d!gZgeehZiG d"d# d#e4ZjG d$d% d%eZkd&ejld'ejld(dfd)d*ZmG d+d, d,e/ZnG d-d. d.enZoG d/d0 d0eZpG d1d2 d2epZqG d3d4 d4e9ZrdS )5    )TYPE_CHECKINGAnyN)Module)TorchDispatchMode)_custom_ops)rocm_aiter_ops)$get_tensor_model_parallel_world_size)init_logger)	Attention)vllm_is_batch_invariant)FusedMoEFusedMoEMethodBaseFusedMoEPermuteExpertsUnpermuteFusedMoEPrepareAndFinalizeFusedMoeWeightScaleSupported)FusedMoEQuantConfig)UnquantizedFusedMoEMethod)Fp8MoeBackend convert_to_fp8_moe_kernel_formatmake_fp8_moe_kernelmake_fp8_moe_quant_configselect_fp8_moe_backend)
LinearBaseLinearMethodBaseUnquantizedLinearMethod)QuantizationMethods)QuantizationConfigQuantizeMethodBase)init_fp8_linear_kernel)BaseKVCacheMethod)"apply_fi_trtllm_fp8_per_tensor_moe)
W8A8BlockFp8LinearOpcreate_fp8_input_scalecreate_fp8_scale_parametercreate_fp8_weight_parameter#maybe_post_process_fp8_weight_block%process_fp8_input_tensor_strategy_moe!process_fp8_weight_block_strategy"process_fp8_weight_tensor_strategy&process_fp8_weight_tensor_strategy_moevalidate_fp8_block_shape)get_marlin_input_dtype)apply_fp8_marlin_linearprepare_fp8_layer_for_marlin)
GroupShapeis_layer_skippedkFp8Dynamic128SymkFp8DynamicTensorSymkFp8DynamicTokenSymkFp8Static128BlockSymkFp8StaticTensorSym)cutlass_block_fp8_supportedcutlass_fp8_supportednormalize_e4m3fn_to_e4m3fnuz)initialize_single_dummy_weight)BlockQuantScaleParameterModelWeightParameterPerTensorScaleParameter)replace_parameterset_weight_attrs)current_platform)is_deep_gemm_supported)WeightsMapperstaticdynamicc                       s   e Zd ZdZ				d#dededee dB dee dB d	df
 fd
dZe	d	e
fddZe	d	eej fddZe	d	efddZe	d	ee fddZd$ddZe	deeef d	d fddZdejjded	dfddZd ed	edB fd!d"Z  ZS )%	Fp8ConfigzConfig class for FP8.FrB   Nis_checkpoint_fp8_serializedactivation_schemeignored_layersweight_block_sizereturnc                    s   t    || _|tvrtd| || _|pg | _|d urA|s%tdt|dkr5tdt| d|dkrAtd| d|| _d S )	NzUnsupported activation scheme zLThe block-wise quantization only supports fp8-serialized checkpoint for now.   zFThe quantization block size of weight must have 2 dimensions, but got z dimensionsrB   zUThe block-wise quantization only supports dynamic activation scheme for now, but got z activation scheme.)	super__init__rD   ACTIVATION_SCHEMES
ValueErrorrE   rF   lenrG   )selfrD   rE   rF   rG   	__class__ _/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/layers/quantization/fp8.pyrK   o   s0   


zFp8Config.__init__c                 C      dS )Nfp8rR   clsrR   rR   rS   get_name      zFp8Config.get_namec                 C   s   t jt jgS N)torchbfloat16halfrV   rR   rR   rS   get_supported_act_dtypes      z"Fp8Config.get_supported_act_dtypesc                 C   rT   )NK   rR   rV   rR   rR   rS   get_min_capability   rY   zFp8Config.get_min_capabilityc                 C   s   g S rZ   rR   rV   rR   rR   rS   get_config_filenames   rY   zFp8Config.get_config_filenameshf_to_vllm_mapperr@   c                 C   s    | j d ur|| j | _ d S d S rZ   )rF   
apply_list)rO   rc   rR   rR   rS   apply_vllm_mapper   s   
zFp8Config.apply_vllm_mapperconfigc                 C   sh   |  |dg}d|v }|  |dg}| |dgd }| |dgd }|s,| |dgd }| ||||dS )Nquant_methodrU   rE   rF   rG   modules_to_not_convert)rD   rE   rF   rG   )get_from_keysget_from_keys_or)rW   rf   rg   rD   rE   rF   rG   rR   rR   rS   from_config   s   zFp8Config.from_configlayerprefixzQuantizeMethodBase | Nonec                 C   s   t |tr*t|| j| jdrt S | jst| }t||_	|S t
| }t||_	|S t |trNt|| j| jdr=t|jS | jrGt| |}|S t| |}|S t |trWt| S d S )N)rm   rF   fused_mapping)
isinstancer   r/   rF   packed_modules_mappingr   rD   Fp8OnlineLinearMethodr+   marlin_input_dtypeFp8LinearMethodr   r   
moe_configFp8MoEMethodFp8OnlineMoEMethodr
   Fp8KVCacheMethod)rO   rl   rm   online_methodoffline_methodmoe_quant_methodrR   rR   rS   get_quant_method   s:   







zFp8Config.get_quant_methodnamec                 C   st   | drd|v r|ddS | drd|v r|ddS | dr-d|v r-|d	d
S | dr8|ddS dS )a%  
        Check whether the param name matches the format for k/v cache scales
        in compressed-tensors. If this is the case, return its equivalent
        param name expected by vLLM

        :param name: param name
        :return: matching param name for KV cache scale in vLLM
        z.output_scalez.k_projz.k_proj.output_scalez.attn.k_scalez.v_projz.v_proj.output_scalez.attn.v_scalez.q_projz.q_proj.output_scalez.attn.q_scalezself_attn.prob_output_scalez.prob_output_scalez.attn.prob_scaleN)endswithreplace)rO   r|   rR   rR   rS   get_cache_scale   s   	
zFp8Config.get_cache_scale)FrB   NN)rc   r@   )__name__
__module____qualname____doc__boolstrlistintrK   classmethodr   rX   r[   dtyper^   ra   rb   re   dictr   rk   nnr   r{   r   __classcell__rR   rR   rP   rS   rC   l   sH    

"

"rC   c                       s*   e Zd ZdZ fddZdddZ  ZS )	CopyNumelCounterz
    Tracks total number of elements modified with `copy_`. Useful for keeping
    track of weight loading where underlying weights can be arbitrarily
    transformed (such as with `narrow`) before calling copy.
    c                    s   t    d| _d S Nr   )rJ   rK   copied_numelrO   rP   rR   rS   rK      s   

zCopyNumelCounter.__init__rR   Nc                 C   sD   |d u ri }||i |}|t jjjjkr |  j|d  7  _|S r   )r[   opsatencopy_defaultr   numel)rO   functypesargskwargsoutrR   rR   rS   __torch_dispatch__   s   z#CopyNumelCounter.__torch_dispatch__)rR   N)r   r   r   r   rK   r   r   rR   rR   rP   rS   r      s    r   oldnewrH   c                 C   sB   t t|}i }t| D ]}||vrt| |||< qt|| dS )z;Copies any attrs present in `old` but not in `new` to `new`N)setdirgetattrr=   )r   r   	new_attrsattrs_to_setattrrR   rR   rS   _copy_missing_attrs  s   r   c                   @   s   e Zd ZdZdefddZdejjde	de
e	 de	d	e	d
ejfddZdeddfddZ	ddejjdejdejdB dejfddZdS )rs   a  Linear method for FP8.
    Supports loading FP8 checkpoints with static weight scale and
    dynamic/static activation scale.

    Limitations:
    1. Only support float8_e4m3fn data type due to the limitation of
       torch._scaled_mm (https://github.com/pytorch/pytorch/blob/2e48b39603411a41c5025efbe52f89560b827825/aten/src/ATen/native/cuda/Blas.cpp#L854-L856)

    Args:
        quant_config: The quantization config.
    quant_configc                 C   s  || _ t | _t | _d | _td ptj	| _
t s!t r$d| _
t r*d| _
t | _t | _| j j| _| jd u| _| j jdk| _| jrj| jrMJ | jd usTJ tt| j td| jd | j| jd| _d S | jrpt}nt rvt}nt}t|tt | jj d| _!d S )NY   FrA      r   )weight_group_shapeact_quant_group_shaper5   use_aiter_and_is_supported)activation_quant_keyweight_quant_key	out_dtypemodule_name)"r   r5   r[   get_default_dtyper   rr   r>   has_device_capabilityenvsVLLM_TEST_FORCE_FP8_MARLIN
use_marlinis_rocmis_xpur   r   is_linear_fp8_enabledr   r?   use_deep_gemmrG   block_quantrE   act_q_staticr!   r.   w8a8_block_fp8_linearr4   r6   r2   r1   r   rQ   r   
fp8_linear)rO   r   r   rR   rR   rS   rK     sH   



zFp8LinearMethod.__init__rl   input_size_per_partitionoutput_partition_sizes
input_sizeoutput_sizeparams_dtypec                 K   s  t |}|d}	||_||_||_||_d |_| jr0| jd us"J | j|_t|||||| j t	|||	}
|
d|
 | jsNtt||d |	}|
d| n| jrSJ | jd usZJ tt||| j|	}|
d| | jrt||	}t|ddi |
d| d S d S )Nweight_loaderweightweight_scaleweight_scale_inv
scale_typeinput_scale)sumgetlogical_widthsr   output_size_per_partition
orig_dtyperG   r   r*   r$   register_parameterr#   r;   r   r9   r"   r=   )rO   rl   r   r   r   r   r   extra_weight_attrsr   r   r   scalerR   rR   rS   create_weightsK  s\   

	

zFp8LinearMethod.create_weightsrH   Nc                 C   s
  d}d }| j r&| jrJ d}t|j|j\}}t|d|j t|d|j n7|j}|j}| jsKt	|||j
t|dd \}}}| jrK|d usGJ | }| }t|d|j t|d|j |d urht|d| nd |_| jrzt||| jd |`d S | j rt| d S d S )NTFr   r   r   r   input_dtype)r   r   r'   r   r   r<   datar   r   r(   r   r   maxtr   r-   rr   r%   )rO   rl   size_k_firstr   r   r   r   rR   rR   rS   process_weights_after_loading  sJ   


z-Fp8LinearMethod.process_weights_after_loadingxbiasc              
   C   s2  t  r[| jr| jd usJ | jj||j|j|j|dS |jt	j
}|jt	j
}| dkr4|| }n| dkrL|jd |jd krL||d }n|| }t	jj|| |S | jry| jre|j}n|j}t||j||j|j|j| j|dS | jr| jd usJ | jj||j|j|j|dS | j|||S )N)inputr   r   r   r   r   r   )r   r   r   	workspacesize_nsize_kr   r   )r   r   rG   r   applyr   r   r   tor[   r\   r   r   dimshape	unsqueezer   
functionallinearr   r   r,   r   r   r   rr   r   apply_weights)rO   rl   r   r   
weight_fp8r   weight_bf16rR   rR   rS   r     sV   	
zFp8LinearMethod.applyrZ   )r   r   r   r   rC   rK   r[   r   r   r   r   r   r   r   Tensorr   rR   rR   rR   rS   rs     s6    2
@=rs   c                   @   sN   e Zd ZdZdejjdedee dededej	fdd	Z
ded
dfddZdS )rq   zoOnline version of Fp8LinearMethod, loads the fp16/bf16 checkpoint
    and quantized the weights during loading.rl   r   r   r   r   r   c           
         sz   t |}|d| _| _| _| _d  _ fddttj	||d|dddd}	t
  _ d	|	 d S )
Nr   c                    s   t  ds%d _ttj j jdddd}t j|  d|  ` j} t	 }| | |g|R i |}W d    n1 sDw   Y    j|j
7  _ j } j|krc  d _|S )N_loaded_numelr   devicer   r   	input_dim
output_dimr   r   T)hasattrr   r:   r[   
empty_liker   _load_devicer   r   r   r   r   r   -_already_called_process_weights_after_loading)paramloaded_weightr   r   r   copy_numel_counterrestarget_loaded_numelrl   patched_weight_loaderrO   r   rR   rS   r   $  s,   



zCFp8OnlineLinearMethod.create_weights.<locals>.patched_weight_loadermetar   r   r   r   r   r   )r   r   r   r   r   r   rG   r:   r[   emptyget_default_devicer   r   )
rO   rl   r   r   r   r   r   r   r   r   rR   r   rS   r     s*   

,
z$Fp8OnlineLinearMethod.create_weightsrH   Nc                 C   s   t |ddrd S |jjtdkr3ttj|j|jddd|jjd}t|j| |	d| t
|j | jr8J d |_tj|jd d	\}}| }t|d|j t|d
|j | jrdd}t||| jd d|_d S )Nr   Fr   r   r   r   r   r   )r   r   Tr   )r   r   r   r[   r:   r   r   r   r   r   r8   r   r   r   scaled_fp8_quantr   r<   r   r   r-   rr   r   )rO   rl   r   qweightr   r   rR   rR   rS   r   `  s0   


z3Fp8OnlineLinearMethod.process_weights_after_loading)r   r   r   r   r[   r   r   r   r   r   r   r   rR   rR   rR   rS   rq     s     
Orq   c                       s  e Zd ZdZdedejjf fddZdede	de	de	d	ej
f
d
dZdedejdejdejdejdejdB dejdB ddfddZdeddfddZ	d,deejejejf dB dejdB fddZdedejjdefddZdejjdedB fddZedefd d!Zedefd"d#Zded$ejd%ejdejeejejf B fd&d'Zded$ejd(ejd)ejdejeejejf B f
d*d+Z  ZS )-ru   au  MoE method for FP8.
    Supports loading FP8 checkpoints with static weight scale and
    dynamic/static activation scale.

    Also supports loading quantized FP16/BF16 model checkpoints with dynamic
    activation scaling. The weight scaling factor will be initialized after
    the model weights are loaded.

    Args:
        quant_config: The quantization config.
    r   rl   c                    s   t  |j || _| jj| _| jd u| _| jrdnd| _| jr%t}t}nt	}| jj
dkr/t	nt}t| j||dd\| _| _d S )Nr   r   rA   F)rf   
weight_keyactivation_keyallow_vllm_cutlass)rJ   rK   rt   r   rG   r   weight_scale_namer3   r0   r4   rE   r1   r   moefp8_backendexperts_cls)rO   r   rl   r   r   rP   rR   rS   rK     s(   
zFp8MoEMethod.__init__num_expertshidden_sizeintermediate_size_per_partitionr   c                 K   s  ||_ ||_||_||_d |_| jjsJ tj}| j	rZ| jd us"J | j|_t
 }| jd | jd }}	|| dkrEtd| d| d|dkrZ||	 dkrZtd| d|	 dtjjtj|d| ||d	d
d}
|d|
 t|
| tjjtj||||d	d
d}|d| t|| | j	stj|dtjd	}tj|tjd	}n.tj|d|| d |  ||	 d |	 tjd	}tj||| d | ||	 d |	 tjd	}tjj|d
d}tjj|d
d}|d| j | |d| j | || j	rdtjjindtjji t|| t|| | jjdkrU| j	r!J tjjtj|tjd	d
d}|d| t|| tjjtj|tjd	d
d}|d| t|| d S d |_d |_d S )Nr   r   z,The output_size of gate's and up's weight = z3 is not divisible by weight quantization block_n = .z"The input_size of down's weight = z3 is not divisible by weight quantization block_k = rI   r   Frequires_grad
w13_weight	w2_weightw13_w2_rg   rA   w13_input_scalew2_input_scale)r  r  r  r   rG   r   rD   r[   float8_e4m3fnr   r   rM   r   	Parameterr   r   r=   onesfloat32r  updater   BLOCKvalueTENSORrE   r  r  )rO   rl   r  r  r  r   r   tp_sizeblock_nblock_kr  r  w13_scale_dataw2_scale_dataw13_weight_scalew2_weight_scaler  r  rR   rR   rS   r     s   		
	





zFp8MoEMethod.create_weightsw13w2	w13_scalew2_scaler  Nr  rH   c              
   C   s   t | j|||||||d\}}}}t|d| t|d| t|d| j | t|d| j | | || _| jrU| jd usAJ t| j| j| j| j|	 |j
d| _d S d S )N)r  rl   r"  r#  r$  r%  r  r  r  r  r  r  )moe_quant_configrt   r  r  routing_tablesshared_experts)r   r  r<   r  get_fused_moe_quant_configr&  r  r   r  !_maybe_init_expert_routing_tablesr(  moe_mk)rO   rl   r"  r#  r$  r%  r  r  rR   rR   rS   _setup_kernel,  s4   zFp8MoEMethod._setup_kernelc           	   	   C   s  t |ddrd S |j}|j}t |d| j }t |d| j }|j}|j}t r<t|||\}}}t|||\}}}| j	j
dkrd| jrGJ |d urO|d usQJ t||\}}t|d| t|d| | jst|j}t||||j\}}| ||||||| d|_d S )	Nr   Fr  r  rA   r  r  T)r   r  r  r  r  r  r>   is_fp8_fnuzr7   r   rE   r   r&   r<   r  r)   local_num_expertsr,  r   )	rO   rl   r"  r#  r$  r%  r  r  
shard_sizerR   rR   rS   r   Y  sH   




z*Fp8MoEMethod.process_weights_after_loadingr'  c                 C      t | jj dNzV uses the new modular kernel initialization logic. This function should not be called.rM   rQ   r   )rO   r'  rR   rR   rS   maybe_make_prepare_finalize  s   z(Fp8MoEMethod.maybe_make_prepare_finalizeprepare_finalizec                 C   r0  r1  r2  )rO   r4  rl   rR   rR   rS   select_gemm_impl  s   zFp8MoEMethod.select_gemm_implc                 C   sX   | j tjkrd S t|d| j }t|d| j }|j}|j}t| j ||||| jdS )Nr  r  )r  w1_scaler%  a1_scalea2_scaleblock_shape)	r  r   FLASHINFER_TRTLLMr   r  r  r  r   rG   )rO   rl   r6  r%  r7  r8  rR   rR   rS   r)    s   z'Fp8MoEMethod.get_fused_moe_quant_configc                 C   rT   )NTrR   r   rR   rR   rS   supports_eplb  rY   zFp8MoEMethod.supports_eplbc                 C   s   | j tjkS rZ   )r  r   r:  r   rR   rR   rS   is_monolithic  r_   zFp8MoEMethod.is_monolithicr   router_logitsc                 C   s  | j sJ | jtjksJ |jrtd|jdks!J d|j | jrudd l}t	j
jjdi d|d|jd|d|jd	|jd
|jd|jd|jd|jd|jd|jd|jd|j|j d|jd| jd|jd|jS t||||j|j|j|j|j|jd	S )Nz*EPLB not supported for `Fp8MoEMethod` yet.siluz#Expected 'silu' activation but got r   routing_logitsrouting_biasr   r  w13_weight_scale_invr  w2_weight_scale_invglobal_num_expertstop_knum_expert_group
topk_groupintermediate_sizeexpert_offsetr.  r9  routing_method_typerouted_scaling)	rl   hidden_statesr=  r@  rC  rD  rE  rF  apply_router_weight_on_inputrR   )r<  r  r   r:  enable_eplbNotImplementedError
activationr   :vllm.model_executor.layers.fused_moe.flashinfer_trtllm_moer[   r   vllm#flashinfer_fused_moe_blockscale_fp8e_score_correction_biasr  rA  r  rB  rC  rD  rE  rF  r  ep_rankr.  rG   rI  routed_scaling_factorr    rL  )rO   rl   r   r=  rQ  rR   rR   rS   apply_monolithic  sn   

	
zFp8MoEMethod.apply_monolithictopk_weightstopk_idsc                 C   s@   | j d usJ | jrJ | j ||j|j|||j|j|j|jd	S )N)rO  rC  
expert_maprL  )r+  r<  r  r  rO  rC  rY  rL  )rO   rl   r   rW  rX  rR   rR   rS   r     s   
zFp8MoEMethod.applyrZ   )r   r   r   r   rC   r[   r   r   rK   r   r   r   r   r   r,  r   tuplemkr   r3  r   r5  r   r)  propertyr   r;  r<  rV  r   r   rR   rR   rP   rS   ru     s    
{	
-5
	



3ru   c                
       s`   e Zd ZdZdedejjf fddZdede	de	de	d	ej
f
d
dZdeddfddZ  ZS )rv   a  MoE method for online FP8 quantization.
    Supports loading quantized FP16/BF16 model checkpoints with dynamic
    activation scaling. The weight scaling factor will be initialized after
    the model weights are loaded.

    Args:
        quant_config: The quantization config.
    r   rl   c                    s8   t  || |jrJ |jdksJ |jd u sJ d S )NrB   )rJ   rK   rD   rE   rG   )rO   r   rl   rP   rR   rS   rK     s   
zFp8OnlineMoEMethod.__init__r  r  r  r   c                    s6  |_ |_|_|_d _ d  } fdd}||d< | tjjtj|d| |d|ddd}		d	|	 t
|	  tjjtj|||d|ddd}
	d
|
 t
|
  t _tjjtj|tjddd}tjjtj|tjddd}	d| 	d| t
|  t
|  d _d _d S )Nr   c           	         sZ  t dsVd_tj_tj_tjj	tj
jjddd}t|  tj| d| tjj	tj
jjddd}t|  tj| d| `t| jkraj} n
t| jkrkj} t }| | |g|R i |}W d    n1 sw   Y   j|j7  _j j  }j|kr d_|S )	Nr   r   r   Fr  r  r  T)r   r   idr  _w13_weight_orig_idr  _w2_weight_orig_idr[   r   r  r   r   r=   r   r   r   r   r   r   r   )	r   r   r   r   r  r  r   r   r   r   rl   rO   r   rR   rS   r   ,  sB   




z@Fp8OnlineMoEMethod.create_weights.<locals>.patched_weight_loaderrI   r   r   Fr  r  r  r
  r   r!  )r  r  r  r   rG   r[   r   r  r   r   r=   r   r   r  r  r  r  )rO   rl   r  r  r  r   r   new_extra_weight_attrsr   r  r  r   r!  rR   r`  rS   r     s^   	>





z!Fp8OnlineMoEMethod.create_weightsrH   Nc           
   	   C   s  t |ddrd S |jjtdkr:tjjtj|j|jddd}t|d|jj	i t
|j| |d| t|j |jjtdkrltjjtj|j|jddd}t|d|jj	i t
|j| |d| t|j t }tj|j|d	}tj|j|d	}|j}|j}t|jD ]:}	t|j|	d d d d f \||	d d d d f< ||	< t|j|	d d d d f \||	d d d d f< ||	< q| ||||||j|j d
|_d S )Nr   Fr   r   r  r   r  r  r
  T)r   r  r   r[   r   r  r   r   r=   r   r   r   r8   r  r>   	fp8_dtyper   r!  ranger.  r   r   r,  r  r  r   )
rO   rl   r  r  rb  r"  r#  r$  r%  expertrR   rR   rS   r     s\   

 
z0Fp8OnlineMoEMethod.process_weights_after_loading)r   r   r   r   rC   r[   r   r   rK   r   r   r   r   r   rR   rR   rP   rS   rv     s     	
 	rv   c                       s&   e Zd ZdZdef fddZ  ZS )rw   zI
    Supports loading kv-cache scaling factors from FP8 checkpoints.
    r   c                    s   t  | d S rZ   )rJ   rK   )rO   r   rP   rR   rS   rK     s   zFp8KVCacheMethod.__init__)r   r   r   r   rC   rK   r   rR   rR   rP   rS   rw     s    rw   )stypingr   r   r[   torch.nnr   torch.utils._python_dispatchr   	vllm.envsr   3vllm.model_executor.layers.fused_moe.modular_kernelmodel_executorlayers	fused_moemodular_kernelr[  rQ  r   r   vllm._aiter_opsr   vllm.distributedr   vllm.loggerr	   $vllm.model_executor.layers.attentionr
   *vllm.model_executor.layers.batch_invariantr   $vllm.model_executor.layers.fused_moer   r   r   r   r   +vllm.model_executor.layers.fused_moe.configr   *vllm.model_executor.layers.fused_moe.layerr   /vllm.model_executor.layers.fused_moe.oracle.fp8r   r   r   r   r   !vllm.model_executor.layers.linearr   r   r   'vllm.model_executor.layers.quantizationr   3vllm.model_executor.layers.quantization.base_configr   r   9vllm.model_executor.layers.quantization.kernels.scaled_mmr   0vllm.model_executor.layers.quantization.kv_cacher   >vllm.model_executor.layers.quantization.utils.flashinfer_utilsr    7vllm.model_executor.layers.quantization.utils.fp8_utilsr!   r"   r#   r$   r%   r&   r'   r(   r)   r*   :vllm.model_executor.layers.quantization.utils.marlin_utilsr+   >vllm.model_executor.layers.quantization.utils.marlin_utils_fp8r,   r-   9vllm.model_executor.layers.quantization.utils.quant_utilsr.   r/   r0   r1   r2   r3   r4   8vllm.model_executor.layers.quantization.utils.w8a8_utilsr5   r6   r7   -vllm.model_executor.model_loader.weight_utilsr8   vllm.model_executor.parameterr9   r:   r;   vllm.model_executor.utilsr<   r=   vllm.platformsr>   vllm.utils.deep_gemmr?    vllm.model_executor.models.utilsr@   rL   r   loggerrC   r   r   r   rs   rq   ru   rv   rw   rR   rR   rR   rS   <module>   sf   0$	 
  z    T