o
    پi(                     @   s4  d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZ dd Zd	efd
dZdejdejfddZd'ddZdg ddfddZedkre  Zejdeddd ejdedg ddd ejd d!d"d# e Zed$ej  ed%ej  eejejejd& dS dS )(    N)
AutoConfig)cutlass_fused_experts_fp8fused_experts)MoeRunnerConfig)StandardTopKOutputc                 C   sB   |   |  } }| |  ||   }d| |   | }d| S )N      )doublesum)xydenominatorsim r   P/home/ubuntu/.local/lib/python3.10/site-packages/sglang/test/test_cutlass_moe.py	calc_diff   s   r   tp_sizec                 C   sH   t jddd}|j}|j}|j}d| |  }|||j||j|jd dS )Nzdeepseek-ai/Deepseek-R1T)trust_remote_coder   weight_block_size)num_expertstopkhidden_sizeshard_intermediate_sizedtypeblock_shape)r   from_pretrainedn_routed_expertsnum_experts_per_tokmoe_intermediate_sizer   r   quantization_config)r   configEr   intermediate_sizer   r   r   r   get_model_config   s   r$   tensorreturnc                 C   s\   t t j}t t | }|dkrd}n|j| }| | }|j|j|jdjt jd}|S )z=Converts tensor to FP8 E4M3, scaling values to fit the range.r   g      ?)minmax)r   )torchfinfofloat8_e4m3fnr(   absclampr'   to)r%   r*   max_valscale_factorscaled_tensor
fp8_tensorr   r   r   to_fp8)   s   
r3   Fc                    s
  t d| d td tjd |d }|d }|d }|d }|d	 |d
 }t d| d| d| d| d| d  tj||fd|dtj|||fdtjd}	tj|||d fdtjd}
t|	t|
\}}|| d | }|| d | }|| d | }|d | d | }tj}tj|||fdd|dtj|||fdd|dtj	tj
||d|dddtjd|||ftjddtj|f|tjdd tj|f|tjddtj|f|d tjddtj|f|tjddtjddtjdtj|ftjddtj|ftjddtj|ftjddtj|ftjddtj|ftjddtj|d ftjdd
tj|dftjddtj|dftjddd	tjtj dkred	ntjtj dkrsd	 	
fdd }ttj||fj|dd!t|||||d"d#d$fd%d }t d& td'D ]	}| }| }qtj  g d(}t d) tjj|d*|d+\}}}t d, tjj|d*|d+\}}}t d-|d.d/|d.d0|d.d1 t d2|d.d/|d.d0|d.d1 |rt d3 t > tdddddddd 
	d4}td5d6	}W d    n	1 s_w   Y  t||}t d7|d8 |d9k s}J d:| t d; d S d S )<Nz
--- Batch Size: z ---cuda*   r   r   r   r   r   r   z
Config: E=z, topk=z, H=z
, I_shard=z, dtype=z, block_shape=)devicer   r   r	   )dimr   )r   r6   i 8p    )FFzNVIDIA H200)FTz
NVIDIA H20)TTc                      sR   t dddddddd 
	dS )Nr	   r   	enable_es)r   	transposer   )
a1_strides
a2_stridesa_ptrsa_scales_ptrsb_ptrsb_scales_ptrs
c1_strides
c2_stridesr;   expert_offsetsout_ptrsproblem_sizes1problem_sizes2topk_idstopk_weightsw1w1_scalew2w2_scale	workspacer   r   r   <lambda>   s.    



zrun_test.<locals>.<lambda>)rJ   rI   router_logitssiluF)r   top_kr   intermediate_size_per_partitionparams_dtype
activationinplacec                      s   t d d	S )NTuse_fp8_w8a8rL   rN   r   r   r   )r   moe_runner_configtopk_outputrK   rL   rM   rN   r   r   r   rP      s    zWarming up...
   )g      ?g?g?z%Benchmarking Cutlass fused_experts...i  )rep	quantilesz$Benchmarking Triton fused_experts...zCutlass fused_experts time: z.3fz ms (median) [z - ]zTriton  fused_experts time: zRunning correctness check...r:   TrX   zDiff: z.6fg-C6?zDiff too high! zCorrectness check passed.)printr)   set_default_devicer4   manual_seed_allrandnfloat32r3   fullsoftmaxrandrandintint32int64emptyuint8get_device_namecurrent_devicer   r6   r   rangesynchronizetritontestingdo_bench_cudagraphno_gradr   r<   r   r   )r   
batch_sizemodel_configcheckr"   r   HIr   w1_hpw2_hpblock_nblock_kw1_blocks_dim1w1_blocks_dim2w2_blocks_dim1w2_blocks_dim2scale_dtypecutlass_lambdatriton_lambda_r^   
cutlass_mscutlass_mincutlass_max	triton_ms
triton_min
triton_max	y_cutlassy_tritondiffr   )r=   r>   r?   r@   rA   rB   r   rC   rD   r;   rE   rZ   rF   rG   rH   rI   r[   rJ   rK   rL   rM   rN   rO   r   r   run_test?   s  
&4







'r      )	r	      r          @            c                 C   s.   t | }td| |D ]	}t| ||| qd S )NzModel Config:)r$   r`   r   )r   batch_sizesrw   rv   ru   r   r   r   main  s
   
r   __main__z	--tp-sizezTensor Parallel size)typedefaulthelpz--batch-sizes+)r	   r   r   r   r   r   r   r   r   i   i   i   i    zList of batch sizes to test)r   nargsr   r   z--check
store_truezEnable check mode)actionr   z!Running benchmarks with TP size: zTesting batch sizes: )r   r   rw   )F)argparser)   rq   triton.testingtransformersr   !sglang.srt.layers.moe.cutlass_moer   0sglang.srt.layers.moe.fused_moe_triton.fused_moer   %sglang.srt.layers.moe.moe_runner.baser   sglang.srt.layers.moe.topkr   r   intr$   Tensorr3   r   r   __name__ArgumentParserparseradd_argument
parse_argsargsr`   r   r   rw   r   r   r   r   <module>   s>    
 O