o
    پi"                  )   @   s  d dl mZ d dlZd dlZd dlmZ d dlmZmZ dej	dej	fddZ
dCd
dZejdg dejddgejddgejddgejddgejdddgejddgejddgejdejgdd Z				dDdej	d ej	d!ej	d"ej	d#ej	d$ej	d%ej	d&ej	d'ej	d(ej	d)ej	d*ej	d+ej	d,ej	d-ej	d.ed/eej	 d0eej	 d1eej	 d2ef(d3d4Z						dEd5ej	d6ed$ej	d%ej	d7ej	d8ej	d9ej	d:ej	d;ed<ed=eej	 d>eej	 d?eej	 d@eej	 fdAdBZdS )F    )OptionalN)cutlass_w4a8_moe)
TopKConfigselect_expertsint4_values_interleavedreturnc                 C   sf   | j d d dkrtd| tj}|ddd df }|ddd df }|d> |d@ B }|tjS )	N   r   zAthe last dim size of int4_values_interleaved tensor must be even..         )shape
ValueErrortotorchint8)r   input_tensor_int8low_nibbleshigh_nibblespacked_tensor r   U/home/ubuntu/.local/lib/python3.10/site-packages/sglang/test/test_cutlass_w4a8_moe.pypack_int4_values_to_int8   s   r   r   c           
      C   s   |j d |j d }}t|  }|| ||d ftj}| }||j d |j d |j d | |}|	dddd}||j d |j d | |j d | }| }	||	fS )Nr
   r	   r      )
r   r   cpucudaviewr   r   
contiguousreshapepermute)
num_experts
ref_weight	ref_scale	alignmentnkweightw_qscale_interleavedw_scaler   r   r   pack_interleave   s$   r*   M)r
   r	   r         Ni   Ki   E   tp_sizer,   
use_ep_moeTFtopk
group_size   dtypec	           (      C   s  |r|| }	n|}	|| }d}
|
rft j| |f|ddd }t j|	|d |ft jdd}t j|	||ft jdd}t jdt jdd}t jdt jdd}t j|	|d || f|dd}t j|	||| f|dd}nYt j| ||dd}t jdd|	|d |ft jdd}t jdd|	||ft jdd}d	}t jdt jdd}t jdt jdd}t j|	|d || |dd| }t j|	||| |dd| }t|	||\}}|rt|	||\}}n	t|	||d\}}d}t j|	d
f||t jd}t j|	d
fd| |t jd}t j|	d
f||t jd}t j|	d
f||t jd}|}|}|}|}t j| |f||d} t	|| t
|ddd}!|!\}"}#}$t j|t j|d}%d|%|	d < t||||||"|#|||||||||	|||%}&t||	|"|#||||dd||||d}'t j  t jj|&|'ddd td d S )NFr   r7   devicegMbP?r	   r
   ir,   g{Gzt?r   )r9   r7   )top_krenormalize)hidden_statesrouter_logitstopk_configr   T)has_pre_quant	has_alphapre_quant_scale_1pre_quant_scale_2alpha_1alpha_2g{Gz?g?)rtolatolz(SUCCESS: Final output tensors are close.)r   onesr   float32randnrandintr*   fullint64r   r   arangeint32cutlass_moerefr   synchronizetestingassert_closeprint)(r+   r.   r/   r0   r2   r3   r4   r5   r7   local_edebugaref_weight_1ref_weight_2a1_scalea2_scalescale_1scale_2affine_coeffw1_qw1_scalew2_qw2_scaler9   
a_strides1
c_strides1
a_strides2
c_strides2
b_strides1s_strides13
b_strides2
s_strides2scoretopk_outputtopk_weightstopk_ids_
expert_mapoutput
ref_outputr   r   r   test_cutlass_w4a8_moe4   s   




rs   rW   r_   ra   r`   rb   rm   rn   rc   rg   rd   re   ri   rf   rh   rj   num_local_expertsrZ   r[   rp   apply_router_weight_on_inputc                 C   s   || }| j }tj|d tj|d}tj|dftj|d}tj|dftj|d}t| |||||||||	|
||||||||||S )Nr
   r8   r   )r9   r   emptyrN   r   )rW   r_   ra   r`   rb   rm   rn   rc   rg   rd   re   ri   rf   rh   rj   rt   rZ   r[   rp   ru   r9   expert_offsetsproblem_sizes1problem_sizes2r   r   r   rO      sD   rO   xr    rX   rY   ref_weight_scale_1ref_weight_scale_2r?   r@   rA   rB   rC   rD   c                 C   s  t | }| j}t|D ]}||k}|d }| |d d f }|jd dkr)q|| d| d}t ||
	  dd
t j
|}|| }|| jddd
t	}|
t	| 
|}t ||j| 
t j}|jddd\}}|t jj| }t ||	  dd
t j}|
|}|| }|| jddd
t	}|
t	| 
|}t ||j| 
t j}||d d f  || 
|j7  < q|S )	Nr
   r   g      |g      |@r6   )dimr	   r   )r   
zeros_liker7   rangesumboolr   	unsqueezeclampfloatr   float8_e4m3fnrepeat_interleavematmulTfloat16chunknn
functionalsilu)rz   r    rm   rn   rX   rY   r{   r|   r?   r@   rA   rB   rC   rD   resultsr7   e_idxmaskactivated_tokensactfinal_scalew3_w1ref_w_scale_repeatfc1gatew2fc2r   r   r   rP      s>   

&rP   )r   )NNNF)FFNNNN)typingr   pytestr   &sglang.srt.layers.moe.cutlass_w4a8_moer   sglang.srt.layers.moe.topkr   r   Tensorr   r*   markparametrizebfloat16rs   intr   rO   rP   r   r   r   r   <module>   s   
 	

D	
