o
    پi                     @   s  d dl mZ d dlmZ d dlZd dlZd dlmZ d dl	m
Z
mZ d dlmZ e Ze
do0eZG dd deZed	ed
dddddddejjfd	ejdejdejdejdejdeej deej deej deej dedeej deej dedejfddZ									d,d	ejdejdejdejdejdedededed eej d!eej d"eej d#eej d$eee  dejfd%d&Zejd'ejfd(d)Zd*d+ ZdS )-    )IntEnum)OptionalN)get_bool_env_varis_hip)register_custom_opSGLANG_USE_AITERc                   @   s   e Zd ZdZdZdS )ActivationMethodr      N)__name__
__module____qualname__SILUGELU r   r   X/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/moe/rocm_moe_utils.pyr      s    r   hidden_states)	out_shapeeagerFw1w2topk_weightstopk_ids	fc1_scale	fc2_scalefc1_smooth_scalefc2_smooth_scalea16per_tensor_quant_scaleexpert_maskactivation_methodreturnc                 C   sB   ddl m} ddlm} ||}|| |||||||||	|
||dS )Nr   )ActivationType)asm_moe_tkw1)r   r   r   r   r   r   r   
activation)aiterr!   aiter.fused_moe_bf16_asmr"   )r   r   r   r   r   r   r   r   r   r   r   r   r   r!   r"   r#   r   r   r   rocm_aiter_asm_moe_tkw1   s$   r&   silur#   apply_router_weight_on_inputuse_fp8_w8a8per_channel_quantw1_scalew2_scalea1_scalea2_scaleblock_shapec                 C   s   |dkrt jnt j}|tj}|tj}|rB|rB|rB| dks&J d|jd dks1J dt	| |||||	|
d d dd d |dS J d	)
Nr'      z4`topk_weights` should be in shape (num_tokens, topk)r	   z?Only support topk=1 when `apply_router_weight_on_input` is TrueF)r   r   r   r   r   r   r   r   zThis should not be called.)
r   r   r   totorchfloat32int32dimshaper&   )r   r   r   r   r   r#   r(   r)   r*   r+   r,   r-   r.   r/   r   r   r   r   rocm_fused_experts_tkw1B   s6   r8   BLOCK_Nc                 C   s   t d}t d}t |}||krd S || t d| }||k }| ||  ||  }t j||dd}|d }|||  ||	  }t j||dd}|| }|||
  ||  }t j|||d d S )Nr   r	   g        )maskother   g      ?)r:   )tl
program_idloadarangestore)A_ptr	scale_ptrOut_ptrMNrecv_token_num	stride_am	stride_an	stride_sm	stride_sn	stride_om	stride_onr9   pid_mpid_nrecv_token_num_valoffs_nr:   a_ptrsa	scale_idxs_ptrssoutout_ptrsr   r   r   upscale_kernely   s   


rY   c           	      C   sz   | j \}}tj| |d}d}|t||f}t| | |||||| d| d|d|d|d|d|d |S )N)dtype   r   r	   )r9   )r7   r3   
empty_liketritoncdivrY   stride)	hidden_statehidden_state_scalerG   output_dtyperE   rF   Outr9   gridr   r   r   upscale   s(   
re   )	r'   FFFNNNNN)enumr   typingr   r3   r]   triton.languagelanguager=   sglang.srt.utilsr   r   sglang.srt.utils.custom_opr   _is_hip
_use_aiterr   r   valueTensorboolintr&   strlistr8   jit	constexprrY   re   r   r   r   r   <module>   s   
		
,	


7*