o
    پi                     @  s   U d dl mZ d dlmZ d dlmZmZ d dlZd dlm	Z	m
Z
mZmZmZ d dlmZ er7d dlmZmZ daded	< eG d
d deZeG dd deZeG dd de	ZedddddZdS )    )annotations)	dataclass)TYPE_CHECKINGOptionalN)MoeQuantInfoMoeRunnerConfigRunnerInputRunnerOutputregister_fused_func)MoeRunnerBackend)StandardCombineInputStandardDispatchOutputOptional[torch.Tensor]MARLIN_MOE_WORKSPACEc                   @  s@   e Zd ZU dZded< ded< ded< ded< edd	d
ZdS )MarlinRunnerInputz.Input bundle passed to the Marlin runner core.torch.Tensorhidden_statestopk_weightstopk_idsrouter_logitsreturnr   c                 C     t jS Nr   MARLINself r   [/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/moe/moe_runner/marlin.pyrunner_backend#      z MarlinRunnerInput.runner_backendNr   r   __name__
__module____qualname____doc____annotations__propertyr   r   r   r   r   r      s   
 r   c                   @  s(   e Zd ZU dZded< ed	ddZdS )
MarlinRunnerOutputz3Output bundle returned from the Marlin runner core.r   r   r   r   c                 C  r   r   r   r   r   r   r   r   .   r    z!MarlinRunnerOutput.runner_backendNr!   r"   r   r   r   r   r)   (   s
   
 r)   c                   @  s   e Zd ZU dZded< ded< ded< ded< ded< ded	< d
ed< dZded< dZded< dZded< dZded< dZ	ded< dZ
ded< dS )MarlinMoeQuantInfoz4Quantization payload consumed by the Marlin backend.r   w13_qweight
w2_qweight
w13_scales	w2_scalesr   w13_g_idx_sort_indicesw2_g_idx_sort_indicesintweight_bitsN	w13_g_idxw2_g_idxTbool	is_k_full
w13_qzeros	w2_qzeros
expert_map)r#   r$   r%   r&   r'   r3   r4   r6   r7   r8   r9   r   r   r   r   r*   3   s   
 r*   nonemarlindispatch_outputr   
quant_inforunner_configr   r   r   c           	      C  s  ddl m} ddlm} ddlm} | j}| j}|jdks!J dt	d u s+t	j
|j
kr2||j
dda	|di d	|d
|jd|jd|jd|jd|jd|jd|jd|jd|jd|jd|jd|jd|jd|jdt	d|jd|jd|jd|j|j}||dS )Nr   )fused_marlin_moe)r   )marlin_make_workspacesiluz"Only SiLU activation is supported.   )max_blocks_per_smr   w1w2w1_scalew2_scalegating_outputr   r   r9   g_idx1g_idx2sort_indices1sort_indices2w1_zerosw2_zeros	workspacenum_bitsr6   inplacerouted_scaling_factor)r   r   )7sglang.srt.layers.moe.fused_moe_triton.fused_marlin_moer?   /sglang.srt.layers.moe.token_dispatcher.standardr   +sglang.srt.layers.quantization.marlin_utilsr@   r   topk_output
activationr   devicer+   r,   r-   r.   r   r   r   r9   r3   r4   r/   r0   r7   r8   r2   r6   rQ   rR   todtype)	r<   r=   r>   r?   r   r@   r   rV   outputr   r   r   fused_experts_none_to_marlinL   sr   	
r\   )r<   r   r=   r*   r>   r   r   r   )
__future__r   dataclassesr   typingr   r   torch%sglang.srt.layers.moe.moe_runner.baser   r   r   r	   r
   sglang.srt.layers.moe.utilsr   &sglang.srt.layers.moe.token_dispatcherr   r   r   r'   r   r)   r*   r\   r   r   r   r   <module>   s"    
