o
    شظ¾i¸7  م                   @   s  d Z ddlmZ ddlmZmZ ddlZddlmZ ddl	m
Z
 ddlmZ er<ddlmZ dd	lmZmZ dd
lmZ z
ddlmZ dZW n eyQ   dZY nw eG dd„ dƒƒZdddedee fdd„Zejdeƒ dچdejdedejfdd„ƒZG dd„ de
ƒZdS )a  
KT Expert Parallelism Wrapper for MoE layers.

This module provides a generic wrapper that enables CPU-GPU expert parallelism
for any MoE quantization method. It coordinates parallel execution of GPU experts
(using any quantization method) and CPU experts (using AMX/AVX instructions).
é    )ع	dataclass)عTYPE_CHECKINGعOptionalN)عget_tensor_model_parallel_rank)عFusedMoEMethodBase)عget_compiler_backend)عMoeRunnerConfig)عCombineInputعStandardDispatchOutput)ع
ServerArgs)عKTMoEWrapperTFc                   @   sb   e Zd ZU dZeed< eed< eed< eed< eed< eed< eed< eed	< d
Zee ed< d
S )عKTConfiga"  Configuration for KTransformers heterogeneous computing CPU part.

    Args:
        layer_idx: Layer index in the model
        num_gpu_experts: Number of experts to run on GPU
        cpuinfer_threads: Number of CPU inference threads
        threadpool_count: Number of thread pools for CPU computation
        weight_path: Path to CPU quantized weights
        chunked_prefill_size: Chunk size for prefill computation
        method: CPU computation method (e.g., "int4")
        num_layers: Total number of layers in the model (optional)
    ع	layer_idxعnum_gpu_expertsعcpuinfer_threadsعthreadpool_countعweight_pathعchunked_prefill_sizeعmax_deferred_experts_per_tokenعmethodNع
num_layers)	ع__name__ع
__module__ع__qualname__ع__doc__عintع__annotations__عstrr   r   © r   r   ْW/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/moe/kt_ep_wrapper.pyr   #   s   
 r   عserver_argsr   r   عreturnc                 C   sf   | j du rdS d}z|  ، }t|ddƒ}W n	 ty   Y nw t|| j| j| j| j | j| j	| j
|dچ	S )zàCreate KTConfig from ServerArgs if KT is configured.

    Args:
        server_args: Global server arguments
        layer_idx: Layer index in the model

    Returns:
        KTConfig if KT is configured, None otherwise
    Nعnum_hidden_layers)	r   r   r   r   r   r   r   r   r   )عkt_weight_pathعget_hf_configعgetattrع	Exceptionr   عkt_num_gpu_expertsعkt_cpuinferعkt_threadpool_countr   ع	kt_methodع!kt_max_deferred_experts_per_token)r    r   r   ع	hf_configr   r   r   ع!create_kt_config_from_server_args=   s(   
‏÷r-   )عdynamicعbackendعtopk_idsr   c                 C   s   d| | |k< | S )a  Mask CPU expert IDs by setting them to -1.

    This function masks expert IDs that should be computed on CPU (IDs >= num_gpu_experts)
    so they won't be computed on GPU. The masked IDs are set to -1, which causes the
    GPU MoE kernel to skip those experts.

    Args:
        topk_ids: Tensor of shape [num_tokens, top_k] containing expert IDs
        num_gpu_experts: Number of experts that should run on GPU (experts 0 to num_gpu_experts-1)

    Returns:
        Modified topk_ids tensor with CPU expert IDs masked as -1
    éےےےےr   )r0   r   r   r   r   عmask_cpu_expert_idsb   s   r2   c                
   @   sت   e Zd ZdZdedefdd„Zdejj	de
de
d	e
d
ejf
dd„Zdejj	ddfdd„Zdejj	ddfdd„Zdejj	ddddfdd„Zdejdejfdd„Zdejj	ddddfdd„Zdefd d!„ZdS )"عKTEPWrapperMethodaے  Wrapper for any MoE quantization method to enable CPU-GPU expert parallelism.

    This wrapper coordinates parallel execution of:
    - GPU experts (0 to num_gpu_experts-1) using any quantization method
    - CPU experts (num_gpu_experts to total_experts-1) using AMX/AVX instructions

    The wrapper implements the submit-compute-sync pattern:
    1. Submit CPU expert computation (non-blocking)
    2. Execute GPU expert computation in parallel
    3. Synchronize and merge CPU+GPU results

    Example:
        # Wrap any GPU method with AMX/AVX CPU expert support
        gpu_method = CompressedTensorsWNA16MoE(quant_config, prefix)
        kt_config = KTConfig(layer_idx=0, num_gpu_experts=4, ...)
        method = KTEPWrapperMethod(gpu_method, kt_config)
    ع
gpu_methodع	kt_configc                 C   sH   t stdƒ‚|| _|| _|j| _d| _| j| j_tƒ | _d| _d| _	dS )z؟Initialize the KT EP wrapper.

        Args:
            gpu_method: The quantization method to use for GPU experts
            kt_config: Configuration for KT CPU expert computation
        zVkt_kernel is not installed. To use KTransformers EP wrapper, please install kt_kernel.TN)
عKTRANSFORMERS_AVAILABLEعImportErrorr4   r5   r   عoverride_num_local_expertsr   عtp_rankعwrapperع_layer_params)عselfr4   r5   r   r   r   ع__init__ˆ   s   ے

zKTEPWrapperMethod.__init__عlayerعnum_expertsعhidden_sizeعintermediate_size_per_partitionعparams_dtypec           
      K   sخ   || _ || _|| _|j}|j|j }| jjpd}	| jjdur0| jjdur0| jj| jjd kr0d}	| j	j
d|| j|||dœ|¤ژ | jdkret| jj||||| j| jj| jj| jj| jj| jj|	dچ| _dS dS )aڑ  Create weights for both GPU and CPU experts.

        Args:
            layer: The MoE layer module
            num_experts: Total number of experts (GPU + CPU)
            hidden_size: Hidden dimension size
            intermediate_size_per_partition: Intermediate size per TP partition
            params_dtype: Data type for parameters
            **extra_weight_attrs: Additional weight attributes
        r   Né   )r>   r?   r@   rA   rB   )r   r?   عnum_experts_per_tokr@   عmoe_intermediate_sizer   r   r   r   r   r   r   r   )عglobal_num_expertsr@   rA   عtop_kعmoe_tp_sizer5   r   r   r   r4   عcreate_weightsr   r9   r   r   r   r   r   r   r:   )
r<   r>   r?   r@   rA   rB   عextra_weight_attrsrD   عintermediate_size_fullعlayer_max_deferredr   r   r   rI   ¥   sH   
ےûْ
ôےz KTEPWrapperMethod.create_weightsr!   Nc                 C   sn   t | jdƒr| j |، | jdkr3| jdur5tj ،  ddlm	} |ƒ j
| jj  ، }| j |، dS dS dS )znProcess weights after loading from checkpoint.

        Args:
            layer: The MoE layer module
        عprocess_weights_after_loadingr   N)ع#get_global_expert_location_metadata)عhasattrr4   rM   r9   r:   عtorchعcudaعsynchronizeع(sglang.srt.eplb.expert_location_dispatchrN   عphysical_to_logical_map_cpur5   r   ع
contiguousعload_weights)r<   r>   rN   rT   r   r   r   rM   ê   s   
ے‎َz/KTEPWrapperMethod.process_weights_after_loadingعmoe_runner_configr   c                 C   s&   || _ | jr
| j|_| j ||، dS )z‍Create MoE runner for computation.

        Args:
            layer: The MoE layer module
            moe_runner_config: Configuration for MoE runner
        N)rW   r8   r   عnum_local_expertsr4   عcreate_moe_runner)r<   r>   rW   r   r   r   rY     s   	z#KTEPWrapperMethod.create_moe_runnerعdispatch_outputr
   c                 C   sd   | j jdks
J dƒ‚| jdks| jdu rdS |j}|j}|\}}}| j |||tj 	|j
،j، dS )a_  Submit CPU expert computation asynchronously (non-blocking).

        This method submits the CPU expert computation to AMX/AVX without waiting
        for completion, allowing GPU computation to proceed in parallel.

        Args:
            layer: The MoE layer module
            dispatch_output: Dispatched tokens and routing information
        عsiluz"Only SiLU activation is supported.r   N)rW   ع
activationr9   r:   عhidden_statesعtopk_outputعsubmit_forwardrP   rQ   عcurrent_streamعdeviceعcuda_stream)r<   r>   rZ   عxr^   عtopk_weightsr0   ع_r   r   r   عsubmit  s   ے
ےzKTEPWrapperMethod.submitrc   c                 C   s8   | j dks
| jdu rt |،S | j |tj |j،j،S )a&  Synchronize and retrieve CPU expert computation results.

        This method waits for the CPU computation to complete and returns the results.

        Args:
            x: Reference tensor for shape and device information

        Returns:
            CPU expert computation results
        r   N)	r9   r:   rP   ع
zeros_likeعsync_forwardrQ   r`   ra   rb   )r<   rc   r   r   r   عsync1  s
   
ےzKTEPWrapperMethod.syncr	   c                 C   s’   ddl m} |j}|j}| jdkr|  ||، |j}t|| jƒ}|j	|dچ}|j	|dچ}	| j
 ||	،}
|
j}| jdkrD|  |،}|| }||dچS )aے  Execute hybrid CPU+GPU MoE forward pass with parallelism.

        This is the main computation method that coordinates:
        1. Submit CPU expert computation (non-blocking)
        2. Execute GPU expert computation in parallel
        3. Synchronize CPU results and merge with GPU results

        Args:
            layer: The MoE layer module
            dispatch_output: Dispatched tokens and routing information

        Returns:
            Combined computation results from CPU and GPU experts
        r   )عStandardCombineInput)r0   )r^   )r]   )ع&sglang.srt.layers.moe.token_dispatcherrj   r]   r^   r9   rf   r0   r2   r   ع_replacer4   عapplyri   )r<   r>   rZ   rj   rc   r^   r0   عmasked_topk_idsعmasked_topk_outputعmasked_dispatch_outputعgpu_combine_inputعoutputع
cpu_outputr   r   r   rm   D  s"   
ے


zKTEPWrapperMethod.applyعnamec                 C   s0   |dv rt dt| ƒj› d|› d‌ƒ‚t| j|ƒS )a-  Delegate attribute access to the wrapped GPU method.

        This allows the wrapper to transparently expose attributes and methods
        from the wrapped GPU quantization method.

        Args:
            name: Attribute name

        Returns:
            Attribute value from gpu_method
        )r4   r:   r5   ْ'z' object has no attribute ')عAttributeErrorعtyper   r%   r4   )r<   rt   r   r   r   ع__getattr__w  s
   ےzKTEPWrapperMethod.__getattr__)r   r   r   r   r   r   r=   rP   عnnعModuler   عdtyperI   rM   rY   rf   عTensorri   rm   r   rx   r   r   r   r   r3   u   sP    ‏
‎‏‎üû
ْEے
ے‏‎
ü‏‎
ü3r3   )r   عdataclassesr   عtypingr   r   rP   عsglang.srt.distributedr   ع*sglang.srt.layers.quantization.base_configr   عsglang.srt.utilsr   عsglang.srt.layers.moer   rk   r	   r
   عsglang.srt.server_argsr   ع	kt_kernelr   r6   r7   r   r   r-   عcompiler|   r2   r3   r   r   r   r   ع<module>   s:   ےےے
‏%