o
    پie                      @   s   d dl mZ d dlZdejdedeejejf fddZdejdedeejejejf fd	d
Zdejdedededef
ddZdejdedededededeejejejf fddZ	dgZ
dS )    )TupleNweight	num_packsreturnc                    sL  | j \}}|| dksJ ||   dkr2tj| dtj| jd| j }tj| tjd}||fS |  j	dddj
 }tj| dtjdd	}tj|dd
}t|D ]N}dg| }dg| || D ]=}	t fddt|D |jd}
|
  k s~J |
|||	f< |
 |||	f< ||
  | ||	f 7  < |
  d7  < qcqS||fS )a  
    Pack n weighted objects to m packs, such that each bin contains exactly n/m objects and the weights of all packs
    are as balanced as possible.

    Parameters:
        weight: [X, n], the weight of each item
        num_packs: number of packs

    Returns:
        pack_index: [X, n], the pack index of each item
        rank_in_pack: [X, n], the rank of the item in the pack
    r      dtypedevice)r	   T)
descendingcpu)
fill_valuer	   r
   )r   c                 3   s     | ]}|  k r|V  qd S )N ).0igroups_per_pack
pack_itemsr   \/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/eplb/eplb_algorithms/deepseek.py	<genexpr>)   s    z#balanced_packing.<locals>.<genexpr>)key)shapetorcharangesizeint64r
   expand
zeros_likefloatsortindicesr   	full_likerangemin__getitem__)r   r   
num_layers
num_groups
pack_indexrank_in_packr    r   pack_weightsgrouppackr   r   r   balanced_packing   s:   



r,   num_phyc                 C   s   | j \}}|| }|dksJ | j}tj|tj|d|d}tj||tj|d}tj||tj|d}tj|tj|d}	t||D ])}
| | j	ddj
}||dd|
f< ||	|f |dd|
f< ||	|f  d7  < qA|||fS )a  
    Replicate `num_log` experts to `num_phy` replicas, such that the maximum load of all replicas is minimized.

    Parameters:
        weight: [X, num_log]
        num_phy: total number of experts after replication

    Returns:
        phy2log: [X, num_phy], logical expert id of each physical expert
        rank: [X, num_phy], the replica rank
        logcnt: [X, num_log], number of replicas for each logical expert
    r   r   r   r   )dimN)r   r
   r   r   r   repeatzerosonesr"   maxr    )r   r-   nnum_lognum_redundantr
   phy2logranklogcntarangenr   redundant_indicesr   r   r   replicate_experts4   s   

r;   num_physical_expertsr&   	num_nodesnum_gpusc                 C   s  | j \}}|| dksJ || }|| dksJ || }|| dks%J || dks-J || }	dtjdtjfdd}
| d||fd}t||\}}|| | | dtj|tj|j	d 
d}|
|}| d|d|| }t||| \}}}|| d|}t||| \}}||	 | }|
|}|d|}|||dtjd||| |j	d	d
dd
 
d}|d|}|d||d}||dd|}|||fS )aC  
    Parameters:
        weight: [num_moe_layers, num_logical_experts]
        num_physical_experts: number of physical experts after replication
        num_groups: number of expert groups
        num_nodes: number of server nodes, where the intra-node network (e.g, NVLink) is faster
        num_gpus: number of GPUs, must be a multiple of `num_nodes`

    Returns:
        physical_to_logical_map: [num_moe_layers, num_physical_experts]
        logical_to_physical_map: [num_moe_layers, num_logical_experts, X]
        logical_count: [num_moe_layers, num_logical_experts]
    r   permr   c              	   S   s:   t | }|d| t j| dt j| jd| j |S )Nr   r   )	r   
empty_likescatter_r   r   r   r
   r   r   )r?   invr   r   r   inversep   s   
z/rebalance_experts_hierarchical.<locals>.inverser   r   )r
   r   )r   r   Tensor	unflattensumr,   	unsqueezer   r   r
   flattengatherviewr;   )r   r<   r&   r=   r>   r%   num_logical_experts
group_sizegroups_per_nodephy_experts_per_gpurC   tokens_per_groupgroup_pack_indexgroup_rank_in_packlog2mlogmlog2logtokens_per_mlogphy2mlogphyrankmlogcnttokens_per_phyr'   r(   phy2pphypphy2phy	pphy2mlogpphy2logpphyrankr8   r   r   r   rebalance_experts_hierarchicalS   s\   


	
r_   num_replicasenable_hierarchicalc              	   C   s   | j \}}|   } |rt| ||||\}}	}
nt| |dd|\}}	}
|
  }tj|||fdtj|
j	d}|
|dd|| |	 tj|tj|j	d|d |||
fS )a  
    Entry point for expert-parallelism load balancer.

    Parameters:
        weight: [layers, num_logical_experts], the load statistics for all logical experts
        num_replicas: number of physical experts, must be a multiple of `num_gpus`
        num_groups: number of expert groups
        num_nodes: number of server nodes, where the intra-node network (e.g, NVLink) is faster
        num_gpus: number of GPUs, must be a multiple of `num_nodes`

    Returns:
        physical_to_logical_map: [layers, num_replicas], the expert index of each replica
        logical_to_physical_map: [layers, num_logical_experts, X], the replica indices for each expert
        expert_count: [layers, num_logical_experts], number of physical replicas for each logical expert
    r   r   r   )r   r   r   r_   r2   itemr   fullr   r
   rK   rA   r   r   )r   r`   r&   r=   r>   ra   r%   rL   r6   rW   r8   	maxlogcntlog2phyr   r   r   rebalance_experts   s0   





rf   )typingr   r   rE   intr,   r;   r_   boolrf   __all__r   r   r   r   <module>   sV   
-

U

5