o
    ߥi                     @   s   d Z ddlmZmZmZ ddlZddlmZ dd Zdej	d	e
fd
dZdeejj d	eejjejjf fddZdeejj d	eejjejjf fddZdee d	ee fddZdS )z-
Copyright 2020 The Microsoft DeepSpeed Team
    )DictListTupleN   )MoEc                 C   s@   d}d}|   D ]\}}t|trd}|j} ||fS q||fS )NFr   T)named_modules
isinstancer   num_experts)mhas_moer	   _module r   [/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/nlp/gpt_moe/moe/utils.pyhas_moe_layers   s   
r   paramreturnc                 C   s   t | dr
| js
dS dS )N	allreduceTF)hasattrr   )r   r   r   r   is_moe_param   s   r   paramsc                 C   s:   g g }}| D ]}t |r|| q|| q||fS )N)r   append)r   shared_paramsexpert_paramspr   r   r   *split_params_into_shared_and_expert_params   s   
r   groupc                 C   sV   g }g }| D ] }|j dur&t|r||j |j q||j |j q||fS )a  Split grad of parameters into grads of non-expert params
    and grads of expert params. This is useful while computing
    grad-norms for clipping and overflow detection

        group (List[torch.nn.Parameter]):
    Args:
            The group of parameters to split

    Returns:
        Tuple[List[torch.nn.Parameter], List[torch.nn.Parameter]]:
        list of gradients for non MoE params, list of gradients of MoE params
    N)gradr   r   todtype)r   expert_gradsshared_gradsr   r   r   r   0split_params_grads_into_shared_and_expert_params)   s   
r"   param_groupsc                 C   s  t | tr
t| } nt | tr| g} nt | ts!tdt|  t }| D ]}|d D ]}t|r8||j	 q,q&t|}i }| D ]O}i ||d < |D ]D}i ||d  |< |||d  | d< d||d  | d< |
 D ]!}|dkr|dkrg ||d  | |< qn|| ||d  | |< qnqLqB| D ]'}g }|d D ]}t|r||d  |j	 d | q|| q||d< q| D ]\}}	|	 D ]	\}
}| | qqt| S )zSplit parameters into different MoE groups for optimizer

    Args:
        param_groups (Tuple[Dict]):
            The list of parameter groups to split

    Returns:
        Tuple[Dict]:
        list of MoE/non-MoE groups for optimizer
    zUnknown param group type of r   nameTmoe)r   tuplelistdict
ValueErrortypesetr   add
group_namekeysr   items)r#   data_parallel_group_namesparam_groupr   	group_moekeyori_key
new_paramskvk1v1r   r   r   4split_params_into_different_moe_groups_for_optimizerC   sb   






r:   )__doc__typingr   r   r   torchlayerr   r   Tensorboolr   nn	Parameterr   r"   r:   r   r   r   r   <module>   s*    



