o
    Ti+                  
   @   sV  d dl mZ d dlmZmZmZmZmZmZm	Z	 d dl
Z
d dl
mZ ddlmZ dejdeeef fd	d
Zde
jdefddZdee
jj deee
jj ee
jj f fddZdee
jj deee
j ee
j f fddZ	ddeeeef eeeef df eeeef  f deeef deeeef  fddZdd ZdefddZdS )     )defaultdict)AnyDictListSetTupleUnioncastN)nn   )MoEmreturnc                 C   s<   d}d}|   D ]}t|trd}|j} ||fS q||fS )NFr   T)modules
isinstancer   num_experts)r   has_moer   module r   G/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/moe/utils.pyhas_moe_layers   s   
r   paramc                 C   s   t | dr
| js
dS dS )N	allreduceTF)hasattrr   )r   r   r   r   is_moe_param   s   r   paramsc                 C   s8   g }g }| D ]}t |r|| q|| q||fS N)r   append)r   shared_paramsexpert_paramspr   r   r   *split_params_into_shared_and_expert_params!   s   r!   groupc                 C   sV   g }g }| D ] }|j dur&t|r||j |j q||j |j q||fS )a  Split grad of parameters into grads of non-expert params
    and grads of expert params. This is useful while computing
    grad-norms for clipping and overflow detection

        group (List[torch.nn.Parameter]):
    Args:
            The group of parameters to split

    Returns:
        Tuple[List[torch.Tensor], List[torch.Tensor]]:
        list of gradients for non MoE params, list of gradients of MoE params
    N)gradr   r   todtype)r"   expert_gradsshared_gradsr    r   r   r   0split_params_grads_into_shared_and_expert_params.   s   
r(   髪
param_groups.max_group_sizec                 C   s  t | tr
t| } nt | tr| g} nt | ts!tdt|  t }| D ]}ttt	j
 |d D ]}t|r>||j q2q&tdd }| D ]}|D ]}i ||dg d||d  |< qLqH| D ]-}g }ttt	j
 |d D ]}t|r||d  |j d | qp|| qp||d< qb|dur| D ]S}| D ]L}g }	g }
d	}ttt	j
 |d D ]"}||  |kr|	| || 7 }q|
|	 |g}	| }q|	r|
|	 |
D ]}| i |d|i qqq| S | D ]}| D ]}| | qq| S )
a6  Split parameters into different MoE groups for optimizer

    Args:
        param_groups (Union[Dict[str, Any], Tuple[Dict[str, Any], ...], List[Dict[str, Any]]])
            The list of parameter groups to split

    Returns:
        List[Dict[str, Any]]:
        list of MoE/non-MoE groups for optimizer
    zUnknown param group type of r   c                   S   s   t tS r   )r   dictr   r   r   r   <lambda>d   s    zFsplit_params_into_different_moe_groups_for_optimizer.<locals>.<lambda>T)namemoer   r.   Nr   )r   tuplelistr,   
ValueErrortypesetr	   r   r
   	Parameterr   add
group_namer   r   valuesnumel)r*   r+   data_parallel_group_namesparam_groupr   	group_moekey
new_params	moe_group	cur_group
all_groupssize_of_cur_groupr"   r   r   r   4split_params_into_different_moe_groups_for_optimizerH   sp   



	




rC   c                 C   s   |  ddS )Nr/   F)get)r;   r   r   r   is_moe_param_group   s   rE   model_parametersc                 C   s   t | ts	J d| D ]}t |tjtfstdt| q| d }t |tjjr3| dd}t	|S t |trGt
dd | D sEt	| S | S d S )Nzmodel_parameters must be a listzgparam argument that would be given to the optimizer should be an iterable of Tensors or dicts, but got r   zdense-params)r   r.   c                 S   s   g | ]}d |v qS )r/   r   ).0r;   r   r   r   
<listcomp>   s    z.configure_moe_param_groups.<locals>.<listcomp>)r   r1   torchTensorr,   	TypeErrorr3   r
   r5   rC   any)rF   r    firstr;   r   r   r   configure_moe_param_groups   s    

rN   )r)   )collectionsr   typingr   r   r   r   r   r   r	   rI   r
   layerr   Moduleboolintr   rJ   r   r5   r!   r(   strfloatrC   rE   rN   r   r   r   r   <module>   s6   $



0

O