o
    Tiů                     @   s  d Z ddlmZ ddlZddlZddlZddlmZ ddlm	Z	 ddl
Z
ddlmZ zddlmZ W n eyA   ddl
mZ Y nw ddlmZmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z& e$ j'Z(e$ j)Z*G dd dZ+i Z,dd Z-dd Z.G dd de/Z0dd Z1dd Z2de3fddZ4dd  Z5d!d" Z6d#d$ Z7G d%d& d&e/Z8d'd( Z9d)d* Z:dkd,d-Z;dld.d/Z<dmd0d1Z=dkd2d3Z>d4d5 Z?d6d7 Z@d8d9 ZAG d:d; d;ZBdaCdaDdnd>d?ZEd@dA ZFdBdC ZGdodDdEZHdFdG ZIdHdI ZJdpdKdLZKdqdMdNZLdrdQdRZMdSdT ZNdUdV ZOdWdX ZPG dYdZ dZe
jQjRZSd[d\ ZTdsd]d^ZUd_d` ZVdtdae3dbe3fdcddZWdodbe3fdedfZXdgeeef dheeef de3fdidjZYdS )uzP
Copyright NVIDIA/Megatron

Helper functions and classes from multiple sources.
    )IterableNsqrt)prod)
functional)inf)UnionListDict)comm)is_moe_param)groupslogger)bwc_tensor_model_parallel_rank bwc_pipeline_parallel_world_sizebwc_pipeline_parallel_group)PIPE_REPLICATED)get_accelerator)	transposec                   @   s   e Zd ZdZdd ZdS )
DummyOptimz
    Dummy optimizer presents model parameters as a param group, this is
    primarily used to allow ZeRO-3 without an optimizer
    c                 C   s   g | _ | j d|i d S )Nparams)param_groupsappend)selfr    r   K/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/runtime/utils.py__init__-   s   zDummyOptim.__init__N)__name__
__module____qualname____doc__r   r   r   r   r   r   '   s    r   c                 O   s   |j tvrnt  }|t   t | ||i | W d    n1 s*w   Y  t  | t  t|j < t t|j   ||i | W d    n1 sZw   Y  | rlt 	t|j   d S d S t 	t|j   d S N)
r   graph_cacher   Streamwait_streamcurrent_streamstreamcreate_graphcapture_to_graphreplay_graph)replay_first_stepfuncargskwargscuda_streamr   r   r   graph_process5   s   

r/   c                 C   s   | S r!   r   )r+   r   r   r   noop_decoratorG      r0   c                   @   s$   e Zd Zdd Zdd Zdd ZdS )noop_contextc                 C      d S r!   r   r   r   r   r   r   M   r1   znoop_context.__init__c                 C   r3   r!   r   r4   r   r   r   	__enter__P   r1   znoop_context.__enter__c                 C   r3   r!   r   )r   exc_typeexc_valexc_tbr   r   r   __exit__S   r1   znoop_context.__exit__N)r   r   r   r   r5   r9   r   r   r   r   r2   K   s    r2   c                 C   s   t j| }t j|dd dS )z|Create the directory path to ``filename`` if it does not already exist.

    Args:
        filename (str): A file path.
    T)exist_okN)ospathdirnamemakedirs)filenamer=   r   r   r   ensure_directory_existsW   s   r@   c                 C   s4   ddl }ddl}||  |j|  t|  dS )zSet the random seed for common PRNGs used during training: random, numpy, and torch.

    Args:
        seed (int): the seed to use
    r   N)numpyrandomseedtorchmanual_seed)rC   rA   rB   r   r   r   set_random_seeda   s
   
rF   returnc                 C   s,   t | dr
| jr
dS t | dr| jrdS dS )Nmodel_parallelTtensor_model_parallelF)hasattrrH   rI   pr   r   r   is_model_parallel_parametern   s
   rM   c                    sx    | r	|  S t| tr fdd| D S t| tr)t fdd| D S t| tr: fdd|  D S | S )a~  
    Return a copy of tensor on specified device.
    Works on individual tensors, and tensors contained/nested in lists, tuples, and dicts.
    Parameters:
        item: tensor to copy or (possibly nested) container of tensors to copy.
        device: target device
        criterion_func: Function to restrict copy operation to items meet criterion

    Returns:
        None
    c                       g | ]}t | qS r   copy_to_device.0vcriterion_funcdevicer   r   
<listcomp>       z"copy_to_device.<locals>.<listcomp>c                    rN   r   rO   rQ   rT   r   r   rW      rX   c                       i | ]\}}|t | qS r   rO   rR   krS   rT   r   r   
<dictcomp>       z"copy_to_device.<locals>.<dictcomp>)to
isinstancelisttupledictitems)itemrV   rU   r   rT   r   rP   x   s   



rP   c                    s    | r|  }|j| _| S t| tr fdd| D S t| tr/t fdd| D S t| tr@ fdd|  D S | S )a  
    Move tensor on to specified device by changing the storage.
    Works on individual tensors, and tensors contained/nested in lists, tuples, and dicts.
    Parameters:
        item: tensor to move or (possibly nested) container of tensors to move.
        device: target device
        criterion_func: Function to restrict move operation to items meet criterion

    Returns:
        None
    c                    rN   r   move_to_devicerQ   rT   r   r   rW      rX   z"move_to_device.<locals>.<listcomp>c                    rN   r   re   rQ   rT   r   r   rW      rX   c                    rY   r   re   rZ   rT   r   r   r\      r]   z"move_to_device.<locals>.<dictcomp>)r^   datar_   r`   ra   rb   rc   )rd   rV   rU   device_copyr   rT   r   rf      s   



rf   c                 C   sJ   | d t tj|d }tj|t  tj d}tj||d | } | S )N      ?grouprV   dtype)	floatdistget_world_sizerD   tensorr   current_device_name
all_reducerd   )all_groups_normrk   scaled_normscaled_norm_tensorr   r   r   get_norm_with_moe_layers_fast   s
   rw   c                   @   sL   e Zd ZdZdddZdddZdd	d
Zdd ZdddZe	dd Z
dS )CheckOverflowz7Checks for overflow in gradient across parallel processNFc                 C   s`   || _ |rg nd | _|| _|| _d| _|r,|D ]}|D ]}| j| t|r*d| _qqd S d S )NFT)mpur   zero_reduce_scatter	deepspeedhas_moe_paramsr   r   )r   r   ry   rz   r{   rk   paramr   r   r   r      s   zCheckOverflow.__init__Tc                 C   s   d|v }t  |g}| jrtj|tjjt d | j	d ur-tj|tjj| j	
 d n|r<tj|tjjd t  |d  }t|S )Noprk   )r   r   )r   FloatTensorr|   ro   rs   ReduceOpMAXr   _get_max_expert_parallel_groupry   get_model_parallel_groupbarrierrd   bool)r   
norm_groupreduce_overflowoverflowoverflow_gpur   r   r   check_using_norm   s   
zCheckOverflow.check_using_normc                 C   sf   g }d}|d u r| j }| j}n|d usJ d|D ]}|D ]}|| t|r*d}qq| j||dS )NFz0self.params and param_groups both cannot be noneT)r|   )r   r|   r   r   has_overflow)r   r   r   r|   rk   r}   r   r   r   check   s    

zCheckOverflow.checkc                 C   s6   t |D ]\}}|jd ur| |jj|r dS qdS )NTF)	enumerategrad_has_inf_or_nanrg   )r   r   irL   r   r   r   has_overflow_serial   s
   z!CheckOverflow.has_overflow_serialc                 C   s  |d u r| j }| |}t |g}|r!tj|tjjt	 d | j
r1tj|tjjt d nR| jd url| jd ur^t| jd}|rI| jjdu sQ|s^| jjdu r^tj|tjj| j d tj|tjj| j d n| jd ur| jjdu rtj|tjjt d |d  }t|S )Nr   "pipeline_enable_backward_allreduceFr   )r|   r   r   
ByteTensorro   rs   r   r   r   r   rz   get_world_groupry   r{   rJ   r   enable_backward_allreduceget_data_parallel_groupr   rd   r   )r   r   r|   r   r   using_pipeliner   r   r   r      s.   



zCheckOverflow.has_overflowc              
   C   sv   z
t |    }W n ty% } zd|jd vr W Y d }~dS d }~ww |t dks7|t d ks7||kr9dS dS )Nzvalue cannot be convertedr   Tr   F)rn   sumRuntimeErrorr,   )xr   cpu_suminstancer   r   r   r     s   "zCheckOverflow._has_inf_or_nan)NNFN)Tr!   )r   r   r   r    r   r   r   r   r   staticmethodr   r   r   r   r   rx      s    



rx   c                 C   s   dd l }t }|dkr?d}t|j dD ]\}}|t|s'|} nqt	
d| d|  d| d| d|j 
 d S d S )Nr   r~   zrank z detected overflow z in tensor :z shape )mathro   get_rankr   rg   
contiguousviewisfinitern   r   infoshape)r   r   r   r   rankt_iv_irS   r   r   r   _handle_overflow%  s   .r   c                 C   s"   d}| D ]}||d 7 }qt |S )z( Compute total from a list of norms
            g       @r   )	norm_list
total_normnormr   r   r   get_global_norm1  s   r      c                 C   s  t | tjr	| g} ttdd | } t|}g }|tkrN| D ]}||jj	
    qt| }|t  }|durMtj|tjj| d nnd}| D ]1}|durs| dksbt|rr|jj	  |}|| qR|jj	  |}|| qRt|dkrt|   }nt dg}|t  }|durtj|tjj| d |d| }t  }|d ttj!|d }	|	}
tj|
|d |
}|| d j"}tj#t|g|j"d	}||d
  }tj#dg|j"d	}t$||}| D ]
}|jj	%| q|S )a9  Clips gradient norm of an iterable of parameters.

    This has been adapted from Nvidia megatron. We add norm averaging
    to consider MoE params when calculating norm as they will result
    in different norms across different ranks.

    This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and
    added functionality to handle model parallel parameters. Note that
    the gradients are modified in place.

    Arguments:
        parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
            single Tensor that will have gradients normalized
        max_norm (float or int): max norm of the gradients
        norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
            infinity norm.

    Returns:
        Total norm of the parameters (viewed as a single vector).
    c                 S   
   | j d uS r!   r   rK   r   r   r   <lambda>R     
 z!clip_grad_norm_.<locals>.<lambda>Nr   r   r   ri   rj   rV   ư>)&r_   rD   Tensorr`   filterrn   r   r   r   rg   absmaxstackr^   r   rr   ro   rs   r   r   r   get_model_parallel_rankrM   detachr   lensquarer   r   SUMpowr   _get_data_parallel_grouprp   rV   rq   minmul_)
parametersmax_norm	norm_typery   	all_normsrL   r   
param_normpgru   rv   	clip_coef
tmp_tensorr   r   r   clip_grad_norm_;  sT   
r   c                 C   s  t | tjr	| g} ttdd | } t|}|tkrCtdd | D }t 	t|g}|dur<t
j|t
jj| d |d  }nd}t| D ]p\}}|durt|| dkrtjd	d
gt  |jd|| jd d	}tj|jd d	 t  |jd}	|	d|| d
|d
d dd
 }	t|jj|	d |}
n	|jj |}
||
 | 7 }qIt 	t|g}|durt
j|t
jj | d |d  d|  }|tdks|td ks||krd
}|S )a  Get grad norm of an iterable of parameters.

    This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and
    added functionality to handle model parallel parameters. Note that
    the gradients are modified in place. Taken from Nvidia Megatron.

    Arguments:
        parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
            single Tensor that will have gradients normalized
        norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
            infinity norm.
        grad_norm_mask (List[Tensor]): A list of Tensor, where
            each Tensor is a 2D Tensor containing ranges of [start_index, end_index].
    Returns:
        Total norm of the parameters (viewed as a single vector).
    c                 S   r   r!   r   rK   r   r   r   r     r   z)get_flattened_grad_norm.<locals>.<lambda>c                 s   s     | ]}|j j  V  qd S r!   )r   rg   r   r   rR   rL   r   r   r   	<genexpr>  s    z*get_flattened_grad_norm.<locals>.<genexpr>Nr   r   r      r~   rl   ri   r   )!r_   rD   r   r`   r   rn   r   r   r   r   ro   rs   r   r   r   rd   r   r   rq   rr   rm   repeatr   zerosscatter_r   cumsumr   masked_fillr   rg   r   r   )r   r   ry   grad_norm_maskr   total_norm_cudaidxrL   cum_sum_pairsmask_tensorr   r   r   r   get_flattened_grad_norm  sJ   "r   c                 C   s   t | tjr	| g} ttdd | } d}t|d}| D ]%}t|tr&|jr&q|dkr/t	|s/q|j
 t|j
 }|| 7 }qt t|g}|durZtj|tjj| d |d  }|S )aP  Compute the number of grads with zero values.

    This is adapted from get_grad_norm

    Arguments:
        parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
            single Tensor that will have gradients normalized

    Returns:
        Total number of params with zero values (viewed as a single vector).
    c                 S   r   r!   r   rK   r   r   r   r     r   z get_grad_zeros.<locals>.<lambda>r   ry   r   Nr   )r_   rD   r   r`   r   r   rJ   r   ds_pipe_replicatedrM   r   numelcount_nonzerord   r   r   rn   ro   rs   r   r   r   )r   ry   total_zerostensor_mp_rankrL   count_zerostotal_zeros_cudar   r   r   get_grad_zeros  s"   
r   c                 C   s<  t | tjr	| g} t|}|tkr:tdd | D }t t|g}|dur3tj	|tj
j| d |d  }nOd}t|d}| D ]"}t|trN|jrNqC|dkrWt|sWqC|j |}||| 7 }qCt t|g}|durtj	|tj
j| d |d  d|  }|td	ks|td	 ks||krd
}|S )a  Get norm of an iterable of parameters.

    This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and
    added functionality to handle model parallel parameters. Note that
    the gradients are modified in place. Taken from Nvidia Megatron.

    Arguments:
        parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
            single Tensor that will have gradients normalized
        norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
            infinity norm.

    Returns:
        Total norm of the parameters (viewed as a single vector).
        -1 if the norm value is NaN or Inf.
    c                 s   s    | ]
}|j   V  qd S r!   )rg   r   r   r   r   r   r   r     s    z"get_weight_norm.<locals>.<genexpr>Nr   r   r   r   ri   r   r~   )r_   rD   r   rn   r   r   r   r   ro   rs   r   r   r   rd   r   rJ   r   r   rM   rg   r   r   )r   r   ry   r   r   r   rL   r   r   r   r   get_weight_norm  s2   
"r   c                 C   s>   dd | D }t dt|D ]}||  ||d  7  < q|S )zk Compute an inclusive prefix sum.

    Example:
        >>> prefix_sum_inc([3,4,5])
        [3, 7, 12]
    c                 S   s   g | ]}|qS r   r   )rR   wr   r   r   rW   ,  s    z"prefix_sum_inc.<locals>.<listcomp>r   )ranger   )weightsweights_r   r   r   r   prefix_sum_inc%  s   r   c                 C   s   dd l }dg|d  }| |kr!t|d D ]	}t|| ||< q|S | | }| ||  }|d|d | |}t|D ]}||d d   d7  < q:| }|S )Nr   r   )rA   r   r   arangetolist)	num_items	num_partsrA   partsrL   	chunksizeresidualr   r   r   r   partition_uniform2  s   r   c                 C   s  ddl }t| }|}||krt||S ||d |d f|j}||d |d f|j}||d |d f|j}|j|d |d ftd}||d }	|| |	dd< d|d< d|d< td|d D ]\}
tdt	|
|d D ]O}t|
D ]H}t
|||d f |	|
 |	|  }t	|||d f |	|
 |	|  }|| }||
|f |kr|||
|f< |||
|f< |||
|f< |||
|f< qxqrqf|g}ttd|d D ]}
|||d |
f  q|  |S )z
    use dynamic programming solve `The Linear Partition Problem`.
    see https://www8.cs.umu.se/kurser/TDBAfl/VT06/algorithms/BOOK/BOOK2/NODE45.HTM
    r   Nr   rm   )r   r   r~   )rA   r   r   fullr   r   intr   r   r   r   reversedr   reverse)r   r   npnmdp_maxdp_mindp_costposition
prefix_sumr   jr[   max_summin_sumcostr   r   r   r   partition_balancedG  s@   
""r  c                       sf   e Zd Zd fdd	Zee  fddZdd Zddd	Z	d
d Z
dd Zdd Zdd Z  ZS )PartitionedTensorNc                    sp   t    || _tj| jd| _tj| jd| _t|	 | _
|j| _| |\| _| _| | j dk| _d S )Nrj   r   )superr   rk   ro   rp   r   r   r   r`   size	orig_sizerV   orig_device_partition_tensor
local_data	partitionr   
even_split)r   rq   rk   partition_meta	__class__r   r   r   r  s   
zPartitionedTensor.__init__c                 C   s   |j tjksJ ttj|d}| ||d}| }|dd|d   |_|d|d  d  }||_|	 |_
||_|j|d ksDJ |j|d ksMJ |dd  |_|S )Nrj   )rq   rk   r   r   r   )rm   rD   longonesro   rp   r   r	  r
  r   r  rk   r   r   r  )clsmeta
local_partrk   rV   dummypart_objr   r   r   	from_meta}  s   
zPartitionedTensor.from_metac                 C   sX   t | | jd}|| j }|| jd  | }|  djd||d }||fS )N)r   r   r   r~   r   startlength)	r   r   r   r   r   r   r   narrowclone)r   rq   r  r  r  tensor_partr   r   r   r    s
   
"z#PartitionedTensor._partition_tensorc                 C   s   |d u r| j }t|  }tj|g| jj|d}| jr&tj	|| j| j
d n2t| jD ],}| j|d  | j|  }|jd| j| |d}|| jkrO|| j t||| j
 q+||    S )Nrm   rV   rj   r   r   r  )r
  r   	full_sizerD   r   r  rm   r  ro   all_gather_into_tensorrk   r   r   r  r  r   copy_	broadcastr   r  r   )r   rV   
full_numelflat_tensorpart_id	part_sizebufr   r   r   r     s   
zPartitionedTensor.fullc                 C   sX   g }| t| j |t| j7 }| | j | | j || j7 }tj|d	| j
S )a!  Returns a torch.LongTensor that encodes partitioning information.

        Can be used along with ``data()`` to serialize a ``PartitionedTensor`` for
        communication.

        Returns:
            torch.LongTensor: a tensor encoding the meta-information for the partitioning
        )rg   )r   r   r	  r`   r   r   r  rD   
LongTensorr^   r
  )r   r  r   r   r   to_meta  s   	
zPartitionedTensor.to_metac                 C      | j S r!   )r  r4   r   r   r   rg        zPartitionedTensor.datac                 C   s
   | j  S r!   )r  r  r4   r   r   r   
local_size     
zPartitionedTensor.local_sizec                 C   r,  r!   )r	  r4   r   r   r   r!    r-  zPartitionedTensor.full_sizer!   )r   r   r   r   classmethodr   device_namer  r  r   r+  rg   r.  r!  __classcell__r   r   r  r   r  p  s    
r  r~   Fc           
      C   s  t  }|dkr||krd S t   |rt   t   t  }t  }|t }|t	 }|a	|at 
 }t  }	|d }|d }|d }|d }|d }|	d }	td| d| dt   d|dd|dd	|dd
|dd|dd	|	dd d S )Nr~      @zRANK=z	 MEMSTATSzdevice=z current alloc=z0.4fz
GB (delta=zGB max=zGB) current cache=zGB))ro   r   r   synchronizereset_max_memory_cachedreset_max_memory_allocatedmemory_allocatedmemory_cachedmem_alloced
mem_cachedmax_memory_allocatedmax_memory_cachedprintrr   )
msg
print_rank	reset_maxr   new_alloced
new_cacheddelta_alloceddelta_cachedmax_alloced
max_cachedr   r   r   memory_status  sH   






rG  c                   C   s"   t  rt  dksdS t  S Nr   )ro   is_initializedr   r   r7  r   r   r   r   get_ma_status  s   
rJ  c                   C   s   t    t    d S r!   )r   empty_cachereset_peak_memory_statsr   r   r   r   rK    s   
rK  c                 C   s   |sd S t  rt  dksd S t  t|  tdtt 	 d d dtt 
 d d dtt d d dtt d  d	 t }t|j|j d d}td	| d
|j d t   d S )Nr   zMA r3  r   z GB         Max_MA z GB         CA z GB         Max_CA z GB zCPU Virtual Memory:  used = z GB, percent = %)ro   rI  r   gccollectr   r   roundr   r7  r;  torch_memory_reservedtorch_max_memory_reservedpsutilvirtual_memorytotal	availablepercentrL  )messageforcevm_statsused_GBr   r   r   see_memory_usage  s"   

r\  c                 O   s^   |  d}|r|d dd |D 7 }|r|d7 }|r)|d dd | D 7 }|d7 }|S )a  Construct a string representation of a call.

    Args:
        base (str): name of the call
        args (tuple, optional): args to ``base``
        kwargs (dict, optional): kwargs supplied to ``base``

    Returns:
        str: A string representation of base(*args, **kwargs)
    (z, c                 s   s    | ]}t |V  qd S r!   repr)rR   argr   r   r   r   (      zcall_to_str.<locals>.<genexpr>c                 s   s&    | ]\}}| d t | V  qdS )=Nr^  )rR   keyr`  r   r   r   r   ,  s   $ ))joinrc   )baser,   r-   namer   r   r   call_to_str  s   
rh  c                 C   s,   t | }t|dkrtd|  |\}|S )Nr   z0expected there to be only one unique element in )setr   r   )rc   item_setunique_itemr   r   r   get_only_unique_item1  s
   rl        c                 C   s<   |   }|  }||}tjd|tjd}| || d S )Nrm  rl   )isinfisnan
logical_orrD   rq   rn   masked_fill_)inputrV   valnorm_is_infnorm_is_nan
inf_or_nanerrr   r   r    mask_nan_or_inf_with_val_inplace:  s
   
rx  c                 C   s$  t | tsJ dt|  tdd | D sJ dt|}g }|tkr| D ]}||j 	   q't
|	 }|t  }|durr|du sSt dkr`tj|tjj| d nt|dkrrtj|tjjt|d |durtj|tjj|d || d j}nd	tvsttd	 t| krd
d | D td	< td	 }	dd }
|rtd|
| |	| n|
| |	| |	d   }|dur|du st dkrtj|tjj| d nt|dkrtj|tjjt|d |durtj|tjj|d || d jd| }t||jd |S )a  Get norm of an iterable of tensors.

    This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and
    added functionality to handle model parallel parameters. Taken from Nvidia Megatron.

    Arguments:
        input_tensors (Iterable[Tensor]): an iterable of Tensors will have norm computed
        norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
            infinity norm.

    Returns:
        Total norm of the tensors (viewed as a single vector).
    zexpected Iterable type not c                 S   s   g | ]}t |qS r   )rD   	is_tensorrR   tr   r   r   rW   Q  s    z.get_global_norm_of_tensors.<locals>.<listcomp>zexpected list of only tensorsNr   r   r   norm_tensors_compute_bufferc                 S   s$   g | ]}t jg t jt  d qS )r   )rD   emptyrn   r   rr   rz  r   r   r   rW   k  s    c                 S   sT   t | D ]#\}}|| j|j ||  |dkr'|d j|| j qd S rH  )r   rg   r#  rn   r   add_)tensor_list_compute_buffer
_norm_typer   r{  r   r   r   _norm_tensorsq  s    z1get_global_norm_of_tensors.<locals>._norm_tensorsFri   r   ) r_   r   typeallrn   r   r   rg   r   r   rD   r   r^   r   rr   r   %_get_expert_model_parallel_world_sizero   rs   r   r   r   r   r   rV   r"   r   r/   r   r   r   rx  )input_tensorsr   ry   	use_graphmoe_ep_groupr   r{  r   device_total_normcompute_bufferr  r   r   r   get_global_norm_of_tensorsB  sR   

r  ri   r   c           
      C   s   |du rt | ||d}|||  }|dk rS|rGdd }dtvr/tj|tjdt  td< td }|tj|tjd t	d|| | |S | D ]	}	|	
 | qI|S )	as  Clip list of tensors by global norm.
    Args:
        input_tensors: List of tensors to be clipped
        global_norm (float, optional): Precomputed norm. Defaults to None.
        mpu (optional): model parallelism unit. Defaults to None.
        eps (float, optional): epsilon value added to grad norm. Defaults to 1e-6
    Returns:
        float: the global norm
    N)ry   r  r   c                 S   s   | D ]	}|  | qd S r!   )r   r   )_tensor_list_clip_coef_tensorr{  r   r   r   clip_tensors  s   z1clip_tensors_by_global_norm.<locals>.clip_tensorsclip_coef_tensorr   F)r  r"   rD   rq   float32r^   r   r1  r#  r/   r   r   )
r  r   global_normry   epsr  r   r  r  r{  r   r   r   clip_tensors_by_global_norm  s&   
r  c                 C   sX   t dd | D }|| }|r(|| }tj|| d j| d jd}| |g }|S | }|S )Nc                 s   s    | ]}|  V  qd S r!   )r   rz  r   r   r   r     ra  z&align_dense_tensors.<locals>.<genexpr>r   rl   )r   rD   r   rV   rm   )r  	alignmentnum_elements	remainingelements_to_add
pad_tensorpadded_tensor_listr   r   r   align_dense_tensors  s   
r  c                 C   s`   t t| |D ]&\}\}}tj|| d}tj|| d}|dkr"qt||| ||  qd S )Nrj   r   )r   zipro   r   rp   r"  )groups_flatpartitioned_param_groupsdp_process_groupgroup_id
group_flatpartitioned_paramspartition_iddp_world_sizer   r   r    all_gather_into_tensor_dp_groups  s   r  c                 C   s   t  r
t| ||S t|D ]\}}t j|| d}t j|| d}|dkr'qtd||  | | }	||  |	 }
|
|
|  }
|
}|
|	 ||  ksPJ t|	D ]8}||	d krf||  ||
  }g }t|D ]}|| 	d||
 |
 }|| qlt ||| ||  qTqd S )Nrj   r   r   )ro   has_all_gather_into_tensorr  r   r   rp   r   r   r   r  r   r   
all_gather)r  r  r  start_alignment_factorallgather_bucket_sizer  r  r  r  
num_shards
shard_sizer  shard_id
shard_listdp_id
curr_shardr   r   r   all_gather_dp_groups  s,   r  c                       s6   e Zd Zd
 fdd	Zdd Zdd Zdd	 Z  ZS )TLinear c                    sh   || _ t j|jjd |jjd |jd ud t|jj| j_|j| _| jd ur.| j| _
d S | j	| _
d S )Nr   r   bias)rg  r  r   weightr   r  r   rg   _fwd_bias_add_fwd	_fwd_func)r   
orig_layerrg  r  r   r   r     s
   ("zTLinear.__init__c                 C   s   t || jS r!   )Flinearr  r   rr  r   r   r   r    s   zTLinear._fwdc                 C   s   t j|| j| jdS )Nr  )r  r  r  r  r  r   r   r   r       zTLinear._fwd_bias_addc                 C   s
   |  |S r!   )r  r  r   r   r   forward  r/  zTLinear.forward)r  )r   r   r   r   r  r  r  r2  r   r   r  r   r    s
    r  c                    s   ddl m   fdd| D S )Nr   ZeroParamStatusc                    s&   g | ]}t |d r|j jkr|qS )ds_id)rJ   	ds_statusNOT_AVAILABLE)rR   r}   r  r   r   rW     s    z'get_inactive_params.<locals>.<listcomp>)+deepspeed.runtime.zero.partition_parametersr  )
param_listr   r  r   get_inactive_params  s   r  c           	   	      s   dd  | g}|  D ]\}}t|||dt|d}|| qt fdd|D }|d r6dS |t	krB|
  }|S || }| d|  }|td	ks^|td	 kr`d}|S )
au   Compute the global norm with MoE experts

    Inputs:
    non_expert_norm (float) : the calculated norm of the non-expert params
    expert_tensors (Dict[ep_name, List[Tensor]): Dictionary of expert group name to list of grad tensors
    norm_type (int): the norm to use

    Returns:
        if norm is (-/+) inf, returns -1
        otherwise the global norm (float)
    c                 S   s   t  t|  S r!   )r   r   rn   r   )rS   r   r   r   	to_tensor  r  z+get_norm_with_moe_layers.<locals>.to_tensorF)r  ry   r   r  r  c                    s   g | ]} |qS r   r   )rR   r   r  r   r   rW   %  s    z,get_norm_with_moe_layers.<locals>.<listcomp>r~   ri   r   )rc   r  r   _get_expert_parallel_groupr   rD   r   eqanyr   r   rd   r   r   rn   )	non_expert_normry   expert_tensorsr   group_normsexp_nametensors
group_normr   r   r  r   get_norm_with_moe_layers  s*   r  c                 C   s
   |  dS )N_offload_bufferr   )rc  r   r   r   _make_offload_state_key5  r/  r  
pin_memorynon_blockingc                    sL    fdd}| j  D ]\}}d|v r||d d|v r#||d qdS )^Move optimizer states to device. Note that this assumes the state structure of DeepSpeed Adam.c                    sf   t |}|| vrtj| |  d| |< rt | | | |< | | j| | d | | | | _d S )Nr   r  )r  rD   
empty_liker   r  r#  rg   )staterc  offload_buf_keyrV   r  r  r   r   move_key<  s   z%offload_adam_states.<locals>.move_keyexp_avg
exp_avg_sqNr  rc   )	optimizerrV   r  r  r  _r  r   r  r   offload_adam_states9  s   	

r  c                    sJ    fdd}| j  D ]\}}d|v r||d d|v r"||d qdS )r  c                    s    | t | j d| | _d S )Nr  )r  r^   rg   )r  rc  rV   r  r   r   move_back_keyO  s    z)reload_adam_states.<locals>.move_back_keyr  r  Nr  )r  rV   r  r  r  r  r   r  r   reload_adam_statesL  s   

r  inputs1inputs2c                 C   sT  t | t |kr
dS t| trWt|trWt| t|krdS t| |D ]1\}}t|tjrMt|tjrM|t 	 }|t 	 }t
||sL dS q#||krT dS q#dS t| trt|tr|  | krkdS | D ]8}| | || }}t|tjrt|tjr|t 	 }|t 	 }t
||s dS qm||kr dS qmdS dS )a5  
    Compare two lists or dictionaries for equality, including any tensors they may contain.

    Args:
        inputs1: First input, either a list or a dictionary.
        inputs2: Second input, either a list or a dictionary.

    Returns:
        True if inputs1 and inputs2 are equal; False otherwise.
    FT)r  r_   r`   r   r  rD   r   r^   r   current_deviceequalrb   keys)r  r  val1val2rc  r   r   r   compare_tensors_in_structuresY  s@   r  )r   N)r   NNr!   )r~   F)F)Nrm  )r   NFN)ri   NNr   F)r   )FF)Zr    collections.abcr   r;   rS  rN  r   r   rA   r   rD   torch.nnr   r  
torch._sixr   ModuleNotFoundErrortypingr   r	   r
   r{   r   ro   deepspeed.moe.utilsr   deepspeed.utilsr   r   deepspeed.utils.bwcr   r   r   deepspeed.runtime.constantsr   deepspeed.acceleratorr   deepspeed.module_inject.policyr   memory_reservedrQ  max_memory_reservedrR  r   r"   r/   r0   objectr2   r@   rF   r   rM   rP   rf   rw   rx   r   r   r   r   r   r   r   r   r  r  r9  r:  rG  rJ  rK  r\  rh  rl  rx  r  r  r  r  r  nnLinearr  r  r  r  r  r  r  r   r   r   r   <module>   s   

p


H
B
(8)]
'

	

O"&
**