from collections import OrderedDict
import torch
import sys
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

from deepspeed import comm as dist
from deepspeed.runtime.constants import PIPE_REPLICATED
from deepspeed.runtime.base_optimizer import ZeROOptimizer
from packaging import version as pkg_version
from deepspeed.git_version_info import version
from deepspeed.runtime.utils import (get_global_norm_of_tensors, clip_tensors_by_global_norm, DummyOptim,
                                     align_dense_tensors, all_gather_dp_groups, is_model_parallel_parameter,
                                     see_memory_usage, graph_process, get_norm_with_moe_layers)
from deepspeed.utils import link_hp_params, lazy_init_hp_params_optimizer_state, fragment_address, groups
from deepspeed.moe.utils import is_moe_param, is_moe_param_group
from deepspeed.utils.bwc import bwc_tensor_model_parallel_rank
from deepspeed.utils.torch import register_grad_hook
from deepspeed.checkpoint import enable_universal_checkpoint
from deepspeed.checkpoint.constants import (DS_VERSION, PARTITION_COUNT, BASE_OPTIMIZER_STATE,
                                            SINGLE_PARTITION_OF_FP32_GROUPS, CLIP_GRAD, GROUP_PADDINGS,
                                            PARAM_SLICE_MAPPINGS)

setattr(sys.modules[__name__], 'fragment_address', fragment_address)


def print_rank_0(message, debug=False, force=False):
    if dist.get_rank() == 0 and (debug or force):
        print(message)

 Zdd Zdd Zdd Zdd Zdd Z	dd Z
dd Zdd Zdd Ze dJddZdKdd Ze d!d" Ze dLd#d$Ze dLd%d&Ze d'd( Ze dLd)d*Ze d+d, Zd-d. ZdLd/d0ZdMd1d2Zd3d4 Zd5d6 Zd7d8 Z					dNd9d:ZdOd;d<Zd=d> Z d?d@ Z!e"dAdB Z#e"dCdD Z$dEdF Z%dGdH Z&  Z'S )PBF16_OptimizerN               rT FTc                    sF  t    tddd || _|| _|| _t| jt | _|	t	j
t	jfv s+J d|	 |	| _|| _|| _|| _|| _t|| _ | _tj| jd| _|| _g | _ fddtt| jjD | _| jrg|   t| _t | _!d| _"g | _#g | _$g | _%g | _&g | _'i | _(g | _)g | _*g | _+g | _,g | _-|
| _.| jr| /  td	dd d S )
Nzbegin bf16_optimizerTr(   z<BF16Optimizer: Unsupported gradient accumulation data type: groupc                    s   g | ]} qS r)   r)   ).0idp_process_groupr)   r*   
<listcomp>F   s    z+BF16_Optimizer.__init__.<locals>.<listcomp>r.   zend bf16_ optimizer)0super__init__r   timers	optimizerparam_names
isinstancer   using_real_optimizertorchfloat32bfloat16grad_acc_dtypeimmediate_grad_update	clip_grad	norm_typempuintallgather_bucket_sizer6   r#   r$   dp_rankhas_moe_layersnon_expert_gradientsrangelenparam_groupsreal_dp_process_group_configure_moe_settingsr   flattenr   	unflattennccl_start_alignment_factorbf16_groupsbf16_groups_flatbf16_partitioned_groupsfp32_groups_flat_partitionfp32_groups_gradientsfp32_groups_gradient_dictfp32_groups_gradients_flat!fp32_groups_actual_gradients_flat#fp32_groups_gradient_flat_partitionfp32_groups_has_gradientsgroup_paddingsgraph_harvesting_setup_for_real_optimizer)selfinit_optimizerr<   rF   rD   rE   rH   r6   r:   rB   r_   rC   rJ   	__class__r5   r*   r9   %   sP   

 zBF16_Optimizer.__init__c                 C   sX   t | jjD ]\}}| j| D ]}t|dd rd |_qq| jD ]}|  qtd d S )N_hp_mappingzRemoved grad acc hooks)		enumerater;   rN   rT   getattrre   _grad_acc_hooksremover+   )ra   r4   _phookr)   r)   r*   destroyg   s   

zBF16_Optimizer.destroyc                 C   s   t dd | jjD sJ dt| jjD ]!\}}t|r6tdd |d D s,J dt|d | j|< qi | _	| j
rKt  D ]	}g | j	|< qCd S d S )Nc                 S      g | ]}t |qS r)   )r   )r3   r2   r)   r)   r*   r7   r       z:BF16_Optimizer._configure_moe_settings.<locals>.<listcomp>zThe model has moe layers, but None of the param groups are marked as MoE. Create a param group with 'moe' key set to True before creating optimizerc                 S   rn   r)   )r   r3   paramr)   r)   r*   r7   w   s    paramsz*All params in MoE group must be MoE paramsname)anyr;   rN   rf   r   allr   _get_expert_data_parallel_grouprO   expert_gradientsrJ   $_get_expert_data_parallel_group_dictkeys)ra   r4   r2   keyr)   r)   r*   rP   p   s(   
z&BF16_Optimizer._configure_moe_settingsc              	      s  dd j D _tjjD ]&\ }tjj   d}td  dd tjj   d}dd |d D }j	
| j
j	  j|  jj	  j  d	 j   |  fd
dt|D }j
| j
||     dj  _dd j	  D }tjj  jd}j
| jrt|rj|d  
| nj
| jj  |d}j 
| |j! < t"|}	j#
t$j  dd|	 j%
t$j  d|  j&
dgt'j	    |tjj   dd krj   |	 }
nd}
j(
|
 j  g|d< td  dd qg _)j*rB+  ,  d_-.  / _0d S )Nc                 S   s   g | ]}t j|d qS )r1   )r#   get_world_size)r3   pgr)   r)   r*   r7      s    z<BF16_Optimizer._setup_for_real_optimizer.<locals>.<listcomp>r1   zbefore initializing group Tr0   c                 S   s   g | ]}|j r|qS r)   )requires_gradrp   r)   r)   r*   r7      s    rr   )tensor_listflat_tensorc                    s$   g | ]}j   d | qS )r   )rU   narrow)r3   dp_indexr4   partition_sizera   r)   r*   r7      s    c                 S   s   g | ]}|  qS r)   )numel)r3   tr)   r)   r*   r7      ro   )dtypers   )r   num_elem_listr   F   zafter initializing group )1rO   partition_countrf   r;   rN   r#   r{   r   r$   rT   appendrU   _flatten_dense_tensors_alignedrS   #_update_storage_to_flattened_tensorr   rL   rV   rW   clonefloatdetachr}   r?   
zeros_likerB   rZ   rJ   r   rw   rK   _split_flat_tensorrX   rY   sumr[   r   r\   r]   rM   r^   rh   rC   create_grad_acc_hooks_link_all_hp_params_hp_optimizer_states_linked_enable_universal_checkpoint_create_param_mapping_param_slice_mappings)ra   param_groupreal_dp_world_sizepartition_idtrainable_parametersbf16_dp_partitionsr   fp32_flat_bufferfp32_gradientslength_without_paddingpaddingr)   r   r*   r`      sl   
z(BF16_Optimizer._setup_for_real_optimizerc                 C   s   | j D ]}t|d qd S )N)
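    # Worked example of the partitioning arithmetic above (illustrative only, with made-up
    # sizes): two trainable bf16 params of 5 and 3 elements, data-parallel world size 2, and
    # nccl_start_alignment_factor 2 give an alignment unit of 2 * 2 = 4 elements. The flat
    # buffer of 8 elements is already a multiple of 4, so no padding is appended; each rank
    # then owns a partition_size of 8 // 2 = 4 elements, and any padding that is appended for
    # other sizes always lands at the tail, i.e. inside the last rank's partition.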
    def _enable_universal_checkpoint(self):
        for lp_param_group in self.bf16_groups:
            enable_universal_checkpoint(param_list=lp_param_group)

    def _create_param_mapping(self):
        param_mapping = []
        for i, _ in enumerate(self.optimizer.param_groups):
            param_mapping_per_group = OrderedDict()
            for lp in self.bf16_groups[i]:
                if lp._hp_mapping is not None:
                    lp_name = self.param_names[lp]
                    param_mapping_per_group[lp_name] = lp._hp_mapping.get_hp_fragment_address()
            param_mapping.append(param_mapping_per_group)
        return param_mapping

    def _link_all_hp_params(self):
        for i, _ in enumerate(self.optimizer.param_groups):
            real_dp_world_size = dist.get_world_size(group=self.real_dp_process_group[i])
            partition_id = dist.get_rank(group=self.real_dp_process_group[i])
            partition_size = self.bf16_groups_flat[i].numel() // real_dp_world_size
            flat_hp_partition = self.fp32_groups_flat_partition[i]
            link_hp_params(lp_param_list=self.bf16_groups[i],
                           flat_hp_partition=flat_hp_partition,
                           gradient_dict=self.fp32_groups_gradient_dict,
                           offload_gradient_dict=None,
                           use_offload=False,
                           param_group_index=i,
                           partition_start=partition_id * partition_size,
                           partition_size=partition_size,
                           dp_group=self.real_dp_process_group[i])

    def _lazy_init_hp_params_optimizer_state(self):
        if not self._hp_optimizer_states_linked:
            for i, _ in enumerate(self.optimizer.param_groups):
                lazy_init_hp_params_optimizer_state(self.bf16_groups[i], self.fp32_groups_flat_partition[i],
                                                    self.optimizer.state)
            self._hp_optimizer_states_linked = True

    def _split_flat_tensor(self, flat_tensor, num_elem_list):
        assert sum(num_elem_list) <= flat_tensor.numel()
        tensor_list = []
        offset = 0
        for num_elem in num_elem_list:
            dense_tensor = torch.narrow(flat_tensor, 0, offset, num_elem)
            tensor_list.append(dense_tensor)
            offset += num_elem
        return tensor_list
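    # Note on the gradient views created by _split_flat_tensor(): every element of
    # self.fp32_groups_gradients[i] is a narrow() view into the single flat buffer
    # self.fp32_groups_gradients_flat[i], so accumulating into a per-param view (as
    # _update_hp_grad() does below) also fills the flat gradient partition that step()
    # hands to the wrapped optimizer; no extra copies are made.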
    def _update_storage_to_flattened_tensor(self, tensor_list, flat_tensor):
        updated_params = self.unflatten(flat_tensor, tensor_list)
        for p, q in zip(tensor_list, updated_params):
            p.data = q.data

    def _flatten_dense_tensors_aligned(self, tensor_list, alignment):
        return self.flatten(align_dense_tensors(tensor_list, alignment))

    @torch.no_grad()
    def step(self, closure=None):
        if closure is not None:
            raise NotImplementedError(f'{self.__class__} does not support closure.')

        non_expert_grads_for_norm, expert_grads_for_norm = self.get_grads_for_norm()
        non_expert_groups_norm = get_global_norm_of_tensors(input_tensors=non_expert_grads_for_norm,
                                                            mpu=self.mpu,
                                                            norm_type=self.norm_type,
                                                            use_graph=self.graph_harvesting)
        all_groups_norm = non_expert_groups_norm
        if self.has_moe_layers:
            all_groups_norm = get_norm_with_moe_layers(non_expert_groups_norm,
                                                       mpu=self.mpu,
                                                       expert_tensors=expert_grads_for_norm,
                                                       norm_type=self.norm_type)

        self._global_grad_norm = all_groups_norm

        assert all_groups_norm > 0.
        if self.clip_grad > 0.:
            clip_tensors_by_global_norm(input_tensors=self.get_grads_for_norm(for_clipping=True),
                                        max_norm=self.clip_grad,
                                        global_norm=all_groups_norm,
                                        mpu=self.mpu,
                                        use_graph=self.graph_harvesting)

        for param_partition, grad_partition in zip(self.fp32_groups_flat_partition,
                                                   self.fp32_groups_gradient_flat_partition):
            # if the grad acc dtype differs from fp32, cast to the master weight dtype
            param_partition.grad = grad_partition.to(
                param_partition.dtype) if grad_partition.dtype != param_partition.dtype else grad_partition

        self.optimizer.step()

        if self.grad_acc_dtype is not torch.float32:
            for param_partition in self.fp32_groups_flat_partition:
                param_partition.grad = None

        # optimizer state only exists after the first step(), so link it lazily here
        self._lazy_init_hp_params_optimizer_state()

        self.update_lp_params()
        self.clear_hp_grads()

    def backward(self, loss, retain_graph=False, update_hp_grads=True, clear_lp_grads=False, **bwd_kwargs):
        """Perform a backward pass and copy the low-precision gradients to the
        high-precision copy.

        We copy/accumulate to the high-precision grads now to prevent accumulating in the
        bf16 grads after successive backward() calls (i.e., grad accumulation steps > 1)

        The low-precision grads are deallocated during this procedure.
        """
        self.clear_lp_grads()
        loss.backward(retain_graph=retain_graph, **bwd_kwargs)

        if update_hp_grads:
            self.update_hp_grads(clear_lp_grads=clear_lp_grads)
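    # Typical call pattern (sketch only; the actual driver is the DeepSpeed engine, not this file):
    #   for micro_batch in data_loader:
    #       loss = model(micro_batch)
    #       optimizer.backward(loss)   # accumulate bf16 grads into the fp32 grad buffers
    #   optimizer.step()               # clip, update fp32 masters, re-publish bf16 params
    #   optimizer.zero_grad()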
    @torch.no_grad()
    def _update_hp_grad(self, lp, group_idx, param_idx, clear_lp_grads):
        if lp.grad is None:
            return

        hp_grad = self.fp32_groups_gradients[group_idx][param_idx]
        assert hp_grad is not None, \
            f'high precision param has no gradient, lp param_id = {id(lp)} group_info = [{group_idx}][{param_idx}]'

        hp_grad.data.add_(lp.grad.data.to(hp_grad.dtype).view(hp_grad.shape))
        lp._hp_grad = hp_grad
        self.fp32_groups_has_gradients[group_idx][param_idx] = True

        # clear gradients
        if clear_lp_grads:
            lp.grad.zero_()

    @torch.no_grad()
    def _update_hp_grads_func(self, clear_lp_grads=False):
        for i, group in enumerate(self.bf16_groups):
            for j, lp in enumerate(group):
                self._update_hp_grad(lp, i, j, clear_lp_grads)

    @torch.no_grad()
    def update_hp_grads(self, clear_lp_grads=False):
        if self.immediate_grad_update:
            return

        if self.graph_harvesting:
            graph_process(False, self._update_hp_grads_func, clear_lp_grads)
        else:
            self._update_hp_grads_func(clear_lp_grads)

        # cpu-side bookkeeping of which grads were produced
        for i, group in enumerate(self.bf16_groups):
            for j, lp in enumerate(group):
                if lp.grad is None:
                    continue
                self.fp32_groups_has_gradients[i][j] = True

    @torch.no_grad()
    def get_grads_for_reduction(self):
        if self.has_moe_layers:
            return self.non_expert_gradients, self.expert_gradients
        return self.non_expert_gradients, {}

    @torch.no_grad()
    def get_grads_for_norm(self, for_clipping=False):
        """
        Returns:
            tuple[list[Tensor], dict[ep_name, List[Tensor]] | list:
            If for_clipping, return all gradients.
            Otherwise, separate and return dict of expert_grad and list of non_expert_grad
        """
        expert_grads_for_norm = {}

        # pipeline parallelism may replicate parameters; avoid counting a norm twice
        non_expert_grads_for_norm = []
        all_grads_for_clip = []
        tensor_mp_rank = bwc_tensor_model_parallel_rank(mpu=self.mpu)
        assert len(self.bf16_groups) == len(self.optimizer.param_groups)
        for i, group in enumerate(self.bf16_groups):
            for j, lp in enumerate(group):
                if not for_clipping:
                    if hasattr(lp, PIPE_REPLICATED) and lp.ds_pipe_replicated:
                        continue

                    # skip duplicated parameters; compute the norm only on tp_rank 0 unless the
                    # parameter is model-parallel or an expert parameter
                    if not (tensor_mp_rank == 0 or is_model_parallel_parameter(lp) or is_moe_param(lp)):
                        continue

                if not self.fp32_groups_has_gradients[i][j]:
                    continue
                if not for_clipping:
                    param_group = self.optimizer.param_groups[i]
                    if self.has_moe_layers and is_moe_param_group(param_group):
                        if param_group['name'] not in expert_grads_for_norm:
                            expert_grads_for_norm[param_group['name']] = []
                        expert_grads_for_norm[param_group['name']].append(self.fp32_groups_gradients[i][j])
                    else:
                        non_expert_grads_for_norm.append(self.fp32_groups_gradients[i][j])
                else:
                    all_grads_for_clip.append(self.fp32_groups_gradients[i][j])

        if not for_clipping:
            return non_expert_grads_for_norm, expert_grads_for_norm
        return all_grads_for_clip
    @torch.no_grad()
    def update_lp_params(self):
        for i, (bf16_partitions,
                fp32_partition) in enumerate(zip(self.bf16_partitioned_groups, self.fp32_groups_flat_partition)):
            partition_id = dist.get_rank(group=self.real_dp_process_group[i])
            bf16_partitions[partition_id].data.copy_(fp32_partition.data)

        all_gather_dp_groups(groups_flat=self.bf16_groups_flat,
                             partitioned_param_groups=self.bf16_partitioned_groups,
                             dp_process_group=self.real_dp_process_group,
                             start_alignment_factor=self.nccl_start_alignment_factor,
                             allgather_bucket_size=self.allgather_bucket_size)

    def clear_hp_grads(self):
        for flat_gradients in self.fp32_groups_gradients_flat:
            flat_gradients.zero_()

        for i, group in enumerate(self.fp32_groups_gradients):
            self.fp32_groups_has_gradients[i] = [False] * len(group)

    def clear_lp_grads(self, set_to_none=False):
        assert not (self.graph_harvesting and set_to_none), \
            "graph harvesting is incompatible with setting lp grads to None"

        zero_grads_list = []
        for group in self.bf16_groups:
            for param in group:
                if set_to_none:
                    param.grad = None
                elif param.grad is not None:
                    if param.grad.grad_fn is not None:
                        param.grad.detach_()
                    zero_grads_list.append(param.grad)
        if not set_to_none and len(zero_grads_list) > 0:
            torch._foreach_zero_(zero_grads_list)

    def zero_grad(self, set_to_none=True):
        self.clear_lp_grads(set_to_none)
        self.clear_hp_grads()

    def state_dict(self):
        state_dict = {}
        state_dict[CLIP_GRAD] = self.clip_grad
        state_dict[BASE_OPTIMIZER_STATE] = self.optimizer.state_dict()
        state_dict[SINGLE_PARTITION_OF_FP32_GROUPS] = self.fp32_groups_flat_partition
        state_dict[GROUP_PADDINGS] = self.group_paddings
        state_dict[PARTITION_COUNT] = self.partition_count
        state_dict[DS_VERSION] = version
        state_dict[PARAM_SLICE_MAPPINGS] = self._param_slice_mappings

        return state_dict

    # Overwrite this rank's fp32 master partition with the current bf16 partition values
    def _restore_from_bit16_weights(self):
        for i, (bf16_partitions,
                fp32_partition) in enumerate(zip(self.bf16_partitioned_groups, self.fp32_groups_flat_partition)):
            partition_id = dist.get_rank(group=self.real_dp_process_group[i])
            fp32_partition.data.copy_(bf16_partitions[partition_id].data)

    def refresh_fp32_params(self):
        self._restore_from_bit16_weights()

    def load_state_dict(self,
                        state_dict_list,
                        checkpoint_folder=None,
                        load_optimizer_states=True,
                        load_from_fp32_weights=False,
                        load_serial=None,
                        param_shapes=None):
        if checkpoint_folder:
            self._load_universal_checkpoint(checkpoint_folder, load_optimizer_states, load_from_fp32_weights)
        else:
            self._load_legacy_checkpoint(state_dict_list, load_optimizer_states, load_from_fp32_weights)

    def _load_legacy_checkpoint(self, state_dict_list, load_optimizer_states=True, load_from_fp32_weights=False):
        dp_rank = dist.get_rank(group=self.dp_process_group)
        current_rank_sd = state_dict_list[dp_rank]

        ckpt_version = current_rank_sd.get(DS_VERSION, False)
        assert ckpt_version, "Empty ds_version in checkpoint, not clear how to proceed"
        ckpt_version = pkg_version.parse(ckpt_version)

        self.clip_grad = current_rank_sd.get(CLIP_GRAD, self.clip_grad)

        if load_optimizer_states:
            print_rank_0("_load_legacy_checkpoint current_rank_sd[BASE_OPTIMIZER_STATE]")
            self.optimizer.load_state_dict(current_rank_sd[BASE_OPTIMIZER_STATE])

        if load_from_fp32_weights:
            for current, saved in zip(self.fp32_groups_flat_partition,
                                      current_rank_sd[SINGLE_PARTITION_OF_FP32_GROUPS]):
                src_tensor = _get_padded_tensor(saved, current.numel())
                current.data.copy_(src_tensor.data)

        if load_optimizer_states:
            self._link_all_hp_params()

    def _load_universal_checkpoint(self, checkpoint_folder, load_optimizer_states, load_from_fp32_weights):
        self.load_hp_checkpoint_state_from_checkpoint_dir("bf16_groups", checkpoint_folder)

    def _load_global_state(self, sd):
        pass

    @property
    def param_groups(self):
        """Forward the wrapped optimizer's parameters."""
        return self.optimizer.param_groups

    @property
    def state(self):
        """Forward the wrapped optimizer's states."""
        return self.optimizer.state

    def accumulate_hp_grads_and_remove_lp(self, lp_param, group_idx, param_idx):
        assert self.immediate_grad_update
        self._update_hp_grad(lp_param, group_idx, param_idx, clear_lp_grads=False)

    def create_grad_acc_hooks(self):
        for i, param_group in enumerate(self.bf16_groups):
            for j, param in enumerate(param_group):
                if param.requires_grad:

                    def wrapper(param, i, j):

                        def accumulate_hp_grads_and_remove_lp(*notneeded):
                            self.accumulate_hp_grads_and_remove_lp(param, i, j)

                        self._grad_acc_hooks.append(register_grad_hook(param, accumulate_hp_grads_and_remove_lp))

                    wrapper(param, i, j)


def _get_padded_tensor(src_tensor, size):
    if src_tensor.numel() >= size:
        return src_tensor
    padded_tensor = torch.zeros(size, dtype=src_tensor.dtype, device=src_tensor.device)
    slice_tensor = torch.narrow(padded_tensor, 0, 0, src_tensor.numel())
    slice_tensor.data.copy_(src_tensor.data)
    return padded_tensor
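
# Minimal usage sketch (not part of the DeepSpeed module; illustrative only). It exercises the
# standalone _get_padded_tensor() helper defined above, which is the one piece here that does
# not require an initialized distributed backend or a wrapped optimizer.
if __name__ == "__main__":
    _src = torch.arange(6, dtype=torch.float32)
    _padded = _get_padded_tensor(_src, 8)
    # the 6-element checkpoint slice is zero-padded up to the 8-element local partition size
    assert _padded.numel() == 8
    assert torch.equal(_padded[:6], _src) and _padded[6:].eq(0).all()
    print("_get_padded_tensor pads 6 -> 8 elements with trailing zeros")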