o
    Tif                     @   s   d dl Z d dlZd dlmZ d dlmZmZ d dlmZ d dl	m
Z
mZ d dlmZ d dlmZ d dlT d d	lmZmZmZ d d
lmZ d dlmZ e add ZG dd deZdd ZG dd deZdS )    N)OrderedDict)z3_leaf_moduleset_z3_leaf_module)see_memory_usage)apply_to_tensors_onlyis_zero_param)OffloadDeviceEnum)_init_external_params)*)PartitionedParameterCoordinatorInflightParamRegistryiter_params)get_accelerator)utilsc                 C   sh   t |tu rg }|D ]}t| |||}|| q
t|S t |tju r2|| |jr0|| |S |S N)typetuple+_apply_forward_and_backward_to_tensors_onlyappendtorchTensorrequires_gradregister_hook)moduleforward_functionbackward_functionoutputstouched_outputsoutputtouched_output r    \/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/runtime/zero/parameter_offload.pyr      s   
r   c                       s4   e Zd Z fddZ fddZ fddZ  ZS )ZeROOrderedDictc                    s"   t  j|i | || _d| _dS )zA replacement for ``collections.OrderedDict`` to detect external ZeRO params.

        Args:
            parent_module (``collections.OrderedDict``): the collection to replace
        FN)super__init___parent_module_in_forward)selfparent_moduleargskwargs	__class__r    r!   r$   *   s   
zZeROOrderedDict.__init__c                    s$   t   ^}}}|| jfft| S r   )r#   
__reduce__r%   r   )r'   r0_r2r+   r    r!   r-   5   s   zZeROOrderedDict.__reduce__c                    sl   t  |}|d u r|S t|dr4|jtjkr4| jjjr4t	t
d | |  td| d|j dd |S )N	ds_statusz+Registering external parameter from getter z	 ds_id = Fforce)r#   __getitem__hasattrr1   ZeroParamStatusNOT_AVAILABLEr%   _parametersr&   register_external_parameterFWD_MODULE_STACK
all_gatherprint_rank_0ds_id)r'   keyparamr+   r    r!   r5   9   s   
zZeROOrderedDict.__getitem__)__name__
__module____qualname__r$   r-   r5   __classcell__r    r    r+   r!   r"   (   s    r"   c                 C   sV   |   D ]$} | j| _|tkr|| d}n| }| j D ]\}}|||< q|| _qd S )N)r(   )modulesr9   _original_parametersr"   items)r   cls	new_paramr?   r@   r    r    r!   _inject_parametersJ   s   
rJ   c                       s   e Zd Zdddddejddddddddf fdd		Zed
d Zdd Zdd Z	dd Z
dd Zdd Zdd Zdd ZdgfddZe dd Ze dd Ze d d! Ze d"d# Zd$d% Zd&d' Zd(d) Z  ZS )*DeepSpeedZeRoOffloadTii ʚ;i NFr   c                    s
  t ddd td j dt  dd || _|| _t| d j| _|
| _	d | _
d| _|| _|| _|| _|| _|d urK|jtjkrK|j| _
|j| _| ||| | D ]}t| qVt|t t|| _t|	| _| | j| j| _t|| _t|| _t|| _ t! " rd n|rt! # nt! $ | _%t&|dst' |_(|j(| _)d| _*|dkrt+j,| _-d | _.t/ | _0g | _1| 2|| d| _*t3| j| j| j | j%| j)| j
tj4k| j| j| j| j*| jd	| _5g | _6g | _7| 8  td
t9| j6 dt9| j7 dd t ddd d S )Nz'DeepSpeedZeRoOffload initialize [begin]Tr3   zinitialized z with args: Fr   ds_inflight_param_registry)prefetch_bucket_szmax_reuse_distance_in_numel!max_available_parameters_in_numelallgather_streaminflight_param_registryprefetch_nvmetimerszero_quantized_weights#zero_quantized_nontrainable_weightsfast_sharding_for_leaf_modulelog_trace_cache_warningsz Created module hooks: forward = , backward = z%DeepSpeedZeRoOffload initialize [end]):r   r=   rA   localsr   rS   list
parametersdtypedp_process_groupoffload_deviceoffload_param_pin_memoryzero_param_parallel_grouprT   rU   rW   devicer   none
pin_memory_convert_to_zero_parametersrE   r	   rJ   r"   int!param_numel_persistence_thresholdmodel_persistence_thresholdmark_persistent_parameterspersistent_parameters_prefetch_bucket_sz_max_reuse_distance_in_numel"_max_available_parameters_in_numelr   is_synchronized_deviceStreamdefault_stream'_DeepSpeedZeRoOffload__allgather_streamr6   r   rL   ._DeepSpeedZeRoOffload__inflight_param_registryrV   sysmaxsizemin_granularity_valuemin_granularity_layersetgranularity_infoz3_leaf_layers!_set_z3_leaf_modules_by_thresholdr   nvmeparam_coordinatorforward_hooksbackward_hookssetup_zero_stage3_hookslen)r'   r   rS   	ds_configoverlap_commprefetch_bucket_sizemax_reuse_distancemax_live_parametersparam_persistence_thresholdrg   r]   offload_param_configmpur`   rT   rU   !zero_module_granularity_thresholdrW   mr+   r    r!   r$   [   s   








zDeepSpeedZeRoOffload.__init__c                 C   sF   |   | j t| jddD ]}|jtjkr t|  dqdS )zPartitioning Parameters that were not partitioned usually if parameters
        of modules whose input parameters do not require grad computation do not
        trigger post call and will therefore will remain unpartitionedTrecursez expected to be releasedN)	get_param_coordinatorrelease_and_reset_allr   r   r1   r7   r8   RuntimeError
ds_summary)r'   r@   r    r    r!   partition_all_parameters   s   z-DeepSpeedZeRoOffload.partition_all_parametersc                 C   s   | j S r   )r{   r'   r    r    r!   r      s   z*DeepSpeedZeRoOffload.get_param_coordinatorc                 C      |    d S r   )r   r   r    r    r!   empty_partition_cache      z*DeepSpeedZeRoOffload.empty_partition_cachec                 C   s   dd |  D }|r>dd |  D }|r |d j|d d S d }|r(| }t||| j|| j| j|| j| j| j	d
 d S d S )Nc                 S   s   g | ]}t |s|qS r    r   .0pr    r    r!   
<listcomp>       zDDeepSpeedZeRoOffload._convert_to_zero_parameters.<locals>.<listcomp>c                 S   s   g | ]}t |r|qS r    r   r   r    r    r!   r      r   r   )
param_list)
r   data_parallel_groupr\   config_dict_or_pathremote_devicerc   r   r`   rT   rU   )
r[   convert_to_zero_parametersget_data_parallel_groupInitr\   r^   r_   r`   rT   rU   )r'   r   r   r   non_zero_paramszero_paramsgroupr    r    r!   rd      s(   
z0DeepSpeedZeRoOffload._convert_to_zero_parametersc                 C   r   r   )_remove_module_hooksr   r    r    r!   destroy   r   zDeepSpeedZeRoOffload.destroyc                 C   sb   t | j}t | j}| jD ]}|  q| jD ]}|  q| j  td| d| dd d S )Nz Deleted module hooks: forward = rX   Fr3   )r   r|   r}   removefwd_pre_hookr=   )r'   num_forward_hooksnum_backward_hookshookr    r    r!   r      s   







z)DeepSpeedZeRoOffload._remove_module_hooksc                    s@   d _ t fdd} j| _  j t j d S )Nr   c                    s        d S r   )r   
reset_stepr   r)   r   r    r!   _start_of_forward_hook   s   zLDeepSpeedZeRoOffload.setup_zero_stage3_hooks.<locals>._start_of_forward_hook)	hierarchyinstrument_w_nvtxr   register_forward_pre_hookr   _register_deepspeed_moduler;   r   )r'   r   r    r   r!   r~      s   z,DeepSpeedZeRoOffload.setup_zero_stage3_hooksc                 C   s~   g }d}d}| j jddD ]"\}}|j| |krq|j|kr/|d7 }d|_|| ||j7 }qtd| d| ddd |S )	Nr   Tr      z0Parameter Offload: Total persistent parameters: z in z paramsr3   )r   named_parametersds_numel
ds_persistr   r=   )r'   param_thresholdmodel_thresholdpersistent_paramstotal_persistent_parametersparams_countnamer@   r    r    r!   rh     s"   


z/DeepSpeedZeRoOffload.mark_persistent_parametersc                    sp  |d }|_ tr D ]}|_qn D ]}|d d |d< j||d qtjjfdd}t	fdd}dd	   fd
d}fdd}	tjj fdd}
j
| j
| tdst	fddG fdddtjj}|_j| tdst	fddG fdddtjj}|_j|
 d S )Nr   r   )countc                    s     |  d S r   )pre_sub_module_forward_functionr   r   r    r!   _pre_forward_module_hook'  s   zQDeepSpeedZeRoOffload._register_deepspeed_module.<locals>._pre_forward_module_hookc           	         s^  t   |d u rg }n7t|ttfsBt|r|g}n'g }t|tr$|nt|}|	 D ]\}}|
ds?t|r?|| q,|}tdd |D ]^}t|drTt|nt|j t|dr`|n|j}t fddt D sd|_t d }t|| td	|jj d
|j ddd  | jv rtd| jj d
|j dd t| | |  qI|  d S )N__c                 S   s   t | pt| dS )Nds_param_alias)r   r6   )itemr    r    r!   <lambda>>  s    zdDeepSpeedZeRoOffload._register_deepspeed_module.<locals>._post_forward_module_hook.<locals>.<lambda>r>   c                 3   s    | ]} |j v V  qd S r   )_external_params)r   r   r?   r    r!   	<genexpr>B  s    zeDeepSpeedZeRoOffload._register_deepspeed_module.<locals>._post_forward_module_hook.<locals>.<genexpr>Tr2   z*Registering dangling parameter for module z
, ds_id = .Fr3   z6  Unregistering nested dangling parameter from module )r;   pop
isinstancerZ   r   r   	is_tensordictvarsrG   
startswithr   filterr6   idr   anyis_external_paramr:   r=   r,   rA   r>   r   unregister_external_parameterr<    post_sub_module_forward_function)	r   inputr   r   r   valr   actual_external_parammodule_to_registerr   r   r!   _post_forward_module_hook+  sB   




zRDeepSpeedZeRoOffload._register_deepspeed_module.<locals>._post_forward_module_hookc                 S   s   dt |  dS )Nz-A module has unknown inputs or outputs type (z) and the tensors embedded in it cannot be detected. The ZeRO-3 hooks designed to trigger before or after backward pass of the module relies on knowing the input and output tensors and therefore may not get triggered properly.)r   )valuer    r    r!   _bwd_hook_unexpected_inputs_msgV  s   zXDeepSpeedZeRoOffload._register_deepspeed_module.<locals>._bwd_hook_unexpected_inputs_msgc                    s   t | jj| dS )Nwarning_msg_fn)r   
pre_bwd_fnapply)r   inputsr   r   r    r!   _pre_backward_module_hook[  s   zRDeepSpeedZeRoOffload._register_deepspeed_module.<locals>._pre_backward_module_hookc                    s.   d _  fdd} fdd}t |||S )Nr   c                     s(    j d  _  j dkr  d S d S Nr   r   ds_grads_remaining!post_sub_module_backward_function)unused)r   r'   r    r!   _run_after_backward_hookh  s   
zDeepSpeedZeRoOffload._register_deepspeed_module.<locals>._alternate_post_backward_module_hook.<locals>._run_after_backward_hookc                    s   | j r  jd7  _d S d S Nr   )r   r   )r   r   r    r!   _run_before_forward_functionn  s   zDeepSpeedZeRoOffload._register_deepspeed_module.<locals>._alternate_post_backward_module_hook.<locals>._run_before_forward_function)r   r   )r   r   r   r   r   r   r!   $_alternate_post_backward_module_hookc  s   z]DeepSpeedZeRoOffload._register_deepspeed_module.<locals>._alternate_post_backward_module_hookc                    s   d| _ t| jj| dS )Nr   r   )r   r   post_bwd_fnr   )r   r   r   r    r!   _post_backward_module_hooku  s
   zSDeepSpeedZeRoOffload._register_deepspeed_module.<locals>._post_backward_module_hookr   c                    s*   | j dkr |  |  j d8  _ d S d S )Nr   r   )applied_pre_backward_ref_cnt pre_sub_module_backward_function
sub_moduler   r    r!   _run_before_backward_function  s   

zVDeepSpeedZeRoOffload._register_deepspeed_module.<locals>._run_before_backward_functionc                       *   e Zd Ze fddZedd ZdS )zUDeepSpeedZeRoOffload._register_deepspeed_module.<locals>.PreBackwardFunctionForModulec                    s<   | _  | _t| j dsd| j _| j  jd7  _| }|S )Nr   r   r   )r   pre_backward_functionr6   r   detach)ctxr   r   r   r    r!   forward  s   z]DeepSpeedZeRoOffload._register_deepspeed_module.<locals>.PreBackwardFunctionForModule.forwardc                 W   s   |  | j |S r   )r   r   r   r)   r    r    r!   backward  s   z^DeepSpeedZeRoOffload._register_deepspeed_module.<locals>.PreBackwardFunctionForModule.backwardNrA   rB   rC   staticmethodr   r   r    r   r    r!   PreBackwardFunctionForModule  s
    
r   r   c                    s   | j dkr |  d S d S )Nr   r   r   r   r    r!   _run_after_backward_function  s   
zUDeepSpeedZeRoOffload._register_deepspeed_module.<locals>._run_after_backward_functionc                       r   )zSDeepSpeedZeRoOffload._register_deepspeed_module.<locals>.PostBackwardFunctionModulec                    s,   | _ |jr jd7  _ | _| }|S r   )r   r   r   post_backward_functionr   )r   r   r   r   r    r!   r     s   	z[DeepSpeedZeRoOffload._register_deepspeed_module.<locals>.PostBackwardFunctionModule.forwardc                 W   s,   | j jd | j _| j jdkr| | j  |S r   )r   r   r   r   r    r    r!   r     s   z\DeepSpeedZeRoOffload._register_deepspeed_module.<locals>.PostBackwardFunctionModule.backwardNr   r    r   r    r!   PostBackwardFunctionModule  s
    r  )r>   r   r[   ds_z3_leaf_modulechildrenr   r   compilerdisabler   r|   r   r   register_forward_hookr6   autogradFunctionr   r}   r   )r'   r   r   my_countr@   childr   r   r   r   r   r   r  r    )r   r   r   r   r'   r!   r     s@   *


z/DeepSpeedZeRoOffload._register_deepspeed_modulec                 C   sn   t d|jj dd t| |  }|| | r"|| |j	|dd t d|jj ddd d S )NzBefore sub module function Fr3   Tr   z after fetch)
r   r,   rA   r;   r   r   trace_prologueis_record_tracerecord_modulefetch_sub_moduler'   r   r{   r    r    r!   r     s   


z4DeepSpeedZeRoOffload.pre_sub_module_forward_functionc                 C   sV   t d|jj d|j ddd |  }|| t d|jj d|j ddd d S )NzAfter sub module function   before releaseFr3   z   after releaser   r,   rA   r>   r   release_sub_moduler  r    r    r!   r     s   

z5DeepSpeedZeRoOffload.post_sub_module_forward_functionc                 C   s6   |   }|| | r|| |j|dd d S )NFr  )r   r  r  r  r  r  r    r    r!   r     s
   

z5DeepSpeedZeRoOffload.pre_sub_module_backward_functionc                 C   sR   t d|jj d|j ddd |  | t d|jj d|j ddd d S )Nz#After sub module backward function r  r  Fr3   r  r  )r'   r   r    r    r!   r     s   
z6DeepSpeedZeRoOffload.post_sub_module_backward_functionc              
   C   s   |  | tdd ddd dd | jD ]}t|dd q| j|krL| || tj	d|  | j
D ]}t|jj d	|j dd q9d S tjd
| j d	| j d| j d|  d S )NzMODULE NAME   |zGRANULARITY VALUE   Tr3   z>z3_leaf_module was set by stage3_module_granularity_threshold::z$The smallest module granularity is [zq]. To make stage3_module_granularity_threshold effective, you need to set stage3_module_granularity_threshold >= z. Current Value:)_get_granularity_recursivelyr=   ljustrjustrw   rt   _set_leaf_by_threshold_preorderr   loggerinforx   r,   rA   ds_model_granularitywarningru   )r'   r   r   granularitylayerr    r    r!   ry     s&   
"


z6DeepSpeedZeRoOffload._set_z3_leaf_modules_by_thresholdc           	      C   s  d}t | stj|_dS d}d}|tdd |jddD 7 }t| s/tj|_d|fS | D ]}| |\}}||7 }||7 }q3|j	j
tjjjjv rWtj|_||fS |d7 }||krc|| ntj}||_| j|krv|| _|j	j
| _| j|j	j
d	 d
t|d  ||fS )zKThis function is used to recursively obtain the granularity of each module.g    eA)r   r   r   c                 s       | ]}|j V  qd S r   r   r   r    r    r!   r   !      zDDeepSpeedZeRoOffload._get_granularity_recursively.<locals>.<genexpr>Fr   r   r  r  r  )rZ   r[   rr   rs   r   sumr   r  r  r,   rA   r   nnrE   	container__all__rt   ru   rw   addr  strr  )	r'   r   Z3_MAX_LEAF_SIZE
num_layers
num_paramsr
  layers_in_childparams_in_childr   r    r    r!   r    s2   


*z1DeepSpeedZeRoOffload._get_granularity_recursivelyc                 C   sd   t dd | D }|dkrdS |j|kr#t|d | j| dS | D ]}| || q'dS )zNSet modules as leaf modules based on the threshold, prioritizing parent nodes.c                 s   r$  r   r%  r   r    r    r!   r   B  r&  zGDeepSpeedZeRoOffload._set_leaf_by_threshold_preorder.<locals>.<genexpr>r   NT)r'  r[   r   r   rx   r   r  r  )r'   r   granularity_treshholdr/  r   r    r    r!   r  ?  s   

z4DeepSpeedZeRoOffload._set_leaf_by_threshold_preorder)rA   rB   rC   rr   rs   r$   r   r   r   r   rd   r   r   r~   rh   r   r   no_gradr   r   r   r   ry   r  r  rD   r    r    r+   r!   rK   Y   sL    `
	 5



,rK   )rr   r   collectionsr   deepspeed.utilsr   r   deepspeed.runtime.utilsr   deepspeed.runtime.zero.utilsr   r   %deepspeed.runtime.zero.offload_configr   +deepspeed.runtime.zero.partition_parametersr	   4deepspeed.runtime.zero.partitioned_param_coordinatorr   r   r   deepspeed.acceleratorr   	deepspeedr   rZ   r;   r   r"   rJ   objectrK   r    r    r    r!   <module>   s"   "