o
    Ti}P                     @   sP  d dl Z d dlmZ d dlZd dlmZmZ d dlmZ d dl	m
Z
 zd dlmZ W n	 ey3   Y nw ddlmZ dd	lmZ dd
lmZ d dlmZ dZdd ZdZdadadai Zi Zd add Z da!da"da#dBddZ$dBddZ%dBddZ&dBddZ'dd Z(dd Z)dBddZ*dBdd Z+d!d" Z,d#d$ Z-d%d& Z.d'd( Z/d)d* Z0g Z1g a2g a3g Z4d a5d+ed,e6d-ee6 d.ed/e7d0ed1e8d2efd3d4Z9d+efd5d6Z:d+ed,e6d-ee6 d.ed/e7d0ed1e8d2efd7d8Z;d9ed,e6d-ee6 d/e7d0ed1e8d2efd:d;Z<d9ed,e6d-ee6 d/e7d0ed1e8d2efd<d=Z=d9ed,e6d-ee6 d/e7d0ed1e8d2efd>d?Z>d@dA Z?dS )C    N)List)GraphGraphModule)get_accelerator)_make_offload_state_key)unset_fake_temporarily   )ProfilingResult)DSGraphParamManager)move_primals_to_headoffload_adam_statesc                 C   s   t  dkrt|  d S d S Nr   )distget_rankprint)msg r   `/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/compile/passes/offload_adam_states.pyprint_r0   s   r   g?c                   C   s.   t d u rt  a t  at  ad S d S N)copy_streamr   StreamEventoffload_eventreload_eventr   r   r   r   	lazy_init/   s
   

r   c                 C   s   t |}|| vrt tj| | dd| |< || vrd S t t | | j| | dd W d    n1 s9w   Y  |d u rJtj	td d S |j	td d S )NcpudeviceTnon_blockingstream)
r   r   
pin_memorytorch
empty_liker"   r   copy_r   record)statekey	key_eventoffload_buf_keyr   r   r   move_key@   s   r,   c                 C   s   t  t" tj| t| td| |< | | j| t| dd W d    n1 s+w   Y  |d u r<tj	td d S |j	td d S Nr   Tr   r!   )
r   r"   r   r$   r%   r   r   r&   r   r'   )r(   r)   r*   r   r   r   move_back_keyQ   s   r.   c                 C   sh   t  t |j| dd || _W d    n1 sw   Y  |d u r,tjtd d S |jtd d S )NTr   r!   )r   r"   r   r&   datar   r'   
src_tensordest_bufr*   r   r   r   move_hp_param]   s   r3   c                 C   sr   t  t tj| td|_|j| dd W d    n1 s w   Y  |d u r1tj	td d S |j	td d S r-   )
r   r"   r   r$   r%   r   r/   r&   r   r'   r0   r   r   r   move_back_hp_paramh   s   r4   c                  C   s   t  e ttdsdd tjD t_ttj D ]\} \}}d|v r(t|d d|v r1t|d qtj D ]\}}d|v rB|d= d|v rI|d= q7t	tjtjD ]	\}}t
|| qQt   W d    d S 1 skw   Y  d S )Nhp_params_pin_buffersc                 S   s"   g | ]}t  tj|d dqS )r   r   )r   r#   r$   r%   ).0tr   r   r   
<listcomp>x   s    z,offload_adam_states_sync.<locals>.<listcomp>exp_avg
exp_avg_sq)r   hasattr	optimizerfp32_partitioned_groups_flatr5   	enumerater(   itemsr,   zipr3   r   synchronize)ikr(   _r1   r2   r   r   r   offload_adam_states_syncs   s*   


"rE   c                  C   s   t  ? tj D ]\} }td|v rt|d td|v r#t|d q	ttjtjD ]	\}}t	|| q+t
   W d    d S 1 sEw   Y  d S )Nr9   r:   )r   r<   r(   r?   r   r.   r@   r5   r=   r4   r   rA   )rD   r(   srcdestr   r   r   reload_adam_states_sync   s   

"rH   c                 C   6   t  r	t  d S | d u rtt d S | t d S r   )nz3is_profilingrE   r   waitr   eventr   r   r   sync_offload_states   
   
rO   c                 C   rI   r   )rJ   rK   rH   r   rL   r   rM   r   r   r   sync_reload_states   rP   rQ   c                        fdd}|S )Nc                     s   t  d d u rt  t  d <  d dkr.t d d  d d t  d d   d S  d tjv s?J d d  dtj d  } t|  d t  d   d S )N   r   hp_paramr   zState z not found in optimizer)offload_key_eventsgetr   r   r3   r<   r(   r,   r(   taskr   r   run_offload_task   s   ,"z+make_offload_task.<locals>.run_offload_taskr   )rY   rZ   r   rX   r   make_offload_task   s   r[   c                    rR   )Nc                     sP   t  d  } |    d dkr$tj d  } d }||v r&||= d S d S d S )NrS   r   rT   )rU   rA   r<   r(   )rN   r(   r)   rX   r   r   run_offload_sync   s   
z+make_offload_sync.<locals>.run_offload_syncr   )rY   r\   r   rX   r   make_offload_sync      r]   c                    rR   )Nc                     s   t  sEt d d u rt  t d <  d dkr0t d d  d d t d   d S tj d  } t	|  d t d   d S d S )NrS   r   rT   r   )
rJ   rK   reload_key_eventsrV   r   r   r4   r<   r(   r.   rW   rX   r   r   run_reload_task   s   (z)make_reload_task.<locals>.run_reload_taskr   )rY   r`   r   rX   r   make_reload_task   r^   ra   c                 C   s   t   }tt|ad S r   )r   max_memory_allocatedmax
max_memory)namememr   r   r   update_max_memory   s   
rg   c                   C   s   t    d S r   )r   empty_cacher   r   r   r   rh      s   rh   graphgraph_idgraph_orderprofiling_results
mem_budgetparam_managerbwdreturnc           &      C   sJ  g }| j D ]}|jdkr|jtttttfv r|| q|D ]}| 	| qt
 }	|	 dt  }
td| d| dt d|
  |rH|| jn|| j}dd |D }d	}i }|r_t| j n| j }|D ]}||j |krr||j }|||j< qd|s#||d	 d	 k}|r1t  t  t  t  W d    n1 sw   Y  d	}tttj tjtjD ]\}\\}}}}td
|v rtd
}||  ||   }t||d
||  ||   || jf td|v rtd}||  ||   }t||d||  ||   || jf | |  }t|||fd| |  |jf q| j D ]}|j|vsH|jdksHd|jv rJq4g }t dd tD }|
||j  | d	k rt!td	krint"d	}|| t dd tD }|
||j  | d	k sa|D ]B}| #| | j$dt%|di d|d	  d|d  d W d    n	1 sw   Y  td|d	  d|d   t&| qq4| j D ]G}|jdkrtd|j  t&D ]1}d|d	  d|d  }| #| | j$dt'|di |d}W d    n	1 sw   Y  q nqtd|  | S dd |D }||d k}||d	 k} |rmd}!| j D ].}|jdkrk|!sk| #| | j$dt(di dd W d    n	1 sdw   Y  d}!q>t))t&a*| j D ]}|j|vs|jdks|jdksd|jv rqut!t*d	kr+t*d	 }|d  }"|}#|
||j t+ |" kr+||j t+ }$td!|d	  d|d  d"|#j d#|" d$|j d%||j  d&t+ d'|$  | ,|# | j$dt-|di d(|d	  d|d  d}#W d    n	1 sw   Y  t+|"7 a+t*"d	 t!t*d	krnt*d	 }|d  }"|
||j t+ |" ksqu| r| j D ]_}|jdkrt*D ]/}| #| | j$dt-|di d(|d	  d|d  d W d    n	1 sfw   Y  q=d)d* }%| #| | j$d|%di d+d W d    n	1 sw   Y  q3td,| d-| d.| d/|   | S )0Ncall_functionrS   z#offload_opt_states_inc start graph z bwd=z max_memory=z total_mem=c                 S   s   i | ]	\}}}}||qS r   r   )r6   re   	alloc_memdeltapeakr   r   r   
<dictcomp>  s    z*offload_opt_states_inc.<locals>.<dictcomp>r   r9   r:   rT   placeholderoffload_opt_c                 S      g | ]}|d  qS    r   r6   rY   r   r   r   r8   b      z*offload_opt_states_inc.<locals>.<listcomp>c                 S   rx   ry   r   r{   r   r   r   r8   m  r|   r   offload_opt_sync_rD   r   re   zInserting fwd offload_opt_sync_z#Inserting all offload tasks before z$offload_opt_states_inc finish graph c                 S      g | ]
}|d  r|d qS rS   r   r   r6   gr   r   r   r8         Frh   Toutputrz   z! Inserting reload_opt reload_opt_z after z
 next_inc=z
 peak_mem[z]=z inc_total=z expected_mem=reload_opt_c                   S   s   t  S r   )r   rA   r   r   r   r   <lambda>  s    z(offload_opt_states_inc.<locals>.<lambda>sync_offload_copy_streamzoffload_opt_states_inc graph z graph_order z bwd is_first_graph z is_last_graph ).nodesoptargetrE   rO   rH   rQ   rg   append
erase_noder   total_memoryMARGINr   rd   bwd_memfwd_memreversedre   r   r>   r@   r<   r(   r?   r=   r5   r   numelelement_sizeoffload_tasksdtypesumlenpopinserting_beforecreate_noder]   offload_tasks_scheduledr[   rh   copyreload_tasks_remainingtotal_reload_meminserting_afterra   )&ri   rj   rk   rl   rm   rn   ro   	to_removenodeaccelerator	total_memrf   mem_dictcurrent_peak_mempeak_memordered_nodeis_first_graphreload_sizerB   rC   r(   rT   hp_param_cpur)   sizehp_param_size
to_offload
optim_sizerY   re   offload_nodegraph_order_with_backwardis_last_graphinserted_syncnext_reload_mem
insert_posexpected_memsync_fnr   r   r   offload_opt_states_inc   s*  


 
&&




	J


D



r   c              	   C   sz   t | j}|D ]3}|jdks|jdkrq| | d|j }| jdt|fi |d W d    n1 s5w   Y  qd S )Nr   rv   update_max_memory_rq   r~   )listr   r   r   re   r   rg   )ri   r   r   re   r   r   r   add_record_max_mem_nodes  s   
r   c              	   C   s  |rBdd |D }||d k}d}	| j D ],}
|
jdkr@|	s@|r@| |
 | jdtdi dd	 W d    n1 s9w   Y  d
}	qnF||d d k}t| } d}| j D ]4}
|
jdkr|s|rtd|
j  | |
 | jdtdi dd	 W d    n1 sw   Y  d
}qSt	|  | S )Nc                 S   r   r   r   r   r   r   r   r8     r   z-insert_offload_opt_states.<locals>.<listcomp>r   Fr   rq   r   
reload_optr~   Trv   zInserting offload_opt before offload_opt)
r   r   r   r   rH   r   r   re   rE   r   )ri   rj   rk   rl   rm   rn   ro   r   r   inserted_reloadr   r   inserted_offloadr   r   r   insert_offload_opt_states  s2   

r   gmc                 C      t | j||||||| _| S r   )r   ri   r   rj   rk   rl   create_inputs_fnrm   rn   ro   r   r   r   move_opt_states     r   c                 C   r   r   )r   ri   r   r   r   r   move_opt_states_sync
  r   r   c                 C   sN   |s#||d d kr%t   t  W d    d S 1 sw   Y  d S d S d S r   )r   rE   r   r   r   r   offload_adam_states_for_init  s
   "r   c                 C   s"   t   | att  a|ad S r   )r   r<   r$   r   r   current_devicerJ   )adam_optimizer_nz3r   r   r   init_offload_opt_states  s   r   r   )@r   typingr   r$   torch.fxr   r   deepspeed.acceleratorr   %deepspeed.runtime.zero.offload_statesr   torch._subclasses.fake_tensorr   ImportError	profilersr	   graph_paramr
   fxr   deepspeed.commcommr   NAMEr   r   r   r   r   rU   r_   rd   r   r<   r   rJ   r,   r.   r3   r4   rE   rH   rO   rQ   r[   r]   ra   rg   rh   r   offload_tasks_remainingr   reload_task_remainingr   intfloatboolr   r   r   r   r   r   r   r   r   r   r   <module>   s   








 V
%


	