o
    Ti                     @   s  d dl Z d dlmZmZ d dlZd dlmZmZmZ ddl	m
Z
mZmZmZmZ ddlmZmZmZmZ ddlmZ ddlmZ d dlmZ d d	lmZ d
ZdedededefddZdedededededefddZ dededede!def
ddZ"dededee defddZ#dededee dee!ef def
dd Z$	!d,d"eded#ee defd$d%Z%	!d,d"eded#ee defd&d'Z&d"eded#ee d(e'd)e(defd*d+Z)dS )-    N)ListDict)GraphNodeGraphModule   )get_input_nodesget_param_nodesget_index_by_graph_idget_deepcompile_handleget_real_uses)add_postprocess_make_node_metaget_output_nodemove_primals_to_head)ProfilingInterpreter)fast_free_schedule)get_acceleratorzero3_compilegraph_idgraphnodeds_idc              
   C   s   t ||tjjjj| |gd|j d| t||dd}|jd |jd< t	|}|
|| t ||tjjjj| |gd|j d| t||dd}|jd |jd< |S )Nallgather_ds_param__T
extra_argsnamemetavalwait_allgather_ds_param__F)r   torchopsdcallgather_paramdefaulttargetr   r   r   replace_input_withwait_allgather)r   r   r   r   new_ag_nodeoutput_nodenew_wait_node r,   Z/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/compile/passes/zero3_compile.pyadd_allgather   s&   



r.   release_noden_usersc                 C   sN   t ||tjjjj| ||gd|j d|j d| t||dd}d |j	d< d S )Nrelease_ds_param_r   Fr   r   )
r   r!   r"   r#   release_paramr%   r&   r   r   r   )r   r   r   r/   r   r0   new_noder,   r,   r-   add_release1   s   

r4   	grad_node
param_namec              
   C   s<   t ||tjjjj| |gd| t||dd}d |jd< d S )Nreduce_ds_param_Tr   r   )r   r!   r"   r#   reduce_gradr%   r   r   )r   r   r5   r6   r   r3   r,   r,   r-   
add_reduce;   s   

r9   param_nodesreturnc           	   
   C   sd   t |}|D ]'}t| |||j|j  |j|j }|| }|D ]}t| ||||t| qqt|S N)r   r.   ds_idsr   r4   lenr   )	r   r   param_managerr:   node_to_usespnr   usersuserr,   r,   r-   add_gather_and_releaseE   s   rD   param_nodes_bwparam_name_to_gradc                 C   s<   t | ||| |jD ]}t| ||| ||j|  q
t|S r<   )rD   param_namesr9   r=   r   )r   r   r?   rE   rF   r6   r,   r,   r-   add_gather_and_reduceR   s   
rH   Fgmgraph_orderc                 C   sH  t  }| }|| j}	t|| j|| t| j|	| _||dd |	D  t| |d}
|
j|  ~
t	  t
   t }t||}|dkrW|rWtd| d| d| j  | jjD ]%}|j|| jv }d|jv r|rtjdg|jd j|jd jd	|jd< q[t| jt
  d|d| _|dkr|rtd
| d| d| j  | S )Nc                 S   s   g | ]}|d  qS )   r,   ).0vr,   r,   r-   
<listcomp>m   s    z,add_z3_gather_release_fw.<locals>.<listcomp>	debug_logr   zFwd before scheduling graph 
 graph_id= r   )dtypedevicezFwd after scheduling graph )r   param_indicesrD   r   r	   register_graph_z3r   rungccollectr   empty_cachedistget_rankr
   printnodesr   r=   r   r!   emptyrS   rT   r   available_memory)rI   r   rJ   profiling_resultscreate_inputs_fnr?   rP   nz3real_inputsrU   profilerrankgraph_indexnis_ds_paramr,   r,   r-   add_z3_gather_release_fw]   s<   




(rj   c                 C   s   ||  | j\}}t|| j|| ||| _t| j}	| }
t|	t|
ks4J dt|
 dt|	 t| |dj|
 }~t  t	 
  t }t||}|dkrd|rdtd| d| d| j  t| jt	  d|d| _| S )Nz	Expected z inputs, got rO   r   zBwd before scheduling graph rQ   rR   )get_bwd_mappingr   rH   r   r>   r   rW   rX   rY   r   rZ   r[   r\   r
   r]   r   r`   )rI   r   rJ   ra   rb   r?   rP   rE   rF   input_nodesrd   real_outputsrf   rg   r,   r,   r-   add_z3_gather_release_bw   s(   
,

rn   
mem_budgetbwdc              	   C   s0   |rt | |||||ddS t| |||||ddS )NFrO   )rn   rj   )rI   r   rJ   ra   rb   ro   r?   rp   r,   r,   r-   add_z3_gather_release   s"   rq   )F)*rX   typingr   r   r!   torch.fxr   r   r   utilr   r	   r
   r   r   fxr   r   r   r   profilers.graph_profiler   list_scheduler   deepspeed.commcommr[   deepspeed.acceleratorr   NAMEintr.   r4   strr9   rD   rH   rj   rn   floatboolrq   r,   r,   r,   r-   <module>   sV   "




5
#