o
    Ti:                     @   s   d Z ddlZddlZddlZddlZddlZddlZddlZddlZddl	m
Z
 ddlmZ ddlmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ dZdd Zdd Zdd Ze dkrse  dS dS )a1  
DeepSpeed launcher, this is similar to torch's distributed.launch but supports
additional features such as arbitrary gpu exclusion.

deepspeed.launcher.launch is intended to be run on a single worker node and
will spawn several worker sub-processes depending on how many devices/ranks
are on the worker.
    N)defaultdict)Dict)ArgumentParser	REMAINDER)get_accelerator   )TORCH_DISTRIBUTED_DEFAULT_PORT)DLTS_POD_ENV_PATH)loggerget_numactl_cmd)is_torch_elastic_compatible   )ELASTIC_TRAINING_ID_DEFAULTz/tmpc                  C   s"  t dd} | jdtddd | jddtd	d
 | jdttdd
 | jddtdd
 | jdddd | jdddd | jdddd | jdtddd | jdtddd | jdddd | jdtdd d | jd!dtd"d
 | jd#dd$d | jd%td d&d | jd'td(d) | jd*td+ |  S ),NzjDeepSpeed distributed training launch utility that creates multiple distributed processes on a single node)descriptionz--node_rankr   z8The rank of the node for multi-node distributed training)typedefaulthelpz--master_addrz	127.0.0.1zMaster node (rank 0)'s address, should be either the IP address or the hostname of node 0, for single node multi-proc training, the --master_addr can simply be 127.0.0.1)r   r   r   z--master_portzdMaster node (rank 0)'s free port that needs to be used for communication during distributed trainingz--world_infoNonez$world info base64 encoded dictionaryz--module
store_truezwChange each process to interpret the launch script as a Python module, executing with the same behavior as 'python -m'.)actionr   z--no_pythonzMSkip prepending the training script with 'python' - just execute it directly.z--enable_elastic_trainingz Enable elastic training support.z--min_elastic_nodesz(Min number of nodes in elastic training.z--max_elastic_nodesz(Max number of nodes in elastic training.z--no_local_rankzNDo not pass local_rank as an argument when calling the user's training script.z
--save_pidz5main launching process pid, for internal pid trackingz--enable_each_rank_logzFredirect the stdout and stderr from each rank into different log filesz--bind_cores_to_rankzgBind each rank to different cores of the host. This improves host efficiency especially for CPU backendz--bind_core_listzList of cores to bind to with comma separated list of numbers and range. i.e. 1,3-5,7 => [1,3,4,5,7].  When not specified, all cores on system would be used rank bindingtraining_scriptzThe full path to the single GPU training program/script to be launched in parallel, followed by all the arguments for the training script)r   r   training_script_args)nargs)r   add_argumentintstrr   r   
parse_args)parser r   M/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/launcher/launch.pyr   #   sr   
r   c              	   C   sr   t | }|jdd}|| |D ]}z|  W q t jy$   Y qw t j|dd\}}|D ]}|  q0d S )NT)	recursive   )timeout)psutilProcesschildrenappend	terminateNoSuchProcess
wait_procskill)pidprocessr&   childgonealivepr   r   r    terminate_process_treew   s   


r2   c            .         sj  t   tj } |  D ]}d|v r"t j d| d| |   q jdkr,t	dt
 j}t|}td|  t| }t| _| j }|| }t|}td j d| d	 j  tt}d
}d
}	|D ]}
||
 }|	t|7 }	|D ]}||
 | |d7 }qqqtd|  td|	  t | | t  D ]}td| d| |   q j| d< t j| d< t|	| d< t j| d< t j| d< t|| d<  jrtdt   d  jr(t }tjt j dtjrJ dt d}|!|  W d    n	1 s#w   Y  t" s8 j#r8td d _#tj$t%rt t%6}|& }dd |D }|D ] }|'ds_|'drp|( d }|(d\}}|| |< qQW d    n	1 s}w   Y  g g  j#s~ j)dkrtj j)rt	 j) dtj$ j)szt* j) W n t+y } zt| t	d  j) d!d }~ww t,-d"t,. }t/d
|D ]}|| | }|| }t|| d#< t|| d$< g  j0rt1 j2||\}}| | d%< |  j3s!t4j5 d&  j6r d' n j6r)t	d( j7  j8s;d)|   j97  j)dkrctj j)| d*| d+}t |d}t:j;| ||d,}nt:j;| d-}td.|j< d/  | qnd0d1l=m>}  d
d2l?m@}! d
d3lAmB}" d
d lCmD  mE  mF  mG}# d
d4lHmI}$  jJd5krd _J jKd5kr j _K jKd
krĈ jJd
ksJ d6td| d7< g  j3st4j5d&g j6rd' n j6rt	d( j7  j97 dd  }%d8d9i}&tjLd:tM}'|!dId; jd< t j |' jJ jKd=|&}(|"d>|d
 dd  |#N|(d9d?|$Od@|$Od@d d dA})| |)| }*|*P  dBdCdDd  fdEdF}+tQQtQjR|+ tQQtQjS|+ tT},t|,rg }-|,D ],}|U d u r}qr|jVd
kr|jV|+tQjSd  qrtdG|j< dH |-| qrtT|,tT|- },t,Wd t|,snd S d S )JNNCCL =r   zworld_info can not be NonezWORLD INFO DICT: znnodes=z, num_local_procs=z, node_rank=r   r   zglobal_rank_mapping=zdist_world_size=zSetting MASTER_ADDRMASTER_PORT
WORLD_SIZE
CROSS_RANK
CROSS_SIZE
LOCAL_SIZEzlauncher pid: z
.deepspeedzpid file exists but shouldn'twzgDisabling elastic training support as                     PyTorch version should be greater than 1.11.xFc                 S   s   g | ]}|  qS r   )rstrip).0liner   r   r    
<listcomp>   s    zmain.<locals>.<listcomp>zexport FC_TASKROLE_NAMEzexport FC_TASK_INDEXz0 should not be a file, it should be a directory.zunable to create directory z for each rank log.z%Y%m%d%H%M%SRANK
LOCAL_RANKOMP_NUM_THREADSz-uz-mzODon't use both the '--no_python' flag and the '--module' flag at the same time.z--local_rank=_rankz.log)envstdoutstderr)rE   zprocess z spawned with command: r   )DSElasticAgent)RendezvousParameters)
WorkerSpec)Stdr   z$Max and Min nodes should be positiveNCCL_ASYNC_ERROR_HANDLINGr#   d   ELASTIC_RUN_IDc10d:)backendendpointrun_id	min_nodes	max_nodestrainer   0)rolelocal_world_size
entrypointargsrdzv_handlermax_restartsmonitor_interval	redirectsteemaster_addrmaster_portSIGINTSIGTERM)r      c              	      s   D ]}t d|j  zt|j W q ty   Y qw d ur1t  d  t | v r@t d|   d  jrNt	j
rNt	 td d S )NzKilling subprocess z exits with return code = zMain process received z	, exitingr   )r
   infor,   r2   	Exceptionerrorsysexitsave_pidospathisfileremove)signumframer-   r\   cmdlast_return_codepid_file	processes	sig_namesr   r    sigkill_handler=  s    

zmain.<locals>.sigkill_handlerzProcess z exits successfully.r   )Xr   rm   environcopykeysr
   rg   	node_rank
world_info
ValueErrorbase64urlsafe_b64decodejsonloadslistlennnodesr   r'   r   set_visible_devices_envsvisible_devices_envsrb   r   rc   rl   printgetpidrn   joinPID_FILE_BASEPATHro   openwriter   enable_elastic_trainingexistsr	   	readlines
startswithsplitenable_each_rank_logmakedirsrh   timestrftime	localtimerangebind_cores_to_rankr   bind_core_list	no_pythonrj   
executablemoduler   no_local_rankr   
subprocessPopenr,   
elasticityrH   $torch.distributed.elastic.rendezvousrI   *torch.distributed.elastic.agent.server.apirJ   -torch.distributed.elastic.rendezvous.registrydistributedelastic
rendezvousregistry)torch.distributed.elastic.multiprocessingrK   min_elastic_nodesmax_elastic_nodesgetr   get_rendezvous_handlerfrom_strrunsignalrd   re   setpoll
returncodesleep).current_envkr~   	node_list
local_nodelocal_accelerator_idsnum_local_procsglobal_rank_mappingcurr_global_rankdist_world_sizenode_idgidsgidrE   launcher_pidfdfilelinesr?   key_valkeyvalelog_name_prefix
local_proc	dist_rank
local_rankcores_per_ranknumactl_cmdlog_filelog_fdr-   rH   rI   rJ   rdzv_registryrK   cmd_argsrdzv_configsrS   rdzv_parametersspecagentry   alive_processesfinished_processesr   rs   r    main   s^  
 



 



	




& 







r   __main__)!__doc__rj   r   rm   r   r   r   r   r$   collectionsr   typingr   argparser   r   deepspeed.acceleratorr   	constantsr   nebula.constantsr	   utilsr
   r   r   r   r   r   r   r2   r   __name__r   r   r   r    <module>   s4   	T b
