o
    TiH                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dlm	Z	m
Z
 d dlmZ ddlmZmZ ddlmZmZ G d	d
 d
e	ZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZdS )    N)split)ABCabstractmethod)get_accelerator   )loggerget_numactl_cmd   )PDSH_MAX_FAN_OUTMVAPICH_TMP_HOSTFILEc                   @   sP   e Zd Zdd Zedd Zedd Zdd Zd	d
 Ze	dd Z
dd ZdS )MultiNodeRunnerc                 C   s0   || _ |   |  | _|j| _|| _i | _d S N)argsvalidate_argsparse_user_argsuser_argumentsuser_scriptworld_info_base64exportsselfr   r    r   W/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/launcher/multinode_runner.py__init__   s   

zMultiNodeRunner.__init__c                 C      dS )z/Return whether the corresponding backend existsNr   r   r   r   r   backend_exists       zMultiNodeRunner.backend_existsc                 C   r   )z%Return the command to execute on nodeNr   )r   environmentactive_resourcesr   r   r   get_cmd!   r   zMultiNodeRunner.get_cmdc                 C   s2   |  }td|rd| d}|| j|  < d S )Nz[^\w@%+=:,./-]")stripresearchr   )r   keyvarr   r   r   
add_export%   s   zMultiNodeRunner.add_exportc                 C      | j jS r   )r   	user_argsr   r   r   r   r   +   s   zMultiNodeRunner.parse_user_argsc                 C   r(   )zReturn the name of the backend)	__class____name__r   r   r   r   name.   s   zMultiNodeRunner.namec                 C   r   )zValidate self.argsNr   r   r   r   r   r   3   r   zMultiNodeRunner.validate_argsN)r+   
__module____qualname__r   r   r   r    r'   r   propertyr,   r   r   r   r   r   r      s    


r   c                       s@   e Zd Z fddZdd Zdd Zedd Zd	d
 Z  Z	S )
PDSHRunnerc                    s   t  || d S r   )superr   r   r*   r   r   r   9   s   zPDSHRunner.__init__c                 C   
   t dS Npdshshutilwhichr   r   r   r   r   <      
zPDSHRunner.backend_existsc                 C   s:   g }| j jD ]}d|v rd|dd}|| q|S )N z"{}"r!   z\")r   r)   formatreplaceappend)r   processed_argsargr   r   r   r   ?   s   zPDSHRunner.parse_user_argsc                 C   r   r4   r   r   r   r   r   r,   J      zPDSHRunner.namec                 C   s  d|d< | j jd ur|dd d| j j |d< d| }td|  dd	d
ttd|gt	| j j
 }d}| j D ]\}}|d||7 }q>|dtjd dtjdddd| j dd| j j d| j j g
}| j jrw|d | j jr|d | j jr|d | j jr|dt  g7 }| j jr|d| j j  | j jr|d |d| j j  |d| j j  dd  |d!d" D }	|d#d$|	d d% g }
|| | j g | j! |
|fS )&NsshPDSH_RCMD_TYPEPDSH_SSH_ARGS_APPEND z             -p ,z$Running on the following workers: %sr5   z-Sz-fz-wzexport {}={}; zcd .;-u-mzdeepspeed.launcher.launchz--world_info=z--node_rank=%nz--master_addr=z--master_port=--no_python--modulez--no_local_rankz
--save_pidz--enable_each_rank_log=z--enable_elastic_trainingz--max_elastic_nodes=z--min_elastic_nodes=c                 S   s   g | ]}|d  qS )\r   ).0ir   r   r   
<listcomp>v       z&PDSHRunner.get_cmd.<locals>.<listcomp>r      z	pkill -f r:   )"r   ssh_portgetjoinkeysr   infostrr
   r   launcher_argsr   itemsr;   ospathabspathsys
executabler   master_addrmaster_port	no_pythonr=   moduleno_local_ranksave_pidgetpidenable_each_rank_logelastic_trainingmax_elastic_nodesmin_elastic_nodesr   r   )r   r   r   active_workerspdsh_cmd_argsr   r%   valdeepspeed_launchcmd_to_searchkill_commandr   r   r   r    N   sD   




zPDSHRunner.get_cmd)
r+   r-   r.   r   r   r   r/   r,   r    __classcell__r   r   r2   r   r0   7   s    
r0   c                       sL   e Zd Z fddZdd Zedd Z fddZd	d
 Zdd Z	  Z
S )OpenMPIRunnerc                    s$   t  || || _| dd d S )NUCX_TLStcp)r1   r   resource_poolr'   r   r   r   ru   r2   r   r   r   ~   s   zOpenMPIRunner.__init__c                 C   r3   )N	ompi_infor6   r   r   r   r   r         
zOpenMPIRunner.backend_existsc                 C   r   )Nopenmpir   r   r   r   r   r,      r@   zOpenMPIRunner.namec                    sf   t    |   | jjdks| jjdkrt| j d| jjdks)| jj	dkr1t| j dd S NrD   z2 backend does not support worker include/exclusionz1 backend does not support limiting num nodes/gpus)
r1   r   _setup_mpi_environmentr   includeexclude
ValueErrorr,   	num_nodesnum_gpusr   r2   r   r   r      s   
zOpenMPIRunner.validate_argsc                 C   sV   g d}t dd |D stdtjd tjd< tjd tjd< tjd	 tjd
< dS )zPSets up MPI-related environment variables or raises an error if they're missing.)OMPI_COMM_WORLD_LOCAL_RANKOMPI_COMM_WORLD_RANKOMPI_COMM_WORLD_SIZEc                 s   s    | ]}|t jv V  qd S r   )r[   environ)rM   r&   r   r   r   	<genexpr>   s    z7OpenMPIRunner._setup_mpi_environment.<locals>.<genexpr>ziMPI environment variables are not set. Ensure you are running the script with an MPI-compatible launcher.r   
LOCAL_RANKr   RANKr   
WORLD_SIZEN)allEnvironmentErrorr[   r   )r   required_varsr   r   r   r|      s   z$OpenMPIRunner._setup_mpi_environmentc                 C   s   t | j }t| jj}g d}t|dkr4tt|d D ]}|| dv r3||d  dkr3g } nqdd| d| jj d	d
dg| | }g }| j	
 D ]\}	}
|dd|	|
g7 }qMg }| jjsptjdg}| jjrp|d || | | jg | j S )N)--mcabtl_tcp_if_includeeth0r   r	   )z-mcar   r   mpirun-nz	-hostfiler   btlz^openibz-x{}={}rH   rI   )sumru   valuesr   r   rY   lenrangehostfiler   rZ   r;   rb   r^   r_   rc   r=   r   r   )r   r   r   total_process_countrY   btl_tcp_optrN   
mpirun_cmd
export_cmdkvpython_execr   r   r   r       s>   		

zOpenMPIRunner.get_cmd)r+   r-   r.   r   r   r/   r,   r   r|   r    rq   r   r   r2   r   rr   |   s    
rr   c                       D   e Zd Z fddZdd Zedd Z fddZd	d
 Z  Z	S )MPICHRunnerc                       t  || || _d S r   r1   r   ru   rv   r2   r   r   r         
zMPICHRunner.__init__c                 C   r3   Nr   r6   r   r   r   r   r      rx   zMPICHRunner.backend_existsc                 C   r   )Nmpichr   r   r   r   r   r,      r@   zMPICHRunner.namec                    ^   t    | jjdks| jjdkrt| j d| jjdks%| jjdkr-t| j dd S rz   	r1   r   r   r}   r~   r   r,   r   r   r   r2   r   r   r         
zMPICHRunner.validate_argsc                    s  | j  }t|}t|d  t fdd|D stddd| d  gt| jj }g }| j	
 D ]\}}|dd	||g7 }q4|dd
t| jjg7 }|ddt| jjg7 }|ddt|g7 }|ddt g7 }|dg7 }d}	t| j  D ]\}
}|
dkr| }	qy|	d| 7 }	qy||	g7 }dg| jjg }g }| jjs|tjdg7 }| jjr|d |d n|d ttjtjtd }|g| | jg | j }|| | | S )Nr   c                       g | ]}| kqS r   r   rM   nprocess_per_noder   r   rO      rP   z'MPICHRunner.get_cmd.<locals>.<listcomp>z.MPICH requires same number of devices per noder   r   -ppn-genvr   MASTER_ADDRMASTER_PORTr   
LOCAL_SIZE-hostsrD   rE   z
--launcherrH   rI   rK   rJ   z/launcher_helper.py)ru   r   r   listr   r   r   r   rY   r   rZ   r;   rX   r`   ra   	enumeraterV   launcherrb   r^   r_   rc   r=   r[   r\   dirnamerealpath__file__r   r   )r   r   r   devices_per_noder   r   r   r   r   hostsrN   hosthelper_argsr   
helper_cmdr   r   r   r       sN   






zMPICHRunner.get_cmd
r+   r-   r.   r   r   r/   r,   r   r    rq   r   r   r2   r   r          
	r   c                       r   )
IMPIRunnerc                    r   r   r   rv   r2   r   r   r     r   zIMPIRunner.__init__c                 C   r3   r   r6   r   r   r   r   r     rx   zIMPIRunner.backend_existsc                 C   r   )Nimpir   r   r   r   r   r,     r@   zIMPIRunner.namec                    r   rz   r   r   r2   r   r   r   "  r   zIMPIRunner.validate_argsc                    sH  | j  }t|}t|d  t fdd|D stddd  gt| jj }g }| j	
 D ]\}}|d| | g7 }q1| jjrVt| jj d\}	}
|ddt|	g7 }|dd	t| jjg7 }|dd
t| jjg7 }|ddt|g7 }|ddt g7 }|g d7 }|dg7 }d}t| j  D ]\}}|dkr| }q|d| 7 }q||g7 }g }t|D ]d}|  }g }| jjrt| jj |\}
}||7 }| jjs|tjdg7 }| jjr|d ddt|g}|ddt|g7 }|dkrddg| | | jg | j }q|g d | | | jg | j }qt|| |  || | S )Nr   c                    r   r   r   r   r   r   r   rO   /  rP   z&IMPIRunner.get_cmd.<locals>.<listcomp>z2Intel MPI requires same number of devices per noder   r   r   OMP_NUM_THREADSr   r   r   r   )r   	I_MPI_PIN0r   rD   rE   rH   rI   -envr   r   r   1):r   r   )ru   r   r   r   r   r   r   r   rY   r   rZ   bind_cores_to_rankr   bind_core_listrX   r`   ra   r   rV   r   rb   r^   r_   rc   r=   r   r   print)r   r   r   r   r   r   r   r   r   cores_per_rank_r   rN   r   per_host_cmd
local_rankr   numactl_cmdenv_mappingr   r   r   r    +  sb   





 zIMPIRunner.get_cmdr   r   r   r2   r   r     r   r   c                       s8   e Zd Z fddZdd Zedd Zdd Z  ZS )	SlurmRunnerc                    r   r   r   rv   r2   r   r   r   k  r   zSlurmRunner.__init__c                 C   r3   )Nsinfor6   r   r   r   r   r   o  r9   zSlurmRunner.backend_existsc                 C   r   )Nslurmr   r   r   r   r   r,   r  r@   zSlurmRunner.namec           
      C   sH  t | jddrJ dt| j }dd| gt| jj }t | jddr-|d| jjg7 }| jjdkr@|	d	 |	| jj  | jj
dkrS|	d
 |	| jj
  | jjdkrf|	d |	| jj  | jjdkry|	d |	| jj  d}| j D ]\}}|d| d| 7 }qtjdg}||g | | jg | j }	|	S )Ndetect_nvlink_pairsFz8slurm backend does not support remapping visible devicessrunr   slurm_commentrD   z	--commentz	--includez	--excluder   z--nodesz--gpusz--export=ALLrE   =rH   )getattrr   r   ru   r   r   rY   r   r}   r=   r~   r   r   r   rZ   r^   r_   r   r   )
r   r   r   r   srun_cmdr   r%   rm   r   commandr   r   r   r    v  s@   





zSlurmRunner.get_cmd)	r+   r-   r.   r   r   r/   r,   r    rq   r   r   r2   r   r   i  s    
r   c                       r   )MVAPICHRunnerc                    sz   t  || || _| dd | dd t  dkr#| dd | dd | dd | d	d
 | dd d S )NMV2_SMP_USE_CMAr   MV2_DEBUG_SHOW_BACKTRACEr   cudaMV2_USE_CUDAMV2_SUPPORT_DLMV2_ENABLE_AFFINITYMV2_INTER_ALLGATHER_TUNING5MV2_CUDA_USE_NAIVE)r1   r   ru   r'   r   device_namerv   r2   r   r   r     s   zMVAPICHRunner.__init__c                 C   s^   t d}d}|std |S tdg}|d }d|v r%d}|S td|  |S )NmpinameFz9mpiname does not exist, mvapich is not installed properlyzutf-8zMVAPICH2-GDRTz9Expected MVAPICH2-GDR as return for mpiname but received )r7   r8   warningswarn
subprocesscheck_outputdecoder"   )r   mpiname_existsexistsresultsmpiname_resultsr   r   r   r     s   

zMVAPICHRunner.backend_existsc                 C   r   )Nmvapichr   r   r   r   r   r,     r@   zMVAPICHRunner.namec                    r   rz   r   r   r2   r   r   r     s   
zMVAPICHRunner.validate_argsc                    s  | j  }t|}t|d  t fdd|D stdttd}| j  D ]
}|	| d q)W d    n1 s>w   Y  dd| d	  d
t gt
| jj }g }| j D ]\}	}
|dd|	|
g7 }q\g }| jjstjdg}| jjr|d || | | jg | j S )Nr   c                    r   r   r   r   r   r   r   rO     rP   z)MVAPICHRunner.get_cmd.<locals>.<listcomp>z0mvapich requires same number of devices per nodew
r   z-npr   z
--hostfiler   r   rH   rI   )ru   r   r   r   r   r   openr   rV   writer   r   rY   r   rZ   r;   rb   r^   r_   rc   r=   r   r   )r   r   r   r   r   fdr   r   r   r   r   r   r   r   r   r      s:   




zMVAPICHRunner.get_cmdr   r   r   r2   r   r     s    
r   )r[   r^   r7   r   r   r#   shlexr   abcr   r   deepspeed.acceleratorr   utilsr   r   	constantsr
   r   r   r0   rr   r   r   r   r   r   r   r   r   <module>   s$   $EPHU0