o
    iZ6                     @   s2  d dl Z d dlZd dlZd dlmZ d dlZd dlZe jG dd dZdd Z	de
fdd	Zde
fd
dZdee dee fddZdd Zd dedee fddZd dedefddZd dedee fddZd dedee fddZd!dee fddZd dedee fddZd dedee fddZdS )"    N)Optionalc                   @   s   e Zd ZU dZeed< dZeed< dZeed< dZ	e
e ed< dZe
e ed	< dZe
e ed
< dZeed< dZe
e ed< dZe
e ed< dZe
e ed< dZeed< dd Zdd ZdS )DistributedOptionFdistributednccldist_backendenv://dist_init_methodNdist_world_size	dist_rank
local_rankr   ngpudist_master_addrdist_master_portdist_launcherTmultiprocessing_distributedc                 C   sn  | j r| jdkr t| j| jd u rtdt| jd u r tdt| j	| j| _	t
| j| j| _t| j| j| _| jd uri| jdkrJtd| j dtjv ritjd }| jt|dkritd| j d	| | j	d ur| jd ur| j	| jkrtd
| j	 d| j | jdkrt| j| j| _t| j| _| jd ur| jd urd| j d| j | _d S d S d S d S d S )Nr   zO--dist_master_addr or MASTER_ADDR must be set if --dist_init_method == 'env://'zM--dist_master_port or MASTER_PORT must be set if --dist_init_port == 'env://'   z!Assuming 1GPU in this case: ngpu=CUDA_VISIBLE_DEVICES,zLOCAL_RANK=z/ is bigger than the number of visible devices: zRANK >= WORLD_SIZE: z >= ztcp://:)r   r   get_master_addrr   r   RuntimeErrorget_master_portr   get_rankr
   get_world_sizer	   get_local_rankr   r   osenvironlensplit)selfcvd r!   S/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/train/distributed_utils.pyinit_options   sZ   











,zDistributedOption.init_optionsc                 C   sp   | j r2tjdd tjdd tj j| j| j| j| j	d | j
d ur4| jdkr6tj| j
 d S d S d S d S )N
NCCL_DEBUGINFONCCL_BLOCKING_WAIT1)backendinit_method
world_sizerankr   )r   r   r   
setdefaulttorchinit_process_groupr   r   r	   r
   r   r   cuda
set_device)r   r!   r!   r"   init_torch_distributedV   s   z(DistributedOption.init_torch_distributed)__name__
__module____qualname__r   bool__annotations__r   strr   r	   r   intr
   r   r   r   r   r   r   r#   r1   r!   r!   r!   r"   r   
   s   
 :r   c                 C   s  | j r@t| j| j}|dkrd| _n| jdkrd| _nd| _| jdkr&d| _ | jdkr.d| _|dkr?t| j| jd u r?t	dn5t
| j| jdkrMd| _nd| _| jre| jdkret| j| jd u ret	d| jrut| j| jd u rut	d| jr| jdkrt st	d	d S d S d S )
Nr   TFr   zH--dist_rank or RANK must be set if --multiprocessing_distributed == truezP--local_rank or LOCAL_RANK must be set if --multiprocessing_distributed == falsezI--dist_rank or RANK must be set if --multiprocessing_distributed == falseslurmz3Launch by 'srun' command if --dist_launcher='slurm')r   get_num_nodesr	   r   r   r   r   get_node_rankr
   r   r   r   is_in_slurm_step)args	num_nodesr!   r!   r"   resolve_distributed_modep   s@   



r?   returnc                   C   s   dt jv o	dt jv S )NSLURM_PROCIDSLURM_NTASKS)r   r   r!   r!   r!   r"   is_in_slurm_job   s   rC   c                   C   s   t  odtjv odtjv S )NSLURM_STEP_NUM_NODESSLURM_STEP_NODELIST)rC   r   r   r!   r!   r!   r"   r<      s
   r<   xc                 C   s   | d u r| S t | S N)r8   )rF   r!   r!   r"   _int_or_none   s   rH   c                  C   sJ   t  t jt j} | d |  d W  d   S 1 sw   Y  dS )zFind free port using bind().

    There are some interval between finding this port and using it
    and the other process might catch the port by that time.
    Thus it is not guaranteed that the port is really empty.

    ) r   r   N)socketAF_INETSOCK_STREAMbindgetsockname)sockr!   r!   r"   	free_port   s   

$rP   launcherc                 C   sr   | d u r)|dkrt  stdtjd } n|dkrtd|d ur)td| d| d ur1t| S ttjdS )	Nr9   /This process seems not to be launched by 'srun'rA   mpi;launcher=mpi is used for 'multiprocessing-distributed' mode
launcher='' is not supportedRANK)r<   r   r   r   r8   rH   getpriorrQ   r!   r!   r"   r      s   r   c                 C   sx   | d u r+|dkrt  stdttjd } n|dkrtd|d ur+td| d| d ur3t| S ttjdd	S )
Nr9   rR   rB   rS   rT   rU   rV   
WORLD_SIZEr'   )r<   r   r8   r   r   rX   rY   r!   r!   r"   r      s   r   c                 C   s   | d u r+|dkrt  stdttjd } n|dkrtd|d ur+td| d| d ur3t| S dtjv r?ttjd S d	tjv ratjd	 d
}t|dkr_dtjvr_ttjd	S d S d S )Nr9   rR   SLURM_LOCALIDrS   rT   rU   rV   
LOCAL_RANKr   r   r   )r<   r   r8   r   r   r   r   pop)rZ   rQ   r    r!   r!   r"   r      s*   

r   c                 C   sd   | d u r$|dkr$t  stdtjd }|dd dd dd} | d ur,t| S tjd	S )
Nr9   rR   rE   r   r   -[rI   MASTER_ADDR)r<   r   r   r   r   replacer7   rX   )rZ   rQ   nodelistr!   r!   r"   r     s   
 r   c                 C   s   | d ur| S t tjdS )NMASTER_PORT)rH   r   r   rX   )rZ   r!   r!   r"   r   +  s   r   c                 C   s   | dur| S |dkr&t  stdtjd tjd krtdttjd S |dkr7d	d
lm} |j}| S |durCtd| dt	tj
dS )zGet Node Rank.

    Use for "multiprocessing distributed" mode.
    The initial RANK equals to the Node id in this case and
    the real Rank is set as (nGPU * NodeID) + LOCAL_RANK in torch.distributed.

    Nr9   rR   rD   rB   @Run with --ntasks_per_node=1 if mutliprocessing_distributed=trueSLURM_NODEIDrS   r   MPIrU   rV   rW   )r<   r   r   r   r8   mpi4pyrh   
COMM_WORLDGet_rankrH   rX   rZ   rQ   rh   commr!   r!   r"   r;   2  s"   r;   c                 C   s   | dur| S |dkr&t  stdtjd tjd krtdttjd S |dkr7dd	lm} |j}| S |durCtd
| dttj	ddS )zGet the number of nodes.

    Use for "multiprocessing distributed" mode.
    RANK equals to the Node id in this case and
    the real Rank is set as (nGPU * NodeID) + LOCAL_RANK in torch.distributed.

    Nr9   rR   rD   rB   re   rS   r   rg   rU   rV   r[   r   )
r<   r   r   r   r8   ri   rh   rj   Get_sizerX   rl   r!   r!   r"   r:   S  s"   r:   )NNrG   )dataclassesr   rJ   typingr   r-   torch.distributed	dataclassr   r?   r5   rC   r<   r7   r8   rH   rP   r   r   r   r   r   r;   r:   r!   r!   r!   r"   <module>   s(    e9+!