o
    Ti                     @   s\   d dl Z d dlmZ d dlZd dlZd dlmZ d dl	m
Z
 d dlmZ G dd deZdS )    N)comm)CupyBackend)required_torch_version)get_acceleratorc                   @   s6   e Zd ZdddZdd Zdd Zdejfd	d
ZdS )NcclBackendNc                 C   sj   |d u rt jtt  d| _n	|| _| j | _t j| jd| _t j| jd| _	t
 | _tdd| _d S )N)ranksgroupg?)min_version)dist	new_grouprangeget_world_sizeworld_groupmpuget_data_parallel_groupget_rankranksizer   compression_backendr   bool_not_supported)selfr    r   O/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/runtime/comm/nccl.py__init__   s   zNcclBackend.__init__c           	      C   sd   g }||kr%t |D ]}||kr|tj|| ||d q
|||< q
|S |tj|||d |S N)srcr	   )r	   dst)r   appendr   irecvisend)	r   r   r   r	   sendbufrecvbufrootreqidxr   r   r   
my_igather   s   
zNcclBackend.my_igatherc                 C   sT   ||kr t |D ]}||krtj|| ||d q|||< qd S tj|||d d S r   )r   r   recvsend)r   r   r   r	   r!   r"   r#   r%   r   r   r   	my_gather)   s   
zNcclBackend.my_gatherbuffer_mc                    s   |  }t|dkrt|}| }| }tj  ||kr4tj	|| |j
d}t||g}|| tj|t|  ||| d  dd   jrzjj| d jtjdj njj| d j j}	tj	j j j gd jd}
fddtj D }j |
}fd	dtj D }t!j"|t#|j$d
 t!j%|j$d
 d j|}
j t&|
 'j d ddt#|dj  (d}|| tj|t|  }|||| d  dd   jrNjj| d jtjdd}njj| d d}d }tj	j |d j g|
jdd }
j |d g}fddtj D }tj	j dg|	jd  fddtj D }t!j%||d j$d
 t!j%||j$d
 d }t#|}j||j)*j t& 'j d ddj   j) ||kr|d| }t|dkr|'|}|S )N   )deviceg      g       @)dtyper   c                       g | ]
}j  | qS r   r   
cupy2torch.0r%   )cupy_sign_list_packedr   r   r   
<listcomp>Q       z4NcclBackend.compressed_allreduce.<locals>.<listcomp>c              
      s,   g | ]}t jd jt t  dqS )r+   )r-   r,   )torchzerosr-   r,   r   device_name)r2   i)
local_rankworker_scaler   r   r4   X   s    r   c                    r.   r   r/   r1   )cupy_recvbuf_sign_serverr   r   r   r4      r5   c                    r.   r   r/   r1   )cupy_recvbuf_scale_serverr   r   r   r4      r5   )+r   lenr6   flattennumelcupycudaDeviceuser7   r,   catadd_linalgnormnpsqrtset_signboolfloatmul_r   r   compress_by_chunk
torch2cupysign_touint8r   r-   r   r0   r   all_to_all_singlestackr   
all_gather
unpackbitsreshapesumdatacopy_)r   r*   worker_errorserver_errorr:   original_shapeoriginal_sizeworker_error_sizeempty_tensorcupy_worker_scalecupy_recvbuf_signsign_list_packedrecvbuf_signrecvbuf_scalecompensated_server_mserver_scalecupy_server_sign_packedserver_sign_packedrecvbuf_sign_serverrecvbuf_scale_serverr   )r>   r=   r3   r:   r   r;   r   compressed_allreduce3   s   

0&
$"



z NcclBackend.compressed_allreduce)N)	__name__
__module____qualname__r   r&   r)   r6   tensorro   r   r   r   r   r      s
    

r   )r6   	deepspeedr   r   rB   numpyrJ   "deepspeed.runtime.compression.cupyr   deepspeed.utils.torchr   deepspeed.acceleratorr   objectr   r   r   r   r   <module>   s   