o
    Ti8                     @   s8   d dl Zd dlZd dlZd dlmZ G dd deZdS )    Nc                   @   s6   e Zd ZdddZdd Zdd Zdejfd	d
ZdS )HcclBackendNc                 C   sV   |d u rt jtt  d| _n	|| _| j | _t j| jd| _t j| jd| _	d S )N)ranksgroup)
dist	new_grouprangeget_world_sizeworld_groupmpuget_data_parallel_groupsizeget_rankrank)selfr    r   O/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/runtime/comm/hccl.py__init__   s   zHcclBackend.__init__c           	      C   sd   g }||kr%t |D ]}||kr|tj|| ||d q
|||< q
|S |tj|||d |S N)srcr   )r   dst)r   appendr   irecvisend)	r   r   r   r   sendbufrecvbufrootreqidxr   r   r   
my_igather   s   
zHcclBackend.my_igatherc                 C   sT   ||kr t |D ]}||krtj|| ||d q|||< qd S tj|||d d S r   )r   r   recvsend)r   r   r   r   r   r   r   r   r   r   r   	my_gather#   s   
zHcclBackend.my_gatherbuffer_mc                    s  |  }t|dkrt|}| }| }||kr,tj|| |jd}t||g}|| tj	
|tt| ||| d  dd   t|| j tjtj| j t| j gd jjd}	fddt| j D }
 fd	dt| j D }tj|	t|
| jd
 tj|| jd
 |	tj }t|| j tj t|d| j  !d}|| tj	
|t|  }|||| d  dd   t|dtj}tj| j t|d g|	j|jdfddt| j D }tj| j dgj|jdfddt| j D }tj||d | jd
 tj||| jd
 t|}|tj }|j"#t|| j tj  j" ||krm|d| }t|dkry|$|}|S )N   )deviceg      g       @r   dtyper%   c                       g | ]} | qS r   r   .0r   )sign_list_packed_tmpr   r   
<listcomp>D       z4HcclBackend.compressed_allreduce.<locals>.<listcomp>c                    s$   g | ]}t jd jt  dqS )r$   r&   )torchzerosr'   r%   )r*   _)
local_rankworker_scaler   r   r,   F   s    r   c                    r(   r   r   r)   )recvbuf_sign_server_tmpr   r   r,   b   r-   c                    r(   r   r   r)   )recvbuf_scale_server_tmpr   r   r,   i   r-   )%r   lenr.   flattennumelr/   r%   catadd_linalgnormnpsqrtset_signboolfloatmul_	torch_npunpu_sign_bits_packtypeint8r   r'   r   r   all_to_all_singlestackr
   
all_gatheruint8npu_sign_bits_unpackfloat32sumdatacopy_reshape)r   r#   worker_errorserver_errorr1   original_shapeoriginal_sizeworker_error_sizeempty_tensorrecvbuf_signsign_list_packedrecvbuf_scaleflattened_recvbuf_signcompensated_server_mserver_scaleserver_sign_packedrecvbuf_sign_serverrecvbuf_scale_serverflattened_recvbuf_sign_serverr   )r1   r4   r3   r+   r2   r   compressed_allreduce-   sr   

0
$



z HcclBackend.compressed_allreduce)N)	__name__
__module____qualname__r   r   r"   r.   tensorra   r   r   r   r   r      s
    
	
r   )	numpyr<   r.   rC   deepspeed.commcommr   objectr   r   r   r   r   <module>   s
   