o
    ci#                     @   s  d Z ddlZz0ddlZddlmZ ddlmZ ddlmZ ddlmZ ddlm	Z	 ddlm
Z
 dd	lmZ W n eyA   ed
w ddlmZmZ ejejejejejejejejiZejejejejejejejej ej!ej"ej#ej$ej%ejej&ej'ej(ej)ej*ej+ej,ej-ej.ej/iZ0e r	ddl1Z1ddl2Z1e1j3ej"e1j4ej5e1jeje1j!ej"e1j#ej$e1j%eje1j6eje1j&ej'e1j7ej8e1j(ej)e1j*ej+e1j,ej-e1j.ej/iZ9e:edrej;e9e1j<< e1j4ej#e1jeje1j!ej!e1j#ej#e1j%ej%e1j6ej%e1j&ej&e1j7ej*e1j(ej(e1j*ej*e1j,ej,iZ=dd Z>dd Z?dd Z@dd ZAdd ZBdd ZCdd ZDdd ZEdd ZFdd  ZGd!d" ZHd#d$ ZId%d& ZJd'd( ZKd)d* ZLdS )+z!Code to wrap some NCCL API calls.    N)nccl)Deviceget_versionget_build_versionNcclCommunicator)
groupStart)groupEndz*NCCL in Ray requires Cupy being available!)ReduceOptorch_availableNCCL_BFLOAT16c                   C   s   t jj S )z+Returns the number of compute-capable GPUs.)cupycudaruntimegetDeviceCount r   r   b/home/ubuntu/.local/lib/python3.10/site-packages/ray/util/collective/collective_group/nccl_util.pyget_num_gpusV   s   r   c                   C      t  S Nr   r   r   r   r   get_nccl_build_version[      r   c                   C   r   r   r   r   r   r   r   get_nccl_runtime_version_   r   r   c                   C   s   t  S r   )r   get_unique_idr   r   r   r   get_nccl_unique_idc   s   r   c                 C   s   t | ||}|S )a(  Create an NCCL communicator using NCCL APIs.

    Args:
        world_size: the number of processes of this communicator group.
        nccl_unique_id: the NCCLUniqueID for this group.
        rank: the rank of this process.
    Returns:
        comm (nccl.ncclComm_t): an NCCL communicator.
    r   )
world_sizenccl_unique_idrankcommr   r   r   create_nccl_communicatorg   s   
r!   c                 C   s   | t vrtd| t |  S )zMap the reduce op to NCCL reduce op type.

    Args:
        reduce_op: ReduceOp Enum (SUM/PRODUCT/MIN/MAX).
    Returns:
        (nccl.ncclRedOp_t): the mapped NCCL reduce op.
    z&NCCL does not support reduce op: '{}'.)NCCL_REDUCE_OP_MAPRuntimeErrorformat)	reduce_opr   r   r   get_nccl_reduce_opu   s   r&   c                 C   sF   t | tjrt| jj S t rt | tjrt	| j S t
dt| )z2Return the corresponded NCCL dtype given a tensor.]Unsupported tensor type. Got: {}. Supported GPU tensor types are: torch.Tensor, cupy.ndarray.)
isinstancer   ndarrayNUMPY_NCCL_DTYPE_MAPdtypetyper   torchTensorTORCH_NCCL_DTYPE_MAP
ValueErrorr$   tensorr   r   r   get_nccl_tensor_dtype   s   

r3   c                 C   sB   t | tjr
| jjS t rt | tjrt| j S t	d
t| )z2Return the corresponded Cupy dtype given a tensor.r'   )r(   r   r)   r+   r,   r   r-   r.   TORCH_NUMPY_DTYPE_MAPr0   r$   r1   r   r   r   get_cupy_tensor_dtype   s   

r5   c                 C   s`   t | tjr
| jjS t | tjr| jS t r't | tjr'| j	s#t
d|  S tdt| )z@Return the pointer to the underlying memory storage of a tensor.z8Torch tensor must be on GPU when using NCCL collectives.r'   )r(   r   r)   dataptrnumpyr   r-   r.   is_cudar#   data_ptrr0   r$   r,   r1   r   r   r   get_tensor_ptr   s   
r;   c                 C   sL   t | tjst | tjr| jS t rt | tjrt| S t	d
t| )z*Return the number of elements in a tensor.r'   )r(   r   r)   r8   sizer   r-   r.   numelr0   r$   r,   r1   r   r   r   get_tensor_n_elements   s   

r>   c                 C   sF   t | tjrt| jS t rt | tjrt|  S t	d
t| )z)Return the shape of the tensor as a list.r'   )r(   r   r)   listshaper   r-   r.   r<   r0   r$   r,   r1   r   r   r   get_tensor_shape   s   

rA   c                    sP   t  tjr fdd jD S t rt  tjrt  S t	d
t )z+Return the strides of the tensor as a list.c                    s   g | ]
}t | jj qS r   )intr+   itemsize).0strider1   r   r   
<listcomp>   s    z&get_tensor_strides.<locals>.<listcomp>r'   )r(   r   r)   stridesr   r-   r.   r?   rE   r0   r$   r,   r1   r   r1   r   get_tensor_strides   s   
rH   c              
   C   s   t | tjrz| jj}W |S  ty } ztd|d}~ww t r7t | tj	r7| jj
}t |ts5td|S tdt| )z!Return the GPU index of a tensor.z!The tensor is not on a valid GPU.Nz!Unsupported tensor type. Got: {}.)r(   r   r)   deviceidAttributeErrorr#   r   r-   r.   indexrB   r0   r$   r,   )r2   rI   execr   r   r   get_tensor_device   s   
	

rN   c                 C   s   d}t | tjrt |tjrt| | nRt ret | tjr*t |tjr*| | n=t | tjrEt |tjrEtjj	
| }| | n"t | tjrbt |tjrbttjj	|}t| | nd}nd}|sutdt| t|dS )zCopy the content from src_tensor to dst_tensor.

    Args:
        dst_tensor: the tensor to copy from.
        src_tensor: the tensor to copy to.

    Returns:
        None
    TFzdUnsupported tensor type. Got: {} and {}. Supported GPU tensor types are: torch.Tensor, cupy.ndarray.N)r(   r   r)   copytor   r-   r.   copy_utilsdlpackfrom_dlpacktoDlpack
fromDlpack	to_dlpackr0   r$   r,   )
dst_tensor
src_tensorcopiedtr   r   r   copy_tensor   s8   
r[   c                 C   s.   t | tstdt| dd | D }|S )zReturns the gpu devices of the list of input tensors.

    Args:
        tensors: a list of tensors, each locates on a GPU.

    Returns:
        list: the list of GPU devices.

    zAExpect a list of tensors each locates on a GPU device. Got: '{}'.c                 S   s   g | ]}t |qS r   )rN   )rD   rZ   r   r   r   rF   $  s    z*get_tensor_device_list.<locals>.<listcomp>)r(   r?   r#   r$   r,   )tensorsdevicesr   r   r   get_tensor_device_list  s   


r^   )M__doc__r8   r   	cupy.cudar   r   cupy.cuda.ncclr   r   r	   r
   r   ImportErrorray.util.collective.typesr   r   SUMNCCL_SUMPRODUCT	NCCL_PRODMINNCCL_MINMAXNCCL_MAXr"   int_
NCCL_INT64uint8
NCCL_UINT8uint32NCCL_UINT32uint64NCCL_UINT64int8	NCCL_INT8int32
NCCL_INT32int64half	NCCL_HALFfloat16NCCL_FLOAT16float32NCCL_FLOAT32float64NCCL_FLOAT64doubleNCCL_DOUBLEr*   r-   torch.utils.dlpackboolrB   NCCL_INTlongfloat
NCCL_FLOATr/   hasattrr   bfloat16r4   r   r   r   r   r!   r&   r3   r5   r;   r>   rA   rH   rN   r[   r^   r   r   r   r   <module>   s    

)