o
    پi                     @   s  d dl Z d dlZd dlmZ d dlZddlmZ ddlm	Z	 ej
dd Zdejfd	d
ZdefddZdejfddZdejdededefddZdejdejddfddZd'ddZdefddZdefddZdee dejd ejdejfd!d"Zd'd#d$Zd'd%d&ZdS )(    N)Sequence   )env)gen_nvshmem_modulec                  C   s   t  } d }ddg}| D ]}|D ]}|| }| r|} nq|d ur% nq|d u r1td|  tj|tjd t  }|S )Nzlibnvshmem_host.sozlibnvshmem_host.so.3z=Could not find libnvshmem_host.so or libnvshmem_host.so.3 in )mode)	jit_envget_nvshmem_lib_dirsexistsFileNotFoundErrorctypesCDLLRTLD_GLOBALr   build_and_load)lib_dirslib_path	lib_nameslib_dirlib_namecandidate_pathmodule r   K/home/ubuntu/.local/lib/python3.10/site-packages/flashinfer/comm/nvshmem.pyget_nvshmem_module   s(   
r   returnc                  C   s   t  } t |  | S N)alloc_empty_unique_idr   nvshmem_get_unique_id)uidr   r   r   get_unique_id&   s   r   c                   C   
   t   S r   )r   nvshmem_unique_id_sizer   r   r   r   unique_id_size,      
r!   c                   C   s   t jt t jddS )Ncpu)dtypedevice)torchzerosr!   uint8r   r   r   r   r   0   s   r   r   rank
world_sizec                 C   s   t  | ||}tj  |S r   )r   nvshmem_initr&   cudasynchronize)r   r)   r*   statusr   r   r   init4   s   
r/   destsourcec                 C   s   t  | |S r   )r   nvshmem_alltoall)r0   r1   r   r   r   alltoall:      r3   c                   C   s   t j  t   d S r   )r&   r,   r-   r   nvshmem_finalizer   r   r   r   finalize>   s   
r6   c                   C   r   r   )r   nvshmem_my_per   r   r   r   my_peC   r"   r8   c                   C   r   r   )r   nvshmem_n_pesr   r   r   r   n_pesG   r"   r:   shaper$   r%   c                 C   s   t  | ||}t|S )a  Allocates memory using NVSHMEM collective malloc operation.

    This is a collective operation that requires participation by all PEs (Processing Elements).
    All participants must call this function with the same parameters.

    Note: This tensor should be explicitly deleted (del tensor) to ensure proper ordering
    of nvshmem_free operations rather than relying on garbage collection.

    Args:
        shape: The shape of the tensor to allocate.
        dtype: The data type of the tensor.
        device: The device to allocate the tensor on.

    Returns:
        A tensor allocated using NVSHMEM collective malloc.

    Reference:
        https://docs.nvidia.com/nvshmem/api/gen/api/memory.html#nvshmem-malloc-nvshmem-free-nvshmem-align
    )r   nvshmem_mallocr&   from_dlpack)r;   r$   r%   outputr   r   r   mallocK   s   
r?   c                   C      t    d S r   )r   nvshmem_barrier_allr   r   r   r   barrier_allh   r4   rB   c                   C   r@   r   )r   %nvshmem_barrier_all_on_current_streamr   r   r   r   barrier_all_on_current_streaml   r4   rD   )r   N)r   	functoolstypingr   r&   jitr   r   jit.commr   cacher   Tensorr   intr!   r   r/   r3   r6   r8   r:   r$   r%   r?   rB   rD   r   r   r   r   <module>   s6    



