o
    װi6                     @   s  d dl mZ d dlmZ d dlmZ d dlmZ d dlm	Z
 d dlmZ d dlmZ g dZd	d
dZdd ZG dd dejZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZedZedZe Ze Ze Ze Z d S )!    )runtime)_compile)_cuda_types)BuiltinFunc)Constant)Data)wraps_class_method)	this_gridthis_thread_blocksyncwait
wait_priormemcpy_asyncz[#include <cuda/barrier>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
z,#include <cooperative_groups/memcpy_async.h>)cgcg_memcpy_asyncc                 C   sH   t | jd| }|du r"| jjt|  t| jd| d d S d S )Ninclude_FT)getattr	generatedcodesappend_header_to_codesetattr)envheaderflag r   @/home/ubuntu/.local/lib/python3.10/site-packages/cupyx/jit/cg.py_check_include   s
   r   c                   @   s,   e Zd ZdZdZdd Zdd Zdd ZdS )	_ThreadGroupz( Base class for all cooperative groups. Nc                 C   s   t N)NotImplementedErrorselfr   r   r   __init__&   s   z_ThreadGroup.__init__c                 C   s   | j  S r   
child_typer!   r   r   r   __str__)   s   z_ThreadGroup.__str__c                 C      t |d t|j dtjS )Nr   z.sync()r   _Datacoder   voidr"   r   instancer   r   r   _sync,   s   
z_ThreadGroup._sync)__name__
__module____qualname____doc__r%   r#   r&   r.   r   r   r   r   r   !   s    r   c                       s   e Zd ZdZdd Zedd Ze fddZedd	 Zed
d Z	edd Z
edd Zedd Zedd Zedd Zedd Z  ZS )
_GridGroupa  A handle to the current grid group. Must be created via :func:`this_grid`.

    .. seealso:: `CUDA Grid Group API`_, :class:`numba.cuda.cg.GridGroup`

    .. _CUDA Grid Group API:
        https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#grid-group-cg
    c                 C   
   d| _ d S )Nzcg::grid_groupr$   r!   r   r   r   r#   :      
z_GridGroup.__init__c                 C   r'   )zU
        is_valid()

        Returns whether the grid_group can synchronize.
        r   z.is_valid())r   r)   r*   r   bool_r,   r   r   r   is_valid=      
z_GridGroup.is_validc                    s   d|j _t ||S )z
        sync()

        Synchronize the threads named in the group.

        .. seealso:: :meth:`numba.cuda.cg.GridGroup.sync`
        T)r   	enable_cgsuperr.   r,   	__class__r   r   r   G   s   z_GridGroup.syncc                 C   r'   z`
        thread_rank()

        Rank of the calling thread within ``[0, num_threads)``.
        r   z.thread_rank()r   r)   r*   r   uint64r,   r   r   r   thread_rankU   r8   z_GridGroup.thread_rankc                 C   2   t  dk r
tdt|d t|j dtjS )z]
        block_rank()

        Rank of the calling block within ``[0, num_blocks)``.
        4+  z'block_rank() is supported on CUDA 11.6+r   z.block_rank()_runtime_getLocalRuntimeVersionRuntimeErrorr   r)   r*   r   r?   r,   r   r   r   
block_rank_      
z_GridGroup.block_rankc                 C   rA   zN
        num_threads()

        Total number of threads in the group.
        rB   z(num_threads() is supported on CUDA 11.6+r   z.num_threads()rC   r,   r   r   r   num_threadsk   rH   z_GridGroup.num_threadsc                 C   rA   )zL
        num_blocks()

        Total number of blocks in the group.
        rB   z'num_blocks() is supported on CUDA 11.6+r   z.num_blocks()rC   r,   r   r   r   
num_blocksw   rH   z_GridGroup.num_blocksc                 C   rA   )z[
        dim_blocks()

        Dimensions of the launched grid in units of blocks.
        rB   z'dim_blocks() is supported on CUDA 11.6+r   z.dim_blocks()rD   rE   rF   r   r)   r*   r   dim3r,   r   r   r   
dim_blocks   rH   z_GridGroup.dim_blocksc                 C   rA   )zc
        block_index()

        3-Dimensional index of the block within the launched grid.
        rB   z(block_index() is supported on CUDA 11.6+r   z.block_index()rL   r,   r   r   r   block_index   rH   z_GridGroup.block_indexc                 C   r'   zG
        size()

        Total number of threads in the group.
        r   z.size()r>   r,   r   r   r   size      
z_GridGroup.sizec                 C   r'   )zZ
        group_dim()

        Dimensions of the launched grid in units of blocks.
        r   .group_dim()r   r)   r*   r   rM   r,   r   r   r   	group_dim   rR   z_GridGroup.group_dim)r/   r0   r1   r2   r#   _wraps_class_methodr7   r   r@   rG   rJ   rK   rN   rO   rQ   rU   __classcell__r   r   r;   r   r3   1   s.    
	
	






r3   c                       s   e Zd ZdZdd Ze fddZedd Zedd	 Zed
d Z	edd Z
edd Zedd Zedd Z  ZS )_ThreadBlockGroupa  A handle to the current thread block group. Must be
    created via :func:`this_thread_block`.

    .. seealso:: `CUDA Thread Block Group API`_

    .. _CUDA Thread Block Group API:
        https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#thread-block-group-cg
    c                 C   r4   )Nzcg::thread_blockr$   r!   r   r   r   r#      r5   z_ThreadBlockGroup.__init__c                    s   t  ||S )zM
        sync()

        Synchronize the threads named in the group.
        )r:   r.   r,   r;   r   r   r      s   z_ThreadBlockGroup.syncc                 C   r'   r=   r   r)   r*   r   uint32r,   r   r   r   r@      r8   z_ThreadBlockGroup.thread_rankc                 C   r'   )zc
        group_index()

        3-Dimensional index of the block within the launched grid.
        r   z.group_index()rT   r,   r   r   r   group_index   r8   z_ThreadBlockGroup.group_indexc                 C   r'   )zf
        thread_index()

        3-Dimensional index of the thread within the launched block.
        r   z.thread_index()rT   r,   r   r   r   thread_index   r8   z_ThreadBlockGroup.thread_indexc                 C   rA   )z^
        dim_threads()

        Dimensions of the launched block in units of threads.
        rB   z(dim_threads() is supported on CUDA 11.6+r   z.dim_threads()rL   r,   r   r   r   dim_threads   rH   z_ThreadBlockGroup.dim_threadsc                 C   rA   rI   )rD   rE   rF   r   r)   r*   r   rZ   r,   r   r   r   rJ      rH   z_ThreadBlockGroup.num_threadsc                 C   r'   rP   rY   r,   r   r   r   rQ      rR   z_ThreadBlockGroup.sizec                 C   r'   )z\
        group_dim()

        Dimensions of the launched block in units of threads.
        r   rS   rT   r,   r   r   r   rU   	  rR   z_ThreadBlockGroup.group_dim)r/   r0   r1   r2   r#   rV   r   r@   r[   r\   r]   rJ   rQ   rU   rW   r   r   r;   r   rX      s&    	
	
	
	



rX   c                       s,   e Zd Zdd Z fddZdd Z  ZS )_ThisCgGroupc                 C   sf   |dkr	d}d}n|dkrd}d}nt || _d| d| d	| d
| _|dkr1|  jd7  _d S d S )Ngridz
grid groupr3   thread_blockzthread block grouprX   z
        Returns the current z (:class:`~cupyx.jit.cg.z/`).

        .. seealso:: :class:`cupyx.jit.cg.`z!, :func:`numba.cuda.cg.this_grid`)r    
group_typer2   )r"   rb   nametypenamer   r   r   r#     s$   
z_ThisCgGroup.__init__c                    s   t    d S r   r:   __call__r!   r;   r   r   rf   (  s   z_ThisCgGroup.__call__c                 C   sD   t jrtd| jdkrt }n| jdkrt }td| j d|S )Nz)cooperative group is not supported on HIPr_   r`   z	cg::this_z())rD   is_hiprF   rb   r3   rX   r)   )r"   r   cg_typer   r   r   
call_const+  s   

z_ThisCgGroup.call_const)r/   r0   r1   r#   rf   ri   rW   r   r   r;   r   r^     s    r^   c                       $   e Zd Z fddZdd Z  ZS )_Syncc                       t    dS )a  Calls ``cg::sync()``.

        Args:
            group: a valid cooperative group

        .. seealso:: `cg::sync`_

        .. _cg::sync:
            https://docs.nvidia.com/cuda/archive/11.6.0/cuda-c-programming-guide/index.html#collectives-cg-sync
        Nre   r"   groupr;   r   r   rf   7     z_Sync.__call__c                 C   s4   t |jts
tdt|d td|j dtjS )Nz'group must be a valid cooperative groupr   z	cg::sync())	
isinstancectyper   
ValueErrorr   r)   r*   r   r+   r"   r   rn   r   r   r   callD  s   
z
_Sync.callr/   r0   r1   rf   ru   rW   r   r   r;   r   rk   5      rk   c                       s0   e Zd Zdd fdd
ZddddZ  ZS )_MemcpySyncN)aligned_sizec                   rl   )a  Calls ``cg::memcpy_sync()``.

        Args:
            group: a valid cooperative group
            dst: the destination array that can be viewed as a 1D
                C-contiguous array
            dst_idx: the start index of the destination array element
            src: the source array that can be viewed as a 1D C-contiguous
                array
            src_idx: the start index of the source array element
            size (int): the number of bytes to be copied from
                ``src[src_index]`` to ``dst[dst_idx]``
            aligned_size (int): Use ``cuda::aligned_size_t<N>`` to guarantee
                the compiler that ``src``/``dst`` are at least N-bytes aligned.
                The behavior is undefined if the guarantee is not held.

        .. seealso:: `cg::memcpy_sync`_

        .. _cg::memcpy_sync:
            https://docs.nvidia.com/cuda/archive/11.6.0/cuda-c-programming-guide/index.html#collectives-cg-memcpy-async
        Nre   )r"   rn   dstdst_idxsrcsrc_idxrQ   ry   r;   r   r   rf   M  s   z_MemcpySync.__call__c             
   C   s   t |d t |d t||}t||}||fD ]}	t|	jtjtjfs*tdqt	
|||}t	
|||}t	|tjd|}t||}|j }
|rat|tsWtdd|j d|
 d}
td	|j d
|j d|j d|
 d	tjS )Nr   r   zdst/src must be of array type.	same_kindz,aligned_size must be a compile-time constantzcuda::aligned_size_t<>(rp   zcg::memcpy_async(z, &(z), &(z), )r   r)   initrq   rr   r   CArrayPtr	TypeErrorr   	_indexing_astype_scalarrZ   r*   	_Constantrs   objr+   )r"   r   rn   rz   r{   r|   r}   rQ   ry   arr	size_coder   r   r   ru   f  s@   




z_MemcpySync.callrv   r   r   r;   r   rx   K  s
    rx   c                       rj   )_Waitc                    rl   )a  Calls ``cg::wait()``.

        Args:
            group: a valid cooperative group

        .. seealso: `cg::wait`_

        .. _cg::wait:
            https://docs.nvidia.com/cuda/archive/11.6.0/cuda-c-programming-guide/index.html#collectives-cg-wait
        Nre   rm   r;   r   r   rf     ro   z_Wait.__call__c                 C   s    t |d td|j dtjS )Nr   z	cg::wait(rp   r(   rt   r   r   r   ru     s   
z
_Wait.callrv   r   r   r;   r   r     rw   r   c                       rj   )
_WaitPriorc                    rl   )aX  Calls ``cg::wait_prior<N>()``.

        Args:
            group: a valid cooperative group
            step (int): wait for the first ``N`` steps to finish

        .. seealso: `cg::wait_prior`_

        .. _cg::wait_prior:
            https://docs.nvidia.com/cuda/archive/11.6.0/cuda-c-programming-guide/index.html#collectives-cg-wait
        Nre   rm   r;   r   r   rf     s   z_WaitPrior.__call__c                 C   s:   t |d t|tstdtd|j d|j dtjS )Nr   z$step must be a compile-time constantzcg::wait_prior<r   rp   )	r   rq   r   rs   r)   r   r*   r   r+   )r"   r   rn   stepr   r   r   ru     s   

z_WaitPrior.callrv   r   r   r;   r   r     s    r   r_   r`   N)!	cupy.cudar   rD   	cupyx.jitr   r   cupyx.jit._internal_typesr   _BuiltinFuncr   r   r   r)   r   rV   __all__r   r   TypeBaser   r3   rX   r^   rk   rx   r   r   r	   r
   r   r   r   r   r   r   r   r   <module>   s6     c :
