o
    X۷i7                     @  s(  d dl mZ d dlmZ d dlmZ d dlmZ d dlm	Z
 d dlmZ d dlmZ d dlmZ g d	Zd
ddZdd ZG dd dejZG dd deZG dd deZG dd de
ZG dd de
ZG dd de
ZG dd de
ZG dd de
ZedZed Ze Ze Z e Z!e Z"d!S )"    )annotations)runtime)_compile)_cuda_types)BuiltinFunc)Constant)Data)wraps_class_method)	this_gridthis_thread_blocksyncwait
wait_priormemcpy_asyncz[#include <cuda/barrier>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
z,#include <cooperative_groups/memcpy_async.h>)cgcg_memcpy_asyncc                 C  sH   t | jd| }|du r"| jjt|  t| jd| d d S d S )Ninclude_FT)getattr	generatedcodesappend_header_to_codesetattr)envheaderflag r   B/home/ubuntu/vllm_env/lib/python3.10/site-packages/cupyx/jit/cg.py_check_include   s
   r   c                   @  s,   e Zd ZdZdZdd Zdd Zdd ZdS )	_ThreadGroupz( Base class for all cooperative groups. Nc                 C  s   t N)NotImplementedErrorselfr   r   r   __init__(   s   z_ThreadGroup.__init__c                 C  s   | j  S r    
child_typer"   r   r   r   __str__+   s   z_ThreadGroup.__str__c                 C     t |d t|j dtjS )Nr   z.sync()r   _Datacoder   voidr#   r   instancer   r   r   _sync.   s   
z_ThreadGroup._sync)__name__
__module____qualname____doc__r&   r$   r'   r/   r   r   r   r   r   #   s    r   c                      s   e Zd ZdZdd Zedd Ze fddZedd	 Zed
d Z	edd Z
edd Zedd Zedd Zedd Zedd Z  ZS )
_GridGroupa  A handle to the current grid group. Must be created via :func:`this_grid`.

    .. seealso:: `CUDA Grid Group API`_, :class:`numba.cuda.cg.GridGroup`

    .. _CUDA Grid Group API:
        https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#grid-group-cg
    c                 C  
   d| _ d S )Nzcg::grid_groupr%   r"   r   r   r   r$   <      
z_GridGroup.__init__c                 C  r(   )zU
        is_valid()

        Returns whether the grid_group can synchronize.
        r   z.is_valid())r   r*   r+   r   bool_r-   r   r   r   is_valid?      
z_GridGroup.is_validc                   s   d|j _t ||S )z
        sync()

        Synchronize the threads named in the group.

        .. seealso:: :meth:`numba.cuda.cg.GridGroup.sync`
        T)r   	enable_cgsuperr/   r-   	__class__r   r   r   I   s   z_GridGroup.syncc                 C  r(   z`
        thread_rank()

        Rank of the calling thread within ``[0, num_threads)``.
        r   z.thread_rank()r   r*   r+   r   uint64r-   r   r   r   thread_rankW   r9   z_GridGroup.thread_rankc                 C  2   t  dk r
tdt|d t|j dtjS )z]
        block_rank()

        Rank of the calling block within ``[0, num_blocks)``.
        4+  z'block_rank() is supported on CUDA 11.6+r   z.block_rank()_runtime_getLocalRuntimeVersionRuntimeErrorr   r*   r+   r   r@   r-   r   r   r   
block_ranka      
z_GridGroup.block_rankc                 C  rB   zN
        num_threads()

        Total number of threads in the group.
        rC   z(num_threads() is supported on CUDA 11.6+r   z.num_threads()rD   r-   r   r   r   num_threadsm   rI   z_GridGroup.num_threadsc                 C  rB   )zL
        num_blocks()

        Total number of blocks in the group.
        rC   z'num_blocks() is supported on CUDA 11.6+r   z.num_blocks()rD   r-   r   r   r   
num_blocksy   rI   z_GridGroup.num_blocksc                 C  rB   )z[
        dim_blocks()

        Dimensions of the launched grid in units of blocks.
        rC   z'dim_blocks() is supported on CUDA 11.6+r   z.dim_blocks()rE   rF   rG   r   r*   r+   r   dim3r-   r   r   r   
dim_blocks   rI   z_GridGroup.dim_blocksc                 C  rB   )zc
        block_index()

        3-Dimensional index of the block within the launched grid.
        rC   z(block_index() is supported on CUDA 11.6+r   z.block_index()rM   r-   r   r   r   block_index   rI   z_GridGroup.block_indexc                 C  r(   zG
        size()

        Total number of threads in the group.
        r   z.size()r?   r-   r   r   r   size      
z_GridGroup.sizec                 C  r(   )zZ
        group_dim()

        Dimensions of the launched grid in units of blocks.
        r   .group_dim()r   r*   r+   r   rN   r-   r   r   r   	group_dim   rS   z_GridGroup.group_dim)r0   r1   r2   r3   r$   _wraps_class_methodr8   r   rA   rH   rK   rL   rO   rP   rR   rV   __classcell__r   r   r<   r   r4   3   s.    
	
	






r4   c                      s   e Zd ZdZdd Ze fddZedd Zedd	 Zed
d Z	edd Z
edd Zedd Zedd Z  ZS )_ThreadBlockGroupa  A handle to the current thread block group. Must be
    created via :func:`this_thread_block`.

    .. seealso:: `CUDA Thread Block Group API`_

    .. _CUDA Thread Block Group API:
        https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#thread-block-group-cg
    c                 C  r5   )Nzcg::thread_blockr%   r"   r   r   r   r$      r6   z_ThreadBlockGroup.__init__c                   s   t  ||S )zM
        sync()

        Synchronize the threads named in the group.
        )r;   r/   r-   r<   r   r   r      s   z_ThreadBlockGroup.syncc                 C  r(   r>   r   r*   r+   r   uint32r-   r   r   r   rA      r9   z_ThreadBlockGroup.thread_rankc                 C  r(   )zc
        group_index()

        3-Dimensional index of the block within the launched grid.
        r   z.group_index()rU   r-   r   r   r   group_index   r9   z_ThreadBlockGroup.group_indexc                 C  r(   )zf
        thread_index()

        3-Dimensional index of the thread within the launched block.
        r   z.thread_index()rU   r-   r   r   r   thread_index   r9   z_ThreadBlockGroup.thread_indexc                 C  rB   )z^
        dim_threads()

        Dimensions of the launched block in units of threads.
        rC   z(dim_threads() is supported on CUDA 11.6+r   z.dim_threads()rM   r-   r   r   r   dim_threads   rI   z_ThreadBlockGroup.dim_threadsc                 C  rB   rJ   )rE   rF   rG   r   r*   r+   r   r[   r-   r   r   r   rK      rI   z_ThreadBlockGroup.num_threadsc                 C  r(   rQ   rZ   r-   r   r   r   rR      rS   z_ThreadBlockGroup.sizec                 C  r(   )z\
        group_dim()

        Dimensions of the launched block in units of threads.
        r   rT   rU   r-   r   r   r   rV     rS   z_ThreadBlockGroup.group_dim)r0   r1   r2   r3   r$   rW   r   rA   r\   r]   r^   rK   rR   rV   rX   r   r   r<   r   rY      s&    	
	
	
	



rY   c                      s,   e Zd Zdd Z fddZdd Z  ZS )_ThisCgGroupc                 C  sf   |dkr	d}d}n|dkrd}d}nt || _d| d| d	| d
| _|dkr1|  jd7  _d S d S )Ngridz
grid groupr4   thread_blockzthread block grouprY   z
        Returns the current z (:class:`~cupyx.jit.cg.z/`).

        .. seealso:: :class:`cupyx.jit.cg.`z!, :func:`numba.cuda.cg.this_grid`)r!   
group_typer3   )r#   rc   nametypenamer   r   r   r$     s$   
z_ThisCgGroup.__init__c                   s   t    d S r    r;   __call__r"   r<   r   r   rg   *  s   z_ThisCgGroup.__call__c                 C  sD   t jrtd| jdkrt }n| jdkrt }td| j d|S )Nz)cooperative group is not supported on HIPr`   ra   z	cg::this_z())rE   is_hiprG   rc   r4   rY   r*   )r#   r   cg_typer   r   r   
call_const-  s   

z_ThisCgGroup.call_const)r0   r1   r2   r$   rg   rj   rX   r   r   r<   r   r_     s    r_   c                      $   e Zd Z fddZdd Z  ZS )_Syncc                      t    dS )a  Calls ``cg::sync()``.

        Args:
            group: a valid cooperative group

        .. seealso:: `cg::sync`_

        .. _cg::sync:
            https://docs.nvidia.com/cuda/archive/11.6.0/cuda-c-programming-guide/index.html#collectives-cg-sync
        Nrf   r#   groupr<   r   r   rg   9     z_Sync.__call__c                 C  s4   t |jts
tdt|d td|j dtjS )Nz'group must be a valid cooperative groupr   z	cg::sync())	
isinstancectyper   
ValueErrorr   r*   r+   r   r,   r#   r   ro   r   r   r   callF  s   
z
_Sync.callr0   r1   r2   rg   rv   rX   r   r   r<   r   rl   7      rl   c                      s0   e Zd Zdd fdd
ZddddZ  ZS )_MemcpySyncN)aligned_sizec                  rm   )a  Calls ``cg::memcpy_sync()``.

        Args:
            group: a valid cooperative group
            dst: the destination array that can be viewed as a 1D
                C-contiguous array
            dst_idx: the start index of the destination array element
            src: the source array that can be viewed as a 1D C-contiguous
                array
            src_idx: the start index of the source array element
            size (int): the number of bytes to be copied from
                ``src[src_index]`` to ``dst[dst_idx]``
            aligned_size (int): Use ``cuda::aligned_size_t<N>`` to guarantee
                the compiler that ``src``/``dst`` are at least N-bytes aligned.
                The behavior is undefined if the guarantee is not held.

        .. seealso:: `cg::memcpy_sync`_

        .. _cg::memcpy_sync:
            https://docs.nvidia.com/cuda/archive/11.6.0/cuda-c-programming-guide/index.html#collectives-cg-memcpy-async
        Nrf   )r#   ro   dstdst_idxsrcsrc_idxrR   rz   r<   r   r   rg   O  s   z_MemcpySync.__call__c             
   C  s   t |d t |d t||}t||}||fD ]}	t|	jtjtjfs*tdqt	
|||}t	
|||}t	|tjd|}t||}|j }
|rat|tsWtdd|j d|
 d}
td	|j d
|j d|j d|
 d	tjS )Nr   r   zdst/src must be of array type.	same_kindz,aligned_size must be a compile-time constantzcuda::aligned_size_t<>(rq   zcg::memcpy_async(z, &(z), &(z), )r   r*   initrr   rs   r   CArrayPtr	TypeErrorr   	_indexing_astype_scalarr[   r+   	_Constantrt   objr,   )r#   r   ro   r{   r|   r}   r~   rR   rz   arr	size_coder   r   r   rv   h  s@   




z_MemcpySync.callrw   r   r   r<   r   ry   M  s
    ry   c                      rk   )_Waitc                   rm   )a  Calls ``cg::wait()``.

        Args:
            group: a valid cooperative group

        .. seealso: `cg::wait`_

        .. _cg::wait:
            https://docs.nvidia.com/cuda/archive/11.6.0/cuda-c-programming-guide/index.html#collectives-cg-wait
        Nrf   rn   r<   r   r   rg     rp   z_Wait.__call__c                 C  s    t |d td|j dtjS )Nr   z	cg::wait(rq   r)   ru   r   r   r   rv     s   
z
_Wait.callrw   r   r   r<   r   r     rx   r   c                      rk   )
_WaitPriorc                   rm   )aX  Calls ``cg::wait_prior<N>()``.

        Args:
            group: a valid cooperative group
            step (int): wait for the first ``N`` steps to finish

        .. seealso: `cg::wait_prior`_

        .. _cg::wait_prior:
            https://docs.nvidia.com/cuda/archive/11.6.0/cuda-c-programming-guide/index.html#collectives-cg-wait
        Nrf   rn   r<   r   r   rg     s   z_WaitPrior.__call__c                 C  s:   t |d t|tstdtd|j d|j dtjS )Nr   z$step must be a compile-time constantzcg::wait_prior<r   rq   )	r   rr   r   rt   r*   r   r+   r   r,   )r#   r   ro   stepr   r   r   rv     s   

z_WaitPrior.callrw   r   r   r<   r   r     s    r   r`   ra   N)#
__future__r   	cupy.cudar   rE   	cupyx.jitr   r   cupyx.jit._internal_typesr   _BuiltinFuncr   r   r   r*   r	   rW   __all__r   r   TypeBaser   r4   rY   r_   rl   ry   r   r   r
   r   r   r   r   r   r   r   r   r   <module>   s8     c :
