o
    X۷iC                     @  s|  U d dl mZ d dlmZ d dlmZ d dlZd dlZd dlm	Z	 d dl
mZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlmZ G dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd  d eZG d!d" d"eZ G d#d$ d$eZ!e"e e#e e$e e%e iZ&d%e'd&< e Z(e Z)e Z*e Z+ed'Z,ed(Z-e! Z.ed)d*e	j/rd+nd, Z0ed-d.Z1ed/d0Z2ed1d2Z3ed3d2Z4ed4d5Z5ed6d5Z6ed7d2e	j/rd+nd8 Z7ed9d2Z8ed:d2Z9ed;d2Z:d<e	j/r%d+nd= Z;e d>e;Z<e d?e;Z=e d@e;Z>e dAe;Z?dS )B    )annotations)Any)MappingN)runtime)device)_cuda_types)_cuda_typerules)BuiltinFunc)Data)Constant)Range)_compile)reducec                      s0   e Zd Zdd fdd
ZddddZ  ZS )	RangeFuncNunrollc                     t    dS )a  Range with loop unrolling support.

        Args:
            start (int):
                Same as that of built-in :obj:`range`.
            stop (int):
                Same as that of built-in :obj:`range`.
            step (int):
                Same as that of built-in :obj:`range`.
            unroll (int or bool or None):

                - If `True`, add ``#pragma unroll`` directive before the
                  loop.
                - If `False`, add ``#pragma unroll(1)`` directive before
                  the loop to disable unrolling.
                - If an `int`, add ``#pragma unroll(n)`` directive before
                  the loop, where the integer ``n`` means the number of
                  iterations to unroll.
                - If `None` (default), leave the control of loop unrolling
                  to the compiler (no ``#pragma``).

        .. seealso:: `#pragma unroll`_

        .. _#pragma unroll:
            https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#pragma-unroll
        Nsuper__call__)selfr   args	__class__ N/home/ubuntu/vllm_env/lib/python3.10/site-packages/cupyx/jit/_builtin_funcs.pyr      s   zRangeFunc.__call__c          	      G  s  t |dkr
tdt |dkrtd|d td}}}n*t |dkr4|d |d td}}}nt |dkr@|\}}}n	tdt | |d urtdd ||||fD s^td	|j}t|tsut|tsutd
t|j	 |du r{d}|du sd|  k rdk sn t
d t|tr|jdk}n|jjjdkrd}nd }t||}t||}t||}|jjjdvrtd|jjjdvrtd|jjjdvrtd|jdkrtt}n|jdkr|j}nJ t||||||dS )Nr   z)range expected at least 1 argument, got 0         z'range expected at most 3 argument, got c                 s  s    | ]}t |tV  qd S N)
isinstancer   ).0xr   r   r   	<genexpr>C   s    z!RangeFunc.call.<locals>.<genexpr>zCloop unrolling requires constant start, stop, step and unroll valuez-unroll value expected to be of type int, got FTl        zUloop unrolling is ignored as the unroll value is non-positive or greater than INT_MAXuiuz%range supports only for integer type.numpycudar   )len	TypeErrorr   allobjr    intbooltype__name__warningswarnctypedtypekindr
   initmoder   Scalarr   )	r   envr   r   startstopstepstep_is_positiver2   r   r   r   call5   sf   



zRangeFunc.callr/   
__module____qualname__r   r=   __classcell__r   r   r   r   r      s    r   c                   @     e Zd Zdd ZdS )LenFuncc                 O  sr   t |dkrtdt | |rtd|d }t|jtjs$td|jjs,tdtd|j dt	d	S )
Nr   z#len() expects only 1 argument, got #keyword arguments are not supportedr   zlen() supports only array typezlen() of unsized arrayzstatic_cast<long long>(z.shape()[0])q)
r(   r)   r    r2   r   CArrayndimr
   coder7   )r   r8   r   kwdsargr   r   r   r=   r   s   zLenFunc.callNr/   r?   r@   r=   r   r   r   r   rC   p       rC   c                   @  rB   )MinFuncc                   <   t |dk rtdt | |rtdt fdd|S )Nr   z(min() expects at least 2 arguments, got rD   c                      t tj| |fd  S r   )r   _call_ufunccupyminimumabr8   r   r   <lambda>       zMinFunc.call.<locals>.<lambda>r(   r)   r   r   r8   r   rI   r   rV   r   r=         zMinFunc.callNrK   r   r   r   r   rM      rL   rM   c                   @  rB   )MaxFuncc                   rN   )Nr   z(max() expects at least 2 arguments, got rD   c                   rO   r   )r   rP   rQ   maximumrS   rV   r   r   rW      rX   zMaxFunc.call.<locals>.<lambda>rY   rZ   r   rV   r   r=      r[   zMaxFunc.callNrK   r   r   r   r   r\      rL   r\   c                      s$   e Zd Z fddZdd Z  ZS )SyncThreadsc                   r   )zCalls ``__syncthreads()``.

        .. seealso:: `Synchronization functions`_

        .. _Synchronization functions:
            https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#synchronization-functions
        Nr   r   r   r   r   r         zSyncThreads.__call__c                 C  s   t dtjS )Nz__syncthreads())r
   r   voidr   r8   r   r   r   
call_const   s   zSyncThreads.call_constr/   r?   r@   r   rc   rA   r   r   r   r   r^      s    
r^   c                      s0   e Zd Zdd fdd
ZddddZ  ZS )SyncWarp    )maskc                  r   )a:  Calls ``__syncwarp()``.

        Args:
            mask (int): Active threads in a warp. Default is 0xffffffff.

        .. seealso:: `Synchronization functions`_

        .. _Synchronization functions:
            https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#synchronization-functions
        Nr   )r   rg   r   r   r   r         zSyncWarp.__call__Nc                C  s   t jr|d urtd| dt d }|rDt|tr-d|j  kr(dks-td tdt	
|tjd|}t||}d|j d}nd	}t|tjS )
Nmask  is ignored on HIPr   rf   mask is out of range	same_kindz__syncwarp()z__syncwarp())r   is_hipr0   r1   RuntimeWarningr    r   r+   
ValueErrorr   _astype_scalarr   int32r
   r5   rH   ra   )r   r8   rg   rH   r   r   r   r=      s"   

zSyncWarp.callr>   r   r   r   r   re      s    re   c                      s(   e Zd Zd fdd	ZdddZ  ZS )SharedMemoryNc                   r   )a  Allocates shared memory and returns it as a 1-D array.

        Args:
            dtype (dtype):
                The dtype of the returned array.
            size (int or None):
                If ``int`` type, the size of static shared memory.
                If ``None``, declares the shared memory with extern specifier.
            alignment (int or None): Enforce the alignment via __align__(N).
        Nr   )r   r3   size	alignmentr   r   r   r      rh   zSharedMemory.__call__c                 C  sN   |j dd}t|}t|t|||}||j|< ||j|< t|t|S )N_smem)prefix)	get_fresh_variable_namer   to_ctyper
   r   	SharedMemdeclslocalsPtr)r   r8   r3   rt   ru   namer2   varr   r   r   rc      s   


zSharedMemory.call_constr   rd   r   r   r   r   rs      s    rs   c                      s0   e Zd Zdd Zd fdd	ZdddZ  ZS )	AtomicOpc                 C  s.   || _ d| | _|| _d| j d}|| _d S )NatomicCalls the ``a  `` function to operate atomically on
        ``array[index]``. Please refer to `Atomic Functions`_ for detailed
        explanation.

        Args:
            array: A :class:`cupy.ndarray` to index over.
            index: A valid index such that the address to the corresponding
                array element ``array[index]`` can be computed.
            value: Represent the value to use for the specified operation. For
                the case of :obj:`atomic_cas`, this is the value for
                ``array[index]`` to compare with.
            alt_value: Only used in :obj:`atomic_cas` to represent the value
                to swap to.

        .. seealso:: `Numba's corresponding atomic functions`_

        .. _Atomic Functions:
            https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions

        .. _Numba's corresponding atomic functions:
            https://numba.readthedocs.io/en/stable/cuda-reference/kernel.html#synchronization-and-atomic-operations
        _op_name_dtypes__doc__r   opdtypesdocr   r   r   __init__   s
   

zAtomicOp.__init__Nc                      t    d S r   r   )r   arrayindexvalue	alt_valuer   r   r   r         zAtomicOp.__call__c                 C  s2  | j }| j}t||}t|jtjtjfst	dt
|||}|j}	|	jj| jvr7t	d| d|	j dt
||	d|}t||}|dkr|d usOJ |	jjdkratt dk ratd	t
||	d|}t||}| d
|j d|j d|j d}
n|d u sJ | d
|j d|j d}
t|
|	S )Nz)The first argument must be of array type.`` does not support  input.rl   CASHF   z5uint16 atomic operation is not supported before sm_70z(&, rm   )r   r   r
   r5   r    r2   r   rF   r}   r)   r   	_indexingr3   r~   r   rq   charr,   r   get_compute_capabilityRuntimeErrorrH   )r   r8   r   r   r   value2r~   r   targetr2   rH   r   r   r   r=     s0   $
zAtomicOp.callr   r/   r?   r@   r   r   r=   rA   r   r   r   r   r      s    r   c                      s,   e Zd Zdd Z fddZdd Z  ZS )GridFuncc                 C  st   |dkrd| _ d| _d| _d| _n|dkr"d| _ d| _d	| _d
| _ntdd| j  d| j d| j d}|| _d S )Ngridz%Compute the thread index in the grid.z1jit.threadIdx.x + jit.blockIdx.x * jit.blockDim.xznumba.cuda.gridz+threadIdx.{n} + blockIdx.{n} * blockDim.{n}gridsizezCompute the grid size.zjit.blockDim.x * jit.gridDim.xznumba.cuda.gridsizezblockDim.{n} * gridDim.{n}zunsupported functionz        zH

        Computation of the first integer is as follows::

            a  

        and for the other two integers the ``y`` and ``z`` attributes are used.

        Args:
            ndim (int): The dimension of the grid. Only 1, 2, or 3 is allowed.

        Returns:
            int or tuple:
                If ``ndim`` is 1, an integer is returned, otherwise a tuple.

        .. note::
            This function follows the convention of Numba's
            :func:`z`.
        )_desc_eq_link_coderp   r   )r   r6   r   r   r   r   r   #  s"   

zGridFunc.__init__c                   r   r   r   )r   rG   r   r   r   r   F  r   zGridFunc.__call__c                   s   t |ts	td|dkrt jjddtjS |dkrd}n|dkr&d}ntd	d
	 fdd|D }t
tjg| }|dkrLtd| d|S td| d|S )Nzndim must be an integerr   r"   nr   )r"   yr   )r"   r   zzOnly ndim=1,2,3 are supportedr   c                 3  s    | ]
} j j|d V  qdS )r   N)r   format)r!   r   r_   r   r   r#   X  s    z&GridFunc.call_const.<locals>.<genexpr>zSTD::make_pair(rm   zSTD::make_tuple()r    r,   r)   r
   r   r   r   uint32rp   joinTuple)r   r8   rG   dims	elts_coder2   r   r_   r   rc   I  s   
zGridFunc.call_const)r/   r?   r@   r   r   rc   rA   r   r   r   r   r   !  s    #r   c                      s8   e Zd Zdd Zdd fdd
Zdddd	Z  ZS )
WarpShuffleOpc                 C  s>   || _ d|r
|d nd d | _|| _d| j d}|| _d S )N__shfl__ syncr   z`` function. Please refer to
        `Warp Shuffle Functions`_ for detailed explanation.

        .. _Warp Shuffle Functions:
            https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-shuffle-functions
        r   r   r   r   r   r   c  s
   
zWarpShuffleOp.__init__    )widthc                  r   r   r   )r   rg   r   val_idr   r   r   r   r   o  r   zWarpShuffleOp.__call__Nc          
      C  sh  | j }t||}|j}|jj| jvrtd| d|j dz|j}W n t	y/   tdw t
jr>td| dt nd|  krKdksPtd	 td	| jd
v rYtj}ntj}t||d|}t||}|r{t|trz|jdvrztdnt
jrtdntd}t|tjd|}t||}| dt| d|j d|j }	|	d|j d7 }	t|	|S )Nr   r   r   zmask must be an integerri   rj   r   rf   rk   )updownrl   )r            r   zwidth needs to be power of 2@   r   (r   rm   )r   r
   r5   r2   r3   r~   r   r)   r+   	Exceptionr   rn   r0   r1   ro   rp   r   r   r   rr   r   rq   r    r   hexrH   )
r   r8   rg   r   r   r   r~   r2   val_id_trH   r   r   r   r=   r  sD   




"
zWarpShuffleOp.callr   r   r   r   r   r   a  s    r   c                      s,   e Zd Z fddZdd Zdd Z  ZS )LaneIDc                   r   )zReturns the lane ID of the calling thread, ranging in
        ``[0, jit.warpsize)``.

        .. note::
            Unlike :obj:`numba.cuda.laneid`, this is a callable function
            instead of a property.
        Nr   r_   r   r   r   r     r`   zLaneID.__call__c                 C  s"   d}t js|d7 }|S |d7 }|S )Nz2__device__ __forceinline__ unsigned int LaneId() {z
                unsigned int ret;
                asm ("mov.u32 %0, %%laneid;" : "=r"(ret) );
                return ret; }
            z3
                return __lane_id(); }
            )r   rn   )r   preambler   r   r   _get_preamble  s   
zLaneID._get_preamblec                 C  s   |j |   tdtjS )NzLaneId())	generatedadd_coder   r
   r   r   rb   r   r   r   rc     s   zLaneID.call_const)r/   r?   r@   r   r   rc   rA   r   r   r   r   r     s    
r   zMapping[Any, BuiltinFunc]builtin_functions_dictr   r   Add)rr   r   uint64float32float64r   )float16Sub)rr   r   Exch)rr   r   r   r   Min)rr   r   r   MaxInc)r   Decr   )uint16AndOrXor)rr   r   int64r   r   )r   r   r   r   r   xor)@
__future__r   typingr   collections.abcr   r0   rQ   cupy_backends.cuda.apir   	cupy.cudar   	cupyx.jitr   r   cupyx.jit._internal_typesr	   r
   r   r   r   	functoolsr   r   rC   rM   r\   r^   re   rs   r   r   r   r   ranger(   minmaxr   __annotations__range_syncthreadssyncwarpshared_memoryr   r   laneidrn   
atomic_add
atomic_subatomic_exch
atomic_min
atomic_max
atomic_inc
atomic_dec
atomic_cas
atomic_and	atomic_or
atomic_xor_shfl_dtypes	shfl_syncshfl_up_syncshfl_down_syncshfl_xor_syncr   r   r   r   <module>   s    Z"?@:!


