o
    iB                     @   sp  U d dl mZmZ d dlZd dlZd dlmZ d dlmZ d dl	m
Z
 d dl	mZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dl	mZ d dlmZ G dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd  d eZG d!d" d"eZee e e e!e e"e iZ#eeef e$d#< e Z%e Z&e Z'e Z(ed$Z)ed%Z*e Z+ed&d'ej,rd(nd) Z-ed*d+Z.ed,d-Z/ed.d/Z0ed0d/Z1ed1d2Z2ed3d2Z3ed4d/ej,rd(nd5 Z4ed6d/Z5ed7d/Z6ed8d/Z7d9ej,rd(nd: Z8ed;e8Z9ed<e8Z:ed=e8Z;ed>e8Z<dS )?    )AnyMappingN)runtime)device)_cuda_types)_cuda_typerules)BuiltinFunc)Data)Constant)Range)_compile)reducec                       s0   e Zd Zdd fdd
ZddddZ  ZS )	RangeFuncNunrollc                      t    dS )a  Range with loop unrolling support.

        Args:
            start (int):
                Same as that of built-in :obj:`range`.
            stop (int):
                Same as that of built-in :obj:`range`.
            step (int):
                Same as that of built-in :obj:`range`.
            unroll (int or bool or None):

                - If `True`, add ``#pragma unroll`` directive before the
                  loop.
                - If `False`, add ``#pragma unroll(1)`` directive before
                  the loop to disable unrolling.
                - If an `int`, add ``#pragma unroll(n)`` directive before
                  the loop, where the integer ``n`` means the number of
                  iterations to unroll.
                - If `None` (default), leave the control of loop unrolling
                  to the compiler (no ``#pragma``).

        .. seealso:: `#pragma unroll`_

        .. _#pragma unroll:
            https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#pragma-unroll
        Nsuper__call__)selfr   args	__class__ U/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/cupyx/jit/_builtin_funcs.pyr      s   zRangeFunc.__call__c          	      G   s  t |dkr
tdt |dkrtd|d td}}}n*t |dkr4|d |d td}}}nt |dkr@|\}}}n	tdt | |d urtdd ||||fD s^td	|j}t|tsut|tsutd
t|j	 |du r{d}|du sd|  k rdk sn t
d t|tr|jdk}n|jjjdkrd}nd }t||}t||}t||}|jjjdvrtd|jjjdvrtd|jjjdvrtd|jdkrtt}n|jdkr|j}nJ t||||||dS )Nr   z)range expected at least 1 argument, got 0         z'range expected at most 3 argument, got c                 s   s    | ]}t |tV  qd S N)
isinstancer
   ).0xr   r   r   	<genexpr>@   s    z!RangeFunc.call.<locals>.<genexpr>zCloop unrolling requires constant start, stop, step and unroll valuez-unroll value expected to be of type int, got FTl        zUloop unrolling is ignored as the unroll value is non-positive or greater than INT_MAXuiuz%range supports only for integer type.numpycudar   )len	TypeErrorr
   allobjr   intbooltype__name__warningswarnctypedtypekindr	   initmoder   Scalarr   )	r   envr   r   startstopstepstep_is_positiver1   r   r   r   call2   sf   



zRangeFunc.callr.   
__module____qualname__r   r<   __classcell__r   r   r   r   r      s    r   c                   @      e Zd Zdd ZdS )LenFuncc                 O   sr   t |dkrtdt | |rtd|d }t|jtjs$td|jjs,tdtd|j dt	d	S )
Nr   z#len() expects only 1 argument, got #keyword arguments are not supportedr   zlen() supports only array typezlen() of unsized arrayzstatic_cast<long long>(z.shape()[0])q)
r'   r(   r   r1   r   CArrayndimr	   coder6   )r   r7   r   kwdsargr   r   r   r<   o   s   zLenFunc.callNr.   r>   r?   r<   r   r   r   r   rB   m       rB   c                   @   rA   )MinFuncc                    <   t |dk rtdt | |rtdt fdd|S )Nr   z(min() expects at least 2 arguments, got rC   c                       t tj| |fd  S r   )r   _call_ufunccupyminimumabr7   r   r   <lambda>       zMinFunc.call.<locals>.<lambda>r'   r(   r   r   r7   r   rH   r   rU   r   r<         zMinFunc.callNrJ   r   r   r   r   rL   }   rK   rL   c                   @   rA   )MaxFuncc                    rM   )Nr   z(max() expects at least 2 arguments, got rC   c                    rN   r   )r   rO   rP   maximumrR   rU   r   r   rV      rW   zMaxFunc.call.<locals>.<lambda>rX   rY   r   rU   r   r<      rZ   zMaxFunc.callNrJ   r   r   r   r   r[      rK   r[   c                       s$   e Zd Z fddZdd Z  ZS )SyncThreadsc                    r   )zCalls ``__syncthreads()``.

        .. seealso:: `Synchronization functions`_

        .. _Synchronization functions:
            https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#synchronization-functions
        Nr   r   r   r   r   r         zSyncThreads.__call__c                 C   s   t dtjS )Nz__syncthreads())r	   r   voidr   r7   r   r   r   
call_const   s   zSyncThreads.call_constr.   r>   r?   r   rb   r@   r   r   r   r   r]      s    
r]   c                       s0   e Zd Zdd fdd
ZddddZ  ZS )SyncWarp    )maskc                   r   )a:  Calls ``__syncwarp()``.

        Args:
            mask (int): Active threads in a warp. Default is 0xffffffff.

        .. seealso:: `Synchronization functions`_

        .. _Synchronization functions:
            https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#synchronization-functions
        Nr   )r   rf   r   r   r   r         zSyncWarp.__call__Nc                C   s   t jr|d urtd| dt d }|rDt|tr-d|j  kr(dks-td tdt	
|tjd|}t||}d|j d}nd	}t|tjS )
Nmask  is ignored on HIPr   re   mask is out of range	same_kindz__syncwarp()z__syncwarp())r   is_hipr/   r0   RuntimeWarningr   r
   r*   
ValueErrorr   _astype_scalarr   int32r	   r4   rG   r`   )r   r7   rf   rG   r   r   r   r<      s"   

zSyncWarp.callr=   r   r   r   r   rd      s    rd   c                       s(   e Zd Zd fdd	ZdddZ  ZS )SharedMemoryNc                    r   )a  Allocates shared memory and returns it as a 1-D array.

        Args:
            dtype (dtype):
                The dtype of the returned array.
            size (int or None):
                If ``int`` type, the size of static shared memory.
                If ``None``, declares the shared memory with extern specifier.
            alignment (int or None): Enforce the alignment via __align__(N).
        Nr   )r   r2   size	alignmentr   r   r   r      rg   zSharedMemory.__call__c                 C   sN   |j dd}t|}t|t|||}||j|< ||j|< t|t|S )N_smem)prefix)	get_fresh_variable_namer   to_ctyper	   r   	SharedMemdeclslocalsPtr)r   r7   r2   rs   rt   namer1   varr   r   r   rb      s   


zSharedMemory.call_constr   rc   r   r   r   r   rr      s    rr   c                       s0   e Zd Zdd Zd fdd	ZdddZ  ZS )	AtomicOpc                 C   s.   || _ d| | _|| _d| j d}|| _d S )NatomicCalls the ``a  `` function to operate atomically on
        ``array[index]``. Please refer to `Atomic Functions`_ for detailed
        explanation.

        Args:
            array: A :class:`cupy.ndarray` to index over.
            index: A valid index such that the address to the corresponding
                array element ``array[index]`` can be computed.
            value: Represent the value to use for the specified operation. For
                the case of :obj:`atomic_cas`, this is the value for
                ``array[index]`` to compare with.
            alt_value: Only used in :obj:`atomic_cas` to represent the value
                to swap to.

        .. seealso:: `Numba's corresponding atomic functions`_

        .. _Atomic Functions:
            https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions

        .. _Numba's corresponding atomic functions:
            https://numba.readthedocs.io/en/stable/cuda-reference/kernel.html#synchronization-and-atomic-operations
        _op_name_dtypes__doc__r   opdtypesdocr   r   r   __init__   s
   

zAtomicOp.__init__Nc                       t    d S r   r   )r   arrayindexvalue	alt_valuer   r   r   r         zAtomicOp.__call__c                 C   s2  | j }| j}t||}t|jtjtjfst	dt
|||}|j}	|	jj| jvr7t	d| d|	j dt
||	d|}t||}|dkr|d usOJ |	jjdkratt dk ratd	t
||	d|}t||}| d
|j d|j d|j d}
n|d u sJ | d
|j d|j d}
t|
|	S )Nz)The first argument must be of array type.`` does not support  input.rk   CASHF   z5uint16 atomic operation is not supported before sm_70z(&, rl   )r   r   r	   r4   r   r1   r   rE   r|   r(   r   	_indexingr2   r}   r   rp   charr+   r   get_compute_capabilityRuntimeErrorrG   )r   r7   r   r   r   value2r}   r   targetr1   rG   r   r   r   r<      s0   $
zAtomicOp.callr   r.   r>   r?   r   r   r<   r@   r   r   r   r   r      s    r   c                       s,   e Zd Zdd Z fddZdd Z  ZS )GridFuncc                 C   st   |dkrd| _ d| _d| _d| _n|dkr"d| _ d| _d	| _d
| _ntdd| j  d| j d| j d}|| _d S )Ngridz%Compute the thread index in the grid.z1jit.threadIdx.x + jit.blockIdx.x * jit.blockDim.xznumba.cuda.gridz+threadIdx.{n} + blockIdx.{n} * blockDim.{n}gridsizezCompute the grid size.zjit.blockDim.x * jit.gridDim.xznumba.cuda.gridsizezblockDim.{n} * gridDim.{n}zunsupported functionz        zH

        Computation of the first integer is as follows::

            a  

        and for the other two integers the ``y`` and ``z`` attributes are used.

        Args:
            ndim (int): The dimension of the grid. Only 1, 2, or 3 is allowed.

        Returns:
            int or tuple:
                If ``ndim`` is 1, an integer is returned, otherwise a tuple.

        .. note::
            This function follows the convention of Numba's
            :func:`z`.
        )_desc_eq_link_codero   r   )r   r5   r   r   r   r   r      s"   

zGridFunc.__init__c                    r   r   r   )r   rF   r   r   r   r   C  r   zGridFunc.__call__c                    s   t |ts	td|dkrt jjddtjS |dkrd}n|dkr&d}ntd	d
	 fdd|D }t
tjg| }|dkrLtd| d|S td| d|S )Nzndim must be an integerr   r!   nr   )r!   yr   )r!   r   zzOnly ndim=1,2,3 are supportedr   c                 3   s    | ]
} j j|d V  qdS )r   N)r   format)r    r   r^   r   r   r"   U  s    z&GridFunc.call_const.<locals>.<genexpr>zSTD::make_pair(rl   zSTD::make_tuple()r   r+   r(   r	   r   r   r   uint32ro   joinTuple)r   r7   rF   dims	elts_coder1   r   r^   r   rb   F  s   
zGridFunc.call_const)r.   r>   r?   r   r   rb   r@   r   r   r   r   r     s    #r   c                       s8   e Zd Zdd Zdd fdd
Zdddd	Z  ZS )
WarpShuffleOpc                 C   s>   || _ d|r
|d nd d | _|| _d| j d}|| _d S )N__shfl__ syncr   z`` function. Please refer to
        `Warp Shuffle Functions`_ for detailed explanation.

        .. _Warp Shuffle Functions:
            https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-shuffle-functions
        r   r   r   r   r   r   `  s
   
zWarpShuffleOp.__init__    )widthc                   r   r   r   )r   rf   r~   val_idr   r   r   r   r   l  r   zWarpShuffleOp.__call__Nc          
      C   sh  | j }t||}|j}|jj| jvrtd| d|j dz|j}W n t	y/   tdw t
jr>td| dt nd|  krKdksPtd	 td	| jd
v rYtj}ntj}t||d|}t||}|r{t|trz|jdvrztdnt
jrtdntd}t|tjd|}t||}| dt| d|j d|j }	|	d|j d7 }	t|	|S )Nr   r   r   zmask must be an integerrh   ri   r   re   rj   )updownrk   )r            r   zwidth needs to be power of 2@   r   (r   rl   )r   r	   r4   r1   r2   r}   r   r(   r*   	Exceptionr   rm   r/   r0   rn   ro   r   r   r   rq   r   rp   r   r
   hexrG   )
r   r7   rf   r~   r   r   r}   r1   val_id_trG   r   r   r   r<   o  sD   




"
zWarpShuffleOp.callr   r   r   r   r   r   ^  s    r   c                       s,   e Zd Z fddZdd Zdd Z  ZS )LaneIDc                    r   )zReturns the lane ID of the calling thread, ranging in
        ``[0, jit.warpsize)``.

        .. note::
            Unlike :obj:`numba.cuda.laneid`, this is a callable function
            instead of a property.
        Nr   r^   r   r   r   r     r_   zLaneID.__call__c                 C   s"   d}t js|d7 }|S |d7 }|S )Nz2__device__ __forceinline__ unsigned int LaneId() {z
                unsigned int ret;
                asm ("mov.u32 %0, %%laneid;" : "=r"(ret) );
                return ret; }
            z3
                return __lane_id(); }
            )r   rm   )r   preambler   r   r   _get_preamble  s   
zLaneID._get_preamblec                 C   s   |j |   tdtjS )NzLaneId())	generatedadd_coder   r	   r   r   ra   r   r   r   rb     s   zLaneID.call_const)r.   r>   r?   r   r   rb   r@   r   r   r   r   r     s    
r   builtin_functions_dictr   r   Add)rq   r   uint64float32float64r   )float16Sub)rq   r   Exch)rq   r   r   r   Min)rq   r   r   MaxInc)r   Decr   )uint16AndOrXor)rq   r   int64r   r   )r   r   r   r   r   xor)=typingr   r   r/   rP   cupy_backends.cuda.apir   	cupy.cudar   	cupyx.jitr   r   cupyx.jit._internal_typesr   r	   r
   r   r   	functoolsr   r   rB   rL   r[   r]   rd   rr   r   r   r   r   ranger'   minmaxr   __annotations__range_syncthreadssyncwarpshared_memoryr   r   laneidrm   
atomic_add
atomic_subatomic_exch
atomic_min
atomic_max
atomic_inc
atomic_dec
atomic_cas
atomic_and	atomic_or
atomic_xor_shfl_dtypes	shfl_syncshfl_up_syncshfl_down_syncshfl_xor_syncr   r   r   r   <module>   s    Z"?@:!


