o
    Y۷iݜ                     @   s  d dl Zd dlZd dlZd dlZd dlZd dlmZmZm	Z	m
Z
mZmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z'm(Z( d dl)m
Z* d dl+m,Z, d dl+m-Z- d dl.m/Z/ g dZ0G dd dej1Z2G dd de3Z4G dd dZ5G dd deZ6G dd deZ7G dd deej1Z8dS )     N)config	serializesigutilstypestypingutils)Cache	CacheImpl)global_compiler_lock)
Dispatcher)NumbaPerformanceWarning)Purposetypeof)get_current_device)wrap_arg)compile_cudaCUDACompiler)driver)get_context)cuda_target)missing_launch_config_msgnormalize_kernel_dimensions)r   cuda)_dispatcher)warn)hsinhcoshloghlog10hlog2hexphexp10hexp2hsqrthrsqrthfloorhceilhrcphrinthtrunchdivc                       s  e Zd ZdZe			d2 fdd	Zedd Zed	d
 Zdd Z	edd Z
edd Ze fddZdd Zdd Zedd Zedd Zedd Zedd Zedd  Zd!d" Zd#d$ Zd%d& Zd'd( Zd3d)d*Zd4d,d-Zd5d.d/Zd0d1 Z  ZS )6_Kernelz
    CUDA Kernel specialized for a given set of argument types. When called, this
    object launches the kernel on the device.
    NFTc                    s  |rt dt   d| _d | _|| _|| _|| _|| _|p g | _	||
r'dndd}t
 j}t| jtj| j| j|||||d	}|j}| jj}|j}|j}||j|j||||||	\ }|sag }d  v | _| jrnd _ fd	d
tD }|rtjtjt}tj|d}| | |D ]} !| q|j"| _#|j$| _$|j%| _& | _'|j(| _(|| _|j| _|j)| _)g | _*g | _+g | _,d S )Nz,Cannot compile a device function as a kernelF   r   )fastmathoptdebuglineinfoinliner.   nvvm_optionscccudaCGGetIntrinsicHandleTc                    s"   g | ]}d |    v r|qS )__numba_wrapper_)get_asm_str).0fnlib K/home/ubuntu/vllm_env/lib/python3.10/site-packages/numba/cuda/dispatcher.py
<listcomp>l   s    z$_Kernel.__init__.<locals>.<listcomp>zcpp_function_wrappers.cu)-RuntimeErrorsuper__init__
objectmodeentry_pointpy_funcargtypesr1   r2   
extensionsr   compute_capabilityr   r   voidtarget_context__code__co_filenameco_firstlinenoprepare_cuda_kernellibraryfndescr8   cooperativeneeds_cudadevrtcuda_fp16_math_funcsospathdirnameabspath__file__joinappendadd_linking_filename
entry_name	signaturetype_annotation_type_annotation_codelibrarycall_helperenvironment_referenced_environmentsliftedreload_init)selfrE   rF   linkr1   r2   r3   r.   rG   max_registersr/   devicer4   r5   crestgt_ctxcodefilenamelinenumkernelresbasedirfunctions_cu_pathfilepath	__class__r;   r>   rB   .   sp   




z_Kernel.__init__c                 C      | j S N)ra   rg   r=   r=   r>   rO         z_Kernel.libraryc                 C   rw   rx   )r`   ry   r=   r=   r>   r_      rz   z_Kernel.type_annotationc                 C   rw   rx   )rd   ry   r=   r=   r>   _find_referenced_environments   s   z%_Kernel._find_referenced_environmentsc                 C   
   | j  S rx   )rJ   codegenry   r=   r=   r>   r}         
z_Kernel.codegenc                 C   s   t | jjS rx   )tupler^   argsry   r=   r=   r>   argument_types   s   z_Kernel.argument_typesc	           
         sX   |  | }	t| |	  d|	_||	_||	_||	_d|	_||	_||	_	||	_
||	_||	_|	S )&
        Rebuild an instance.
        N)__new__rA   rB   rD   rQ   r]   r^   r`   ra   r1   r2   rb   rG   )
clsrQ   r\   r^   codelibraryr1   r2   rb   rG   instanceru   r=   r>   _rebuild   s   
z_Kernel._rebuildc              
   C   s(   t | j| j| j| j| j| j| j| jdS )a  
        Reduce the instance for serialization.
        Compiled definitions are serialized in PTX form.
        Type annotation are discarded.
        Thread, block and shared memory configuration are serialized.
        Stream information is discarded.
        )rQ   r\   r^   r   r1   r2   rb   rG   )	dictrQ   r]   r^   ra   r1   r2   rb   rG   ry   r=   r=   r>   _reduce_states   s
   
z_Kernel._reduce_statesc                 C   s   | j   dS )z7
        Force binding to current CUDA context
        N)ra   
get_cufuncry   r=   r=   r>   bind      z_Kernel.bindc                 C      | j  jjS )zN
        The number of registers used by each thread for this kernel.
        )ra   r   attrsregsry   r=   r=   r>   regs_per_thread      z_Kernel.regs_per_threadc                 C   r   )zD
        The amount of constant memory used by this kernel.
        )ra   r   r   constry   r=   r=   r>   const_mem_size   r   z_Kernel.const_mem_sizec                 C   r   )zM
        The amount of shared memory used per block for this kernel.
        )ra   r   r   sharedry   r=   r=   r>   shared_mem_per_block   r   z_Kernel.shared_mem_per_blockc                 C   r   )z:
        The maximum allowable threads per block.
        )ra   r   r   
maxthreadsry   r=   r=   r>   max_threads_per_block   r   z_Kernel.max_threads_per_blockc                 C   r   )zM
        The amount of local memory used per thread for this kernel.
        )ra   r   r   localry   r=   r=   r>   local_mem_per_thread   r   z_Kernel.local_mem_per_threadc                 C   r|   )z6
        Returns the LLVM IR for this kernel.
        )ra   get_llvm_strry   r=   r=   r>   inspect_llvm   s   
z_Kernel.inspect_llvmc                 C   s   | j j|dS )z7
        Returns the PTX code for this kernel.
        r5   )ra   r8   )rg   r5   r=   r=   r>   inspect_asm   r   z_Kernel.inspect_asmc                 C   r|   )zv
        Returns the CFG of the SASS for this kernel.

        Requires nvdisasm to be available on the PATH.
        )ra   get_sass_cfgry   r=   r=   r>   inspect_sass_cfg      
z_Kernel.inspect_sass_cfgc                 C   r|   )zp
        Returns the SASS code for this kernel.

        Requires nvdisasm to be available on the PATH.
        )ra   get_sassry   r=   r=   r>   inspect_sass   r   z_Kernel.inspect_sassc                 C   sb   | j du r	td|du rtj}td| j| jf |d td|d t| j |d td|d dS )
        Produce a dump of the Python source of this function annotated with the
        corresponding Numba IR and type information. The dump is written to
        *file*, or *sys.stdout* if *file* is *None*.
        Nz Type annotation is not availablez%s %sfilezP--------------------------------------------------------------------------------zP================================================================================)r`   
ValueErrorsysstdoutprintr]   r   )rg   r   r=   r=   r>   inspect_types  s   
z_Kernel.inspect_typesr   c                 C   sH   t  }| j }t|trtdd |}||||}|jj	}|| S )a  
        Calculates the maximum number of blocks that can be launched for this
        kernel in a cooperative grid in the current context, for the given block
        and dynamic shared memory sizes.

        :param blockdim: Block dimensions, either as a scalar for a 1D block, or
                         a tuple for 2D or 3D blocks.
        :param dynsmemsize: Dynamic shared memory size in bytes.
        :return: The maximum number of blocks in the grid.
        c                 S   s   | | S rx   r=   )xyr=   r=   r>   <lambda>&  s    z5_Kernel.max_cooperative_grid_blocks.<locals>.<lambda>)
r   ra   r   
isinstancer   	functoolsreduce$get_active_blocks_per_multiprocessorrj   MULTIPROCESSOR_COUNT)rg   blockdimdynsmemsizectxcufuncactive_per_smsm_countr=   r=   r>   max_cooperative_grid_blocks  s   

z#_Kernel.max_cooperative_grid_blocksc                    s  | j   | jr* jd } j|\}}|ttjksJ t }	|j	d|d g }
g }t
| j|D ]\}}| ||||
| q4tjrLtjd}nd }|rS|jpT|}tj jg|||||R d| ji | jrtt|	|| |	jdkr݇ fddfddd	D }fd
dd	D }|	j}| j|\}}}|d u rd}n|\}}}tj|}d|||f }d|||f }|rd||d f f|dd   }|| |f}|| |
D ]}|  qd S )N__errcode__r   )streamrQ   c                    s<    j d j| f \}}t }tt||| |jS )Nz%s__%s__)	moduleget_global_symbolr\   ctypesc_intr   device_to_host	addressofvalue)r\   memszval)r   r=   r>   load_symbolS  s   
z#_Kernel.launch.<locals>.load_symbolc                       g | ]} d | qS )tidr=   r9   ir   r=   r>   r?   [      z"_Kernel.launch.<locals>.<listcomp>zyxc                    r   )ctaidr=   r   r   r=   r>   r?   \  r    z"In function %r, file %s, line %s, z%stid=%s ctaid=%sz%s: %s   )ra   r   r1   r\   r   r   r   sizeofr   memsetzipr   _prepare_argsr   USE_NV_BINDINGbindingCUstreamhandlelaunch_kernelrQ   r   r   r   rb   get_exceptionrT   rU   rW   )rg   r   griddimr   r   	sharedmemexcnameexcmemexcszexcvalretr
kernelargstvzero_streamstream_handler   r   rm   excclsexc_argsloclocinfosymrt   linenoprefixwbr=   )r   r   r>   launch-  sn   





z_Kernel.launchc                 C   sX  t | jD ]}|j||||d\}}qt|tjrt|||}tj	}t
d}	t
d}
||j}||jj}t|}tjrEt|}t
|}||	 ||
 || || || t|jD ]}|||j|  qht|jD ]}|||j|  qzdS t|tjrttd| |}|| dS |tjkrtt|tj}|| dS |tjkrt|}|| dS |tj krt!|}|| dS |tj"krt#t|}|| dS |tj$kr|t!|j% |t!|j& dS |tj'kr |t|j% |t|j& dS t|tj(tj)fr8|t*|tj+ dS t|tj,r\t|||}|j-}tjrUt
t|}|| dS t|tj.rt/|t/|ksnJ t0||D ]\}}| 1||||| qsdS t|tj2rz| 1|j|j3||| W dS  t4y   t4||w t4||)zF
        Convert arguments to ctypes and append to kernelargs
        )r   r   r   zc_%sN)5reversedrG   prepare_argsr   r   Arrayr   	to_devicer   	c_ssize_tc_void_psizedtypeitemsizer   device_pointerr   intrZ   rangendimshapestridesIntegergetattrfloat16c_uint16npviewuint16float64c_doublefloat32c_floatbooleanc_uint8	complex64realimag
complex128
NPDatetimeNPTimedeltac_int64int64Recorddevice_ctypes_pointer	BaseTuplelenr   r   
EnumMemberr   NotImplementedError)rg   tyr   r   r   r   	extensiondevaryc_intpmeminfoparentnitemsr   ptrdataaxcvaldevrecr   r   r=   r=   r>   r   u  s   


















z_Kernel._prepare_args)	NFFFFNNTFrx   )r   r   r   )__name__
__module____qualname____doc__r
   rB   propertyrO   r_   r{   r}   r   classmethodr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   __classcell__r=   r=   ru   r>   r,   (   sL    \











Hr,   c                   @   $   e Zd Zdd Zdd Zdd ZdS )ForAllc                 C   s6   |dk r
t d| || _|| _|| _|| _|| _d S )Nr   z0Can't create ForAll with negative task count: %s)r   
dispatcherntasksthread_per_blockr   r   )rg   r0  r1  tpbr   r   r=   r=   r>   rB     s   
zForAll.__init__c                 G   s^   | j dkrd S | jjr| j}n| jj| }| |}| j | d | }|||| j| jf | S )Nr   r   )r1  r0  specialized
specialize_compute_thread_per_blockr   r   )rg   r   r4  r   r   r=   r=   r>   __call__  s   


zForAll.__call__c                 C   sZ   | j }|dkr	|S t }tt|j }t|j d| j	dd}|j
di |\}}|S )Nr   i   )funcb2d_funcmemsizeblocksizelimitr=   )r2  r   nextiter	overloadsvaluesr   ra   r   r   get_max_potential_block_size)rg   r0  r3  r   rp   kwargs_r=   r=   r>   r6    s   z ForAll._compute_thread_per_blockN)r'  r(  r)  rB   r7  r6  r=   r=   r=   r>   r/    s    
r/  c                   @   s   e Zd Zdd Zdd ZdS )_LaunchConfigurationc           	      C   sl   || _ || _|| _|| _|| _tjr2d}|d |d  |d  }||k r4d| d}tt| d S d S d S )N   r   r      z
Grid size zB will likely result in GPU under-utilization due to low occupancy.)	r0  r   r   r   r   r   CUDA_LOW_OCCUPANCY_WARNINGSr   r   )	rg   r0  r   r   r   r   min_grid_size	grid_sizemsgr=   r=   r>   rB     s   	z_LaunchConfiguration.__init__c                 G   s   | j || j| j| j| jS rx   )r0  callr   r   r   r   rg   r   r=   r=   r>   r7    s   z_LaunchConfiguration.__call__N)r'  r(  r)  rB   r7  r=   r=   r=   r>   rC    s    rC  c                   @   r.  )CUDACacheImplc                 C   s   |  S rx   )r   )rg   rp   r=   r=   r>   r      s   zCUDACacheImpl.reducec                 C   s   t jdi |S )Nr=   )r,   r   )rg   rJ   payloadr=   r=   r>   rebuild#     zCUDACacheImpl.rebuildc                 C   s   dS )NTr=   )rg   rk   r=   r=   r>   check_cachable&  s   zCUDACacheImpl.check_cachableN)r'  r(  r)  r   rN  rP  r=   r=   r=   r>   rL    s    rL  c                       s$   e Zd ZdZeZ fddZ  ZS )	CUDACachezS
    Implements a cache that saves and loads CUDA kernels and compile results.
    c                    sF   ddl m} |d t ||W  d    S 1 sw   Y  d S )Nr   )target_overrider   )numba.core.target_extensionrR  rA   load_overload)rg   sigrJ   rR  ru   r=   r>   rT  7  s   
$zCUDACache.load_overload)r'  r(  r)  r*  rL  _impl_classrT  r-  r=   r=   ru   r>   rQ  1  s    rQ  c                       sB  e Zd ZdZdZeZef fdd	Ze	dd Z
dd Zejd	d
dAddZdd ZdBddZe	dd Zdd Zdd Zdd Zdd Zdd Ze	dd ZdCd!d"ZdCd#d$ZdCd%d&ZdCd'd(ZdCd)d*Zd+d, ZdCd-d.Zd/d0 Zd1d2 Z dCd3d4Z!dCd5d6Z"dCd7d8Z#dCd9d:Z$dCd;d<Z%e&d=d> Z'd?d@ Z(  Z)S )DCUDADispatchera  
    CUDA Dispatcher object. When configured and called, the dispatcher will
    specialize itself for the given arguments (if no suitable specialized
    version already exists) & compute capability, and launch on the device
    associated with the current context.

    Dispatcher objects are not to be constructed by the user, but instead are
    created using the :func:`numba.cuda.jit` decorator.
    Fc                    s"   t  j|||d d| _i | _d S )N)targetoptionspipeline_classF)rA   rB   _specializedspecializations)rg   rE   rX  rY  ru   r=   r>   rB   R  s
   
	
zCUDADispatcher.__init__c                 C   s
   t | S rx   )
cuda_typesrW  ry   r=   r=   r>   _numba_type_b  r~   zCUDADispatcher._numba_type_c                 C   s   t | j| _d S rx   )rQ  rE   _cachery   r=   r=   r>   enable_cachingf  rO  zCUDADispatcher.enable_cachingrD  )maxsizer   c                 C   s   t ||\}}t| ||||S rx   )r   rC  )rg   r   r   r   r   r=   r=   r>   	configurei  s   zCUDADispatcher.configurec                 C   s   t |dvr
td| j| S )N)rE  r-      z.must specify at least the griddim and blockdim)r  r   ra  rK  r=   r=   r>   __getitem__n  s   
zCUDADispatcher.__getitem__c                 C   s   t | ||||dS )a3  Returns a 1D-configured dispatcher for a given number of tasks.

        This assumes that:

        - the kernel maps the Global Thread ID ``cuda.grid(1)`` to tasks on a
          1-1 basis.
        - the kernel checks that the Global Thread ID is upper-bounded by
          ``ntasks``, and does nothing if it is not.

        :param ntasks: The number of tasks.
        :param tpb: The size of a block. An appropriate value is chosen if this
                    parameter is not supplied.
        :param stream: The stream on which the configured dispatcher will be
                       launched.
        :param sharedmem: The number of bytes of dynamic shared memory required
                          by the kernel.
        :return: A configured dispatcher, ready to launch on a set of
                 arguments.)r3  r   r   )r/  )rg   r1  r3  r   r   r=   r=   r>   foralls  s   zCUDADispatcher.forallc                 C   s   | j dS )aS  
        A list of objects that must have a `prepare_args` function. When a
        specialized kernel is called, each argument will be passed through
        to the `prepare_args` (from the last object in this list to the
        first). The arguments to `prepare_args` are:

        - `ty` the numba type of the argument
        - `val` the argument value itself
        - `stream` the CUDA stream used for the current call to the kernel
        - `retr` a list of zero-arg functions that you may want to append
          post-call cleanup work to.

        The `prepare_args` function must return a tuple `(ty, val)`, which
        will be passed in turn to the next right-most `extension`. After all
        the extensions have been called, the resulting `(ty, val)` will be
        passed into Numba's default argument marshalling logic.
        rG   )rX  getry   r=   r=   r>   rG     s   zCUDADispatcher.extensionsc                 O   s   t trx   )r   r   )rg   r   rA  r=   r=   r>   r7    s   zCUDADispatcher.__call__c                 C   sD   | j rtt| j }n
tjj| g|R  }|||||| dS )zJ
        Compile if necessary and invoke this kernel with *args*.
        N)	r4  r<  r=  r>  r?  r   r   
_cuda_callr   )rg   r   r   r   r   r   rp   r=   r=   r>   rJ    s   zCUDADispatcher.callc                    s(   |rJ  fdd|D }  t|S )Nc                    s   g | ]}  |qS r=   typeof_pyvalr9   ary   r=   r>   r?     s    z4CUDADispatcher._compile_for_args.<locals>.<listcomp>)compiler   )rg   r   kwsrF   r=   ry   r>   _compile_for_args  s   z CUDADispatcher._compile_for_argsc                 C   sD   zt |tjW S  ty!   t|r t tj|ddtj Y S  w )NF)sync)r   r   argumentr   r   is_cuda_arrayas_cuda_array)rg   r   r=   r=   r>   rh    s   
zCUDADispatcher.typeof_pyvalc                    s    j rtdt j}t fdd|D } j||f}|r"|S  j}t j	|d}|
| |  d|_| j||f< |S )zd
        Create a new instance of this dispatcher specialized for the given
        *args*.
        zDispatcher already specializedc                 3   s    | ]}  |V  qd S rx   rg  ri  ry   r=   r>   	<genexpr>  s    z,CUDADispatcher.specialize.<locals>.<genexpr>)rX  T)r4  r@   r   rH   r   r[  re  rX  rW  rE   rk  disable_compilerZ  )rg   r   r5   rF   specializationrX  r=   ry   r>   r5    s    
zCUDADispatcher.specializec                 C   rw   )z>
        True if the Dispatcher has been specialized.
        )rZ  ry   r=   r=   r>   r4    s   zCUDADispatcher.specializedNc                 C   D   |dur| j |j jS | jrtt| j  jS dd | j  D S )a  
        Returns the number of registers used by each thread in this kernel for
        the device in the current context.

        :param signature: The signature of the compiled kernel to get register
                          usage for. This may be omitted for a specialized
                          kernel.
        :return: The number of registers used by the compiled variant of the
                 kernel for the given signature and current device.
        Nc                 S      i | ]\}}||j qS r=   )r   r9   rU  overloadr=   r=   r>   
<dictcomp>      z6CUDADispatcher.get_regs_per_thread.<locals>.<dictcomp>)r>  r   r   r4  r<  r=  r?  itemsrg   r^   r=   r=   r>   get_regs_per_thread     z"CUDADispatcher.get_regs_per_threadc                 C   ru  )a  
        Returns the size in bytes of constant memory used by this kernel for
        the device in the current context.

        :param signature: The signature of the compiled kernel to get constant
                          memory usage for. This may be omitted for a
                          specialized kernel.
        :return: The size in bytes of constant memory allocated by the
                 compiled variant of the kernel for the given signature and
                 current device.
        Nc                 S   rv  r=   )r   rw  r=   r=   r>   ry    rz  z5CUDADispatcher.get_const_mem_size.<locals>.<dictcomp>)r>  r   r   r4  r<  r=  r?  r{  r|  r=   r=   r>   get_const_mem_size  s   z!CUDADispatcher.get_const_mem_sizec                 C   ru  )a  
        Returns the size in bytes of statically allocated shared memory
        for this kernel.

        :param signature: The signature of the compiled kernel to get shared
                          memory usage for. This may be omitted for a
                          specialized kernel.
        :return: The amount of shared memory allocated by the compiled variant
                 of the kernel for the given signature and current device.
        Nc                 S   rv  r=   )r   rw  r=   r=   r>   ry    rz  z;CUDADispatcher.get_shared_mem_per_block.<locals>.<dictcomp>)r>  r   r   r4  r<  r=  r?  r{  r|  r=   r=   r>   get_shared_mem_per_block  r~  z'CUDADispatcher.get_shared_mem_per_blockc                 C   ru  )a(  
        Returns the maximum allowable number of threads per block
        for this kernel. Exceeding this threshold will result in
        the kernel failing to launch.

        :param signature: The signature of the compiled kernel to get the max
                          threads per block for. This may be omitted for a
                          specialized kernel.
        :return: The maximum allowable threads per block for the compiled
                 variant of the kernel for the given signature and current
                 device.
        Nc                 S   rv  r=   )r   rw  r=   r=   r>   ry  ,  rz  z<CUDADispatcher.get_max_threads_per_block.<locals>.<dictcomp>)r>  r   r   r4  r<  r=  r?  r{  r|  r=   r=   r>   get_max_threads_per_block  s   z(CUDADispatcher.get_max_threads_per_blockc                 C   ru  )a  
        Returns the size in bytes of local memory per thread
        for this kernel.

        :param signature: The signature of the compiled kernel to get local
                          memory usage for. This may be omitted for a
                          specialized kernel.
        :return: The amount of local memory allocated by the compiled variant
                 of the kernel for the given signature and current device.
        Nc                 S   rv  r=   )r   rw  r=   r=   r>   ry  ?  rz  z;CUDADispatcher.get_local_mem_per_thread.<locals>.<dictcomp>)r>  r   r   r4  r<  r=  r?  r{  r|  r=   r=   r>   get_local_mem_per_thread/  r~  z'CUDADispatcher.get_local_mem_per_threadc                 C   sP   | j r
| t| | jj}d|}tj||| jd}t	
| j}||||fS )z
        Get a typing.ConcreteTemplate for this dispatcher and the given
        *args* and *kws* types.  This allows resolution of the return type.

        A (template, pysig, args, kws) tuple is returned.
        zCallTemplate({0}))key
signatures)_can_compilecompile_devicer   rE   r'  formatr   make_concrete_templatenopython_signaturesr   pysignature)rg   r   rl  	func_namer\   call_templatepysigr=   r=   r>   get_call_templateB  s   
z CUDADispatcher.get_call_templatec           
      C   s   || j vrc| jQ | jd}| jd}| jd}| jd}| jdr)dnd|d}t j}t| j||||||||d		}	|	| j |< |	j	|	j
|	j|	jg W d
   |	S 1 s\w   Y  |	S | j | }	|	S )zCompile the device function for the given argument types.

        Each signature is compiled once by caching the compiled function inside
        this object.

        Returns the `CompileResult`.
        r1   r2   r3   r.   r/   r-   r   )r/   r.   r0   N)r>  _compiling_counterrX  re  r   rH   r   rE   rJ   insert_user_functionrD   rP   rO   )
rg   r   return_typer1   r2   r3   r.   r4   r5   rk   r=   r=   r>   r  ]  s:   





zCUDADispatcher.compile_devicec                 C   s,   dd |D }| j ||dd || j|< d S )Nc                 S   s   g | ]}|j qS r=   )_coderi  r=   r=   r>   r?     s    z/CUDADispatcher.add_overload.<locals>.<listcomp>Tr   )_insertr>  )rg   rp   rF   c_sigr=   r=   r>   add_overload  s   zCUDADispatcher.add_overloadc                 C   s   t |\}}|du s|tjksJ | jrtt| j S | j	|}|dur*|S | j
|| j}|dur@| j|  d7  < n&| j|  d7  < | jsPtdt| j|fi | j}|  | j
|| | || |S )z
        Compile and bind to the current context a version of this kernel
        specialized for the given signature.
        Nr   zCompilation disabled)r   normalize_signaturer   noner4  r<  r=  r>  r?  re  r^  rT  	targetctx_cache_hits_cache_missesr  r@   r,   rE   rX  r   save_overloadr  )rg   rU  rF   r  rp   r=   r=   r>   rk    s$   zCUDADispatcher.compilec                 C   sb   | j d}|dur|r| j| j S | j|  S |r'dd | j D S dd | j D S )z
        Return the LLVM IR for this kernel.

        :param signature: A tuple of argument types.
        :return: The LLVM IR for the given signature, or a dict of LLVM IR
                 for all previously-encountered signatures.

        rj   Nc                 S   s   i | ]
\}}||j  qS r=   )rO   r   rw  r=   r=   r>   ry        z/CUDADispatcher.inspect_llvm.<locals>.<dictcomp>c                 S      i | ]	\}}||  qS r=   )r   rw  r=   r=   r>   ry        )rX  re  r>  rO   r   r   r{  rg   r^   rj   r=   r=   r>   r     s   	zCUDADispatcher.inspect_llvmc                    sv   t  j | jd}|dur!|r| j| j S | j|  S |r/ fdd| j D S  fdd| j D S )a+  
        Return this kernel's PTX assembly code for for the device in the
        current context.

        :param signature: A tuple of argument types.
        :return: The PTX code for the given signature, or a dict of PTX codes
                 for all previously-encountered signatures.
        rj   Nc                    s   i | ]\}}||j  qS r=   )rO   r8   rw  r   r=   r>   ry    s    z.CUDADispatcher.inspect_asm.<locals>.<dictcomp>c                    s   i | ]
\}}||  qS r=   )r   rw  r   r=   r>   ry    r  )	r   rH   rX  re  r>  rO   r8   r   r{  r  r=   r   r>   r     s   	

zCUDADispatcher.inspect_asmc                 C   >   | j dr
td|dur| j|  S dd | j D S )a  
        Return this kernel's CFG for the device in the current context.

        :param signature: A tuple of argument types.
        :return: The CFG for the given signature, or a dict of CFGs
                 for all previously-encountered signatures.

        The CFG for the device in the current context is returned.

        Requires nvdisasm to be available on the PATH.
        rj   z'Cannot get the CFG of a device functionNc                 S   r  r=   )r   r9   rU  defnr=   r=   r>   ry    r  z3CUDADispatcher.inspect_sass_cfg.<locals>.<dictcomp>)rX  re  r@   r>  r   r{  r|  r=   r=   r>   r     s   zCUDADispatcher.inspect_sass_cfgc                 C   r  )a  
        Return this kernel's SASS assembly code for for the device in the
        current context.

        :param signature: A tuple of argument types.
        :return: The SASS code for the given signature, or a dict of SASS codes
                 for all previously-encountered signatures.

        SASS for the device in the current context is returned.

        Requires nvdisasm to be available on the PATH.
        rj   z(Cannot inspect SASS of a device functionNc                 S   r  r=   )r   r  r=   r=   r>   ry    r  z/CUDADispatcher.inspect_sass.<locals>.<dictcomp>)rX  re  r@   r>  r   r{  r|  r=   r=   r>   r     s   zCUDADispatcher.inspect_sassc                 C   s2   |du rt j}| j D ]
\}}|j|d qdS )r   Nr   )r   r   r>  r{  r   )rg   r   rB  r  r=   r=   r>   r     s
   zCUDADispatcher.inspect_typesc                 C   s   | ||}|S )r   r=   )r   rE   rX  r   r=   r=   r>   r     s   
zCUDADispatcher._rebuildc                 C   s   t | j| jdS )zd
        Reduce the instance for serialization.
        Compiled definitions are discarded.
        )rE   rX  )r   rE   rX  ry   r=   r=   r>   r     s   zCUDADispatcher._reduce_statesr&  )r   r   r   rx   )*r'  r(  r)  r*  
_fold_argsr   targetdescrr   rB   r+  r]  r_  r   	lru_cachera  rc  rd  rG   r7  rJ  rm  rh  r5  r4  r}  r  r  r  r  r  r  r  rk  r   r   r   r   r   r,  r   r   r-  r=   r=   ru   r>   rW  @  sL    










'
$




rW  )9numpyr  rT   r   r   r   
numba.corer   r   r   r   r   r   numba.core.cachingr   r	   numba.core.compiler_lockr
   numba.core.dispatcherr   numba.core.errorsr   numba.core.typing.typeofr   r   numba.cuda.apir   numba.cuda.argsr   numba.cuda.compilerr   r   numba.cuda.cudadrvr   numba.cuda.cudadrv.devicesr   numba.cuda.descriptorr   numba.cuda.errorsr   r   
numba.cudar\  numbar   r   warningsr   rS   ReduceMixinr,   objectr/  rC  rL  rQ  rW  r=   r=   r=   r>   <module>   s@        /.