o
    ٰi9                     @  s4   d Z ddlmZ ddlmZ dddddddZdS )zPNVRTC (NVIDIA Runtime Compilation) utilities for compiling CUDA source to CUBIN.    )annotations)Sequencez	kernel.cuN)namearch
extra_optssourcestrr   r   
str | Noner   Sequence[str] | Nonereturnbytesc             
   C  s  z
ddl m}m} W n ty } ztd|d}~ww |du rzX|d\}||jjkr4td| | \}}||jjkrBd}|	|j
j|\}}	||jjkrYtd| |	|j
j|\}}
||jjkrptd| d|	 |
 }W n ty } z	td	| d
|d}~ww |t| t|ddd\}}||jjkrtd| d|  dg}|r|dd |D  ||t||\}||jjkr||\}}||jjkr|dkrd| }|||\}||jjkrd|d }nd| }nd| }|| t|||\}}||jjkr,|| td| d| }|||\}||jjkrJ|| td| || |S )a  Compile CUDA source code to CUBIN using NVRTC.

    This function uses the NVIDIA Runtime Compilation (NVRTC) library to compile
    CUDA C++ source code into a CUBIN binary that can be loaded and executed
    using the CUDA Driver API.

    Parameters
    ----------
    source : str
        The CUDA C++ source code to compile.

    name : str, optional
        The name to use for the source file (for error messages). Default: "kernel.cu"

    arch : str, optional
        The target GPU architecture (e.g., "sm_75", "sm_80", "sm_89"). If not specified,
        attempts to auto-detect from the current GPU.

    extra_opts : Sequence[str], optional
        Additional compilation options to pass to NVRTC (e.g., ["-I/path/to/include", "-DDEFINE=1"]).

    Returns
    -------
    bytes
        The compiled CUBIN binary data.

    Raises
    ------
    RuntimeError
        If NVRTC compilation fails or CUDA bindings are not available.

    Example
    -------
    .. code-block:: python

        from tvm_ffi.cpp import nvrtc

        cuda_source = '''
        extern "C" __global__ void add_one(float* x, float* y, int n) {
            int idx = blockIdx.x * blockDim.x + threadIdx.x;
            if (idx < n) {
                y[idx] = x[idx] + 1.0f;
            }
        }
        '''

        cubin_bytes = nvrtc.nvrtc_compile(cuda_source)
        # Use cubin_bytes with tvm_ffi.cpp.load_inline and embed_cubin parameter

    r   )drivernvrtczBCUDA bindings not available. Install with: pip install cuda-pythonNz"Failed to initialize CUDA driver: z(Failed to get compute capability major: z(Failed to get compute capability minor: sm_z(Failed to auto-detect GPU architecture: z-. Please specify 'arch' parameter explicitly.z Failed to create NVRTC program: s   --gpu-architecture=s   -default-devicec                 S  s"   g | ]}t |tr| n|qS  )
isinstancer   encode).0optr   r   E/home/ubuntu/.local/lib/python3.10/site-packages/tvm_ffi/cpp/nvrtc.py
<listcomp>   s   " z!nvrtc_compile.<locals>.<listcomp>    zNVRTC compilation failed:
zutf-8z-NVRTC compilation failed (couldn't get log): zNVRTC compilation failed: z%Failed to get CUBIN size from NVRTC: z Failed to get CUBIN from NVRTC: )cuda.bindingsr   r   ImportErrorRuntimeErrorcuInitCUresultCUDA_SUCCESScuCtxGetDevicecuDeviceGetAttributeCUdevice_attribute,CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR	ExceptionnvrtcCreateProgramr   r   nvrtcResultNVRTC_SUCCESSextendnvrtcCompileProgramlennvrtcGetProgramLogSizenvrtcGetProgramLogdecodenvrtcDestroyProgramnvrtcGetCUBINSizenvrtcGetCUBIN)r   r   r   r   r   r   eresultdevicemajorminorprogopts
result_loglog_sizelog_buf	error_msg
cubin_size	cubin_bufr   r   r   nvrtc_compile   s   9
"





r=   )
r   r   r   r   r   r	   r   r
   r   r   )__doc__
__future__r   typingr   r=   r   r   r   r   <module>   s   