o
    c۷il!                     @   s   d Z ddlZddlZddlZddlZddlZddlmZ ddlZej	
ddZer.dej	d< ej	
dddkZdadad	ad
efddZdededB fddZdededB fddZdeeef dB fddZdededefddZdd Zdd Zdd ZdS )a  
System ptxas replacement for CUTLASS DSL.

Usage::

    CUTE_DSL_KEEP_PTX=1 CUTE_DSL_PTXAS_PATH=/usr/local/cuda/bin/ptxas pytest tests/

Environment variables:
    CUTE_DSL_PTXAS_PATH    - Path to ptxas (e.g., /usr/local/cuda/bin/ptxas)
    CUTE_DSL_KEEP_PTX      - Must be set to 1 before cutlass is imported
    CUTE_DSL_PTXAS_VERBOSE - Set to 1 for verbose output
    CUTE_DSL_DUMP_DIR      - Directory for dumped PTX files (default: cwd)
    CUTE_DSL_KEEP_CUBIN    - Set to 1 to save compiled cubin files
    N)PathCUTE_DSL_PTXAS_PATH1CUTE_DSL_KEEP_PTXCUTE_DSL_PTXAS_VERBOSE0Fmsgc                 C   s    t rtd|  tjd d S d S )Nz[ptxas] )file)VERBOSEprintsysstderr)r    r   J/home/ubuntu/vllm_env/lib/python3.10/site-packages/quack/cute_dsl_ptxas.py_log%   s   r   ptx_pathreturnc              
   C   sV   z|   jddddW S  ty* } ztd|  d|  W Y d }~d S d }~ww )Nzutf-8ignore)errors zFailed to read : )
read_bytesdecoderstripOSErrorr   )r   excr   r   r   	_read_ptx*   s   r   c                 C   s&   t | }|d u s| dsd S |S )N})r   r   endswith)r   contentr   r   r   _read_complete_ptx2   s   r    c                    s  t | dd  std dS ttjdt }|jddd t|	ddd	 dd
}td  d|  tdt
| d|   fdd|D }|rvtdt
| d   |D ]}t|}|du rfq[td  d|  ||f  S tdt  dtj}|D ] }t|}|du rq||rtd  d|  ||f  S qt
|dkrt|d }|durtd  d|d   ||d fS td  d|  dS )z*Find dumped PTX for the compiled function.function_nameNz*Compiled function is missing function_nameCUTE_DSL_DUMP_DIRT)parentsexist_okz*.ptxc                 S   s
   |   jS )N)statst_mtime_ns)pathr   r   r   <lambda>D   s   
 z_get_ptx.<locals>.<lambda>)keyreversezSearching dumped PTX for z in zFound z PTX candidate files in c                    s   g | ]	} |j v r|qS r   )name).0r   	func_namer   r   
<listcomp>J   s    z_get_ptx.<locals>.<listcomp>z filename matches for zUsing PTX filename match for r   z
\.entry\s+z	(?:\s|\()zFound PTX for    r   zUsing sole PTX candidate for zNo PTX found for function )getattrr   r   osenvirongetcwdmkdirsortedrgloblenr    recompileescape	MULTILINEsearch)compiled_funcdump_dir	ptx_pathsfilename_matchesr   r   entry_patternr   r-   r   _get_ptx9   sH   
rD   ptx_contentc              	   C   s  t d|}|r|dnd}|  |kr| | | d}z`tdus&J tjtd| ddt	|t	| gd	d	d
}|j
dkrHtd|j | }td| j dt| d| d tjdddkrw| d}|| td|  |W |jd	d S |jd	d w )z(Compile PTX to cubin using system ptxas.z\.target\s+(sm_\d+[a-z]?)r0   sm_90az
.cubin.tmpNz-arch=z-O3z-oT)capture_outputtextr   zptxas failed: z	Compiled z -> z bytes ()CUTE_DSL_KEEP_CUBINr   r   z.cubinzSaved: 
missing_ok)r:   r>   group	read_text
write_textwith_suffixr   
subprocessrunstr
returncodeRuntimeErrorr   r   r   r+   r9   r2   r3   r4   write_bytesunlink)r   rE   matcharch	cubin_tmpresult
cubin_data	cubin_outr   r   r   _compile_ptxi   s,   


"

r^   c              
   C   s  t | }|std t| S |\}}zt||}W n ty7 } ztd| d t| W  Y d}~S d}~ww ddlm  m} ||dddddd\}}||j	j
kr`td| d t| S |  \}	}
tt|}t|}t|}td}td}tjd t|tjtt|tjtt|tj}t| jD ]}||_|
| |jdkrtd t|   S qtd	|j  ts|jd
d ||jgS )z:Replacement for _load_cuda_library that uses system ptxas.z-PTX not found, falling back to embedded ptxaszCompilation failed (z!), falling back to embedded ptxasNr   zcudaLibraryLoadData failed (   z:cuda_load_to_device failed, falling back to embedded ptxaszLoaded kernel from TrK   )rD   r   _original_load_cuda_libraryr^   	Exceptioncuda.bindings.runtimebindingsruntimecudaLibraryLoadDatacudaError_tcudaSuccess_get_cuda_init_and_loadctypesc_void_pintpointerc_int32castrangenum_devicesvaluer+   _user_wanted_ptxrW   cudaLibrary_t)selfr[   rE   r   cubinecuda_runtimeerrlibrary_cuda_load_to_device
lib_handle
ptr_to_libptr_to_ptr_to_libdev_iderr_valargsdevr   r   r   _patched_load_cuda_library   sN   




r   c                 C   s8   t | dd d u r|  | _tdt| j d t| S )N_ptxas_cuda_libraryzLoaded z0 CUDA libraries before creating TVM FFI function)r1   _load_cuda_libraryr   r   r9   !_original_create_tvm_ffi_function)rt   r   r   r    _patched_create_tvm_ffi_function   s   
r   c                  C   s   t dusJ tjt rtt tjstdt  tjdddka	tjdddks0J dd} t
jjj}|jturD|jat|_d} d	d
lm} |jturW|jat|_d} | rbtdt   dS td dS )z9Install system ptxas hook. Call before importing cutlass.Nzptxas not found: r   r   r   z1Require CUTE_DSL_KEEP_PTX=1 to use system's ptxasFTr   )TVMFFIJitCompiledFunctionBasez"Installed system ptxas patch with z$System ptxas patch already installed)r   r2   r'   isfileaccessX_OKrU   r3   r4   rr   cutlasscutlass_dslcuda_jit_executorCudaDialectJitCompiledFunctionr   r   r`   $cutlass.cutlass_dsl.tvm_ffi_providerr   _create_tvm_ffi_functionr   r   r   )patchedcuda_jit_function_clsr   r   r   r   patch   s.   

r   )__doc__r2   r   r:   ri   rQ   pathlibr   r   r3   r4   r   r
   r`   r   rr   rS   r   r   r    tuplerD   bytesr^   r   r   r   r   r   r   r   <module>   s.    
0$5
