o
    ڗi>                     @   s<  d dl mZmZ d dlmZmZmZmZ d dlm	Z	 d dl
Z
d dlmZmZmZmZ d dlmZ d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ defd	d
Ze
 defddZe
 dd Ze
 defddZ dd Z!e
 dd Z"e
ddd Z#e	ddG dd dZ$G dd deZ%dS )    )BaseBackend	GPUTarget)irpassesllvmnvidia)	dataclassN)AnyDictTupleOptional)
ModuleType)Pathtargetc                 C   s   dd S )Nc                 S   s   |   rdS dS )N)       r   )r   r   r   )is_int8)lhsTyperhsType r   ]/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/triton/backends/nvidia/compiler.py<lambda>   s    zmin_dot_size.<locals>.<lambda>r   r   r   r   r   min_dot_size   s   r   binaryc                 C   s   t jd|   ddt jt jtd| g}|D ]5}t j|rPt j	|rPt
j|dgt
jd}|d urPtjd|dtjd	}|d urP||d
f  S qtd|  )NTRITON__PATH bin	--version)stderrz.*release (\d+\.\d+).*utf-8flags   zCannot find )osenvirongetupperpathjoindirname__file__existsisfile
subprocesscheck_outputSTDOUTresearchdecode	MULTILINEgroupRuntimeError)r   pathsr   resultversionr   r   r   _path_to_binary   s   r;   c                  C   s    t tdd dgd} | S )Nptxasr   r   r!   )r/   r0   r;   r4   )r:   r   r   r   get_ptxas_version&   s   r=   returnc                 C   sr   t | tsJ tt| d\}}|dkr#|dk rd| S |dkr#dS |dkr+d| S |dkr3d	| S td
|  )zK
    Get the highest PTX version supported by the current CUDA driver.
    .      P   U      F   
   ?   z?Triton only support CUDA 10.0 or higher, but got CUDA version: )
isinstancestrmapintsplitr7   )cuda_versionmajorminorr   r   r   ptx_get_version,   s   rP   c                 C   s&   | j }|d u rtd\}}t|}|S )Nr<   )ptx_versionr;   rP   )optionsrQ   _rM   r   r   r   get_ptx_version_from_options?   s
   rT   c                 C   s    t | }td|}d| }|S )NS   z+ptx)rT   min)rR   rQ   llvm_ptx_versionfeaturesr   r   r   get_featuresG   s   

rY   c                 C   s@   t | d}t|  W  d    S 1 sw   Y  d S )Nrb)openhashlibsha256read	hexdigest)r)   fr   r   r   	file_hashU   s   $ra   T)frozenc                   @   s  e Zd ZU dZeed< dZeed< dZeed< dZeed< dZ	eed	< dZ
eed
< dZeed< dZee ed< dZeed< dZeed< dZeed< dZee ed< dZee ed< dZeed< dZee ed< dZeed< dZeed< dZeed< dZeed < dZeed!< d"d# Zd$d% Z dS )&CUDAOptions   	num_warpsr$   num_ctas   
num_stagesr   num_buffers_warp_specnum_consumer_groupsreg_dec_producerreg_inc_consumerNmaxnreg)r$   r$   r$   cluster_dimsrQ   Tenable_fp_fusion)fp8e5fp8e4b15supported_fp8_dtypesr   deprecated_fp8_dtypestf32default_dot_input_precision)rt   tf32x3ieeeallowed_dot_input_precisionsmax_num_imprecise_acc_defaultextern_libsFdebugcudabackend_namesanitize_overflowc                 C   s   t tjd }| jd u ri nt| j}|dd s%tdt|d |d< t	
| dt|  | jdkr?| j| jd @ dksCJ dd S )	Nlib	libdeviceTRITON_LIBDEVICE_PATHzlibdevice.10.bcrz   r   r$   znum_warps must be a power of 2)r   r,   parentrz   dictr'   r%   getenvrI   object__setattr__tupleitemsre   )selfdefault_libdirrz   r   r   r   __post_init__t   s    zCUDAOptions.__post_init__c                 C   sX   t | j}tdd t|d D |d< ddd t| D }t|d	 S )Nc                 s   s     | ]\}}|t |fV  qd S N)ra   ).0kvr   r   r   	<genexpr>   s    z#CUDAOptions.hash.<locals>.<genexpr>rz   rS   c                 S   s   g | ]\}}| d | qS )-r   )r   namevalr   r   r   
<listcomp>       z$CUDAOptions.hash.<locals>.<listcomp>r!   )
r   __dict__r   sortedr*   r   r\   r]   encoder_   )r   	hash_dictkeyr   r   r   hash}   s   
zCUDAOptions.hash)!__name__
__module____qualname__re   rK   __annotations__rf   rh   ri   rj   rk   rl   rm   r   rn   r   rQ   ro   boolrr   r   rI   rs   ru   rx   ry   rz   r   r{   r}   r~   r   r   r   r   r   r   rc   [   s.   
 	rc   c                       s   e Zd ZedefddZdeddf fddZdefdd	Zd
d Z	dd Z
deeef fddZdd Zedd Zedd Zedd Zedd Zedd Zdd Ze dd Z  ZS ) CUDABackendr   c                 C   s
   | j dkS )Nr|   )backendr   r   r   r   supports_target   s   
zCUDABackend.supports_targetr>   Nc                    s.   t  | |j| _t| jtsJ d| _d S )Ncubin)super__init__arch
capabilityrH   rK   
binary_ext)r   r   	__class__r   r   r      s   
zCUDABackend.__init__c                    s    fddt j D }d|vr'tt j}| jdkr|d tt||d< d|vr4| jdkr4d|d< d	|vrBt	
d
ddk|d	< | jdkrIdnd|d< t di |S )Nc                    s   i | ]}| v r| | qS r   r   )r   r   optsr   r   
<dictcomp>   r   z-CUDABackend.parse_options.<locals>.<dictcomp>rr   Y   fp8e4nvrs   Z   )rq   ro   TRITON_DEFAULT_FP_FUSION1i   @r   ry   r   )rc   __dataclass_fields__keyssetrr   r   addr   r   r%   r   )r   r   argsrr   r   r   r   parse_options   s   



zCUDABackend.parse_optionsc                 C   s(   |j |j|j|jd |jd |jd fS )Nr   r$      )re   rf   sharedrn   )r   metadatar   r   r   pack_metadata   s   zCUDABackend.pack_metadatac                 C   s>   dd l m  m  m} | jdkr|jn|jt| jd}|S )Nr   rB   )convert_custom_typesr   )	triton.language.extra.cudalanguageextrar|   r   convert_custom_float8_sm80convert_custom_float8_sm70r   r   )r   r|   codegen_fnsr   r   r   get_codegen_implementation   s
   z&CUDABackend.get_codegen_implementationc                 C   s   ddl m} d|iS )Nr   )r   ztriton.language.extra.libdevice)r   r   )r   r   r   r   r   get_module_map   s   zCUDABackend.get_module_mapc                 C   s   t | d S r   )r   load_dialects)r   ctxr   r   r   r      s   zCUDABackend.load_dialectsc                 C   s   t | j}|  tj| tj| tj	| tj
| tj| tj| tj| tj| tj| ||  | S r   )r   pass_managercontextenable_debugr   commonadd_inlinerttiradd_rewrite_tensor_pointeradd_combineadd_canonicalizeradd_reorder_broadcastadd_cseadd_licmadd_symbol_dceadd_loop_unrollrun)modr   optpmr   r   r   	make_ttir   s   
zCUDABackend.make_ttirc                 C   s`  t  }|jd ur|jd |_|jd |_|jd |_tjdddkr5t	
 }t|| j}| jd t| j}|  tj|d| |jd	|j tj| |d
 dkr`tj| t jj|| tj| tj| tj| tj| tj||dk tj | |d
 dkrtj!| tj"| tj#||j$ tj%||j$ tj&||j$ tj'||j(|j$|j)|j* tj+||j, tj-||j$ tj.| tj||dk tj| tj/| tj0| tj | tj1| |d
 dkrt jj2| t jj3| tj4| |5|  |j|j|jf|d< | S )Nr   r$   r   MLIR_ENABLE_REMARK0r   Tzcuda:r   rF      rB   	   rn   )6r   ClusterInforn   clusterDimXclusterDimYclusterDimZr%   r&   r'   r   
source_mgrr   source_mgr_diagr   printOpOnDiagnosticr   r   r   r   add_convert_to_ttgpuirre   rf   ttgpuiradd_coalesceadd_f32_dot_tc	ttnvgpuiradd_plan_ctaadd_remove_layout_conversionsadd_optimize_thread_localityadd_accelerate_matmuladd_optimize_dot_operandsr   r   add_optimize_accumulator_init add_combine_tensor_select_and_ifadd_ws_task_partitionrj   add_taskid_propagateadd_ws_data_partitionadd_ws_code_partitionri   rk   rl   add_pipelinerh   add_ws_loweringadd_prefetchadd_reduce_data_duplicationadd_reorder_instructionsr   add_fence_insertionadd_tma_loweringr   r   )r   r   r   r   cluster_infosrcMgrdiagr   r   r   r   
make_ttgir   s^   

zCUDABackend.make_ttgirc                 C   s  t |}| d}|d ur|d  |9  < | }t|j}|  tjdddkr;t	
 }t||j}	|jd tjj| tj| tj| tj| tj| tjj||| tjj| tj| tj| tj| tj| tjdddkrtj| | | t	!  t	 }
t	"||
}|dkrd	nd
| }t#|}d}t	$|||| t%| |j&d ur|' D ]}|( s|) r|*|j& q|j+rdd |j+D }t	,|| t	-|t	j. | d|d< t/|}~~
|S )Nz"triton_gpu.num-warp-groups-per-ctare   r   r   r   TTRITON_DISABLE_LINE_INFOr   sm_90asm_nvptx64-nvidia-cudac                 S   s   g | ]\}}|qS r   r   )r   r   r)   r   r   r   r   1  s    z)CUDABackend.make_llir.<locals>.<listcomp>ztriton_gpu.sharedr   )0rT   get_int_attrr   r   r   r   r%   r&   r'   r   r   r   r   r   r   r   %add_decompose_unsupported_conversionsr   convertadd_scf_to_cfadd_index_to_llvmiradd_allocate_shared_memoryadd_to_llvmirr   add_nvgpu_to_llvmadd_arith_to_llvmirr   r   r   r   llvmiradd_di_scoper   init_targets	to_modulerY   attach_datalayoutset_nvvm_reflect_ftzrm   get_functionsis_declarationis_external_linkageset_nvvm_maxnregrz   link_extern_libsoptimize_moduleOPTIMIZE_O3rI   )srcr   rR   r   rQ   num_warp_groupsr   r   r   r   r   llvm_modprocrX   tripler   r8   retr   r   r   	make_llir   s^   



zCUDABackend.make_llirc           
   	   C   s   t |}d}|dkrdnd| }t|}t| |||dg|jd}td|}	t|	dks0J |	d	 |d
< |d  d|d  }tjdd| |tj	d}tdd|}t
jdddkrftd t| |S )Nr  r   r  r  znvptx-short-ptrFz(.visible .entry ([a-zA-Z_][a-zA-Z0-9_]*)r$   r   r   rF   r?   z\.version \d+\.\d+z	.version r"   z,\s*debug|debug,\s*r   NVPTX_ENABLE_DUMPr   r   z // -----// NVPTX Dump //----- //)rT   rY   r   translate_to_asmro   r2   findalllensubr5   r%   r&   r'   print)
r  r   r   r   rQ   r   r  rX   r!  namesr   r   r   make_ptx=  s   zCUDABackend.make_ptxc                 C   s|  t d\}}tjdddd$}tjdddd}||  |  |jd }tjd	r0g nd
g}	|j	r8g ndg}
|dkrAdnd}tjdddkrPddgng }|g|	|
d|d| | |jd|}z%t
j|dd|d tj|jrt|j tj|jrt|j W n\ t
jy } zOt|j}| }W d    n1 sw   Y  tj|jrt|j |jdkrd}n|jdtj krd}nd|j }t| d| dd| d d }~ww t|d!}| }W d    n	1 sw   Y  tj|rt| W d    n1 sw   Y  W d    |S W d    |S 1 s7w   Y  |S )"Nr<   Fwz.ptx)deletemodesuffixrz.logz.or  z	-lineinfoz--fmad=falser   ar   DISABLE_PTXAS_OPTr   r   z--opt-levelz-vz--gpu-name=sm_z-oT)check	close_fdsr       z!Internal Triton PTX codegen error   z`ptxas` raised SIGSEGVz`ptxas` failed with error code z
`ptxas` stderr:
z
Repro command:  
rZ   )r;   tempfileNamedTemporaryFilewriteflushr   r%   r&   r'   ro   r/   r   r)   r-   removeCalledProcessErrorr[   r^   
returncodesignalSIGSEGVr7   r*   )r  r   r   r   r<   rS   fsrcflogfbin	line_infofmadr.  	opt_level	ptxas_cmdelog_filelogerrorr`   r   r   r   r   
make_cubinS  s   






*(((zCUDABackend.make_cubinc                    s^    fdd|d<  fdd|d<  fdd|d<  fdd|d	<  fd
d|d< d S )Nc                    s    | | S r   )r   r  r   rR   r   r   r   r     s    z(CUDABackend.add_stages.<locals>.<lambda>r   c                        | | jS r   )r  r   rM  rN  r   r   r         ttgirc                    rO  r   )r"  r   rM  rN  r   r   r     rP  llirc                    rO  r   )r*  r   rM  rN  r   r   r     rP  ptxc                    rO  r   )rL  r   rM  rN  r   r   r     rP  r   r   )r   stagesrR   r   rN  r   
add_stages  s
   zCUDABackend.add_stagesc                 C   s   t  }| d| j S )Nr   )r=   r   )r   r:   r   r   r   r     s   zCUDABackend.hash)r   r   r   staticmethodr   r   r   r	   r   r   r   r
   rI   r   r   r   r   r  r"  r*  rL  rU  	functools	lru_cacher   __classcell__r   r   r   r   r      s,    
	

4
<

,r   )&triton.backends.compilerr   r   triton._C.libtritonr   r   r   r   dataclassesr   rW  typingr	   r
   r   r   typesr   r\   r2   r8  r?  r%   r/   pathlibr   r   rX  rI   r;   r=   rK   rP   rT   rY   ra   rc   r   r   r   r   r   <module>   s8    


(