o
    Gۂi%\                     @   sx  d dl mZmZmZ d dlmZmZmZmZ d dl	m
Z
 d dlmZ d dlmZ d dlZd dlmZmZmZmZ d dlmZ d dlZd dlZd dlZd dlZd dlZd dlZd d	lmZ d
efddZde de
j!fddZ"e# d%de fddZ$e# de fddZ%de fddZ&e# de fddZ'e#ddd Z(de fddZ)edd G d!d" d"Z*G d#d$ d$eZ+dS )&    )BaseBackend	GPUTargetLanguage)irpassesllvmnvidia)knobs)
PTXASError)	dataclassN)AnyDictTupleOptional)
ModuleType)Pathtargetc                 C   s   dt tttf fdd}|S )Nreturnc                 S   s0   | j j}|j j}||ksJ d|dkrdS dS )Nz%lhs and rhs bitwidth must be the same   )   r       )r   r      )scalarprimitive_bitwidth)lhs_typerhs_typelhs_bitwidthrhs_bitwidth r   b/home/ubuntu/maya3_transcribe/venv/lib/python3.10/site-packages/triton/backends/nvidia/compiler.pycheck_dot_compatibility   s   z-min_dot_size.<locals>.check_dot_compatibility)r   int)r   r    r   r   r   min_dot_size   s   
r"   archr   c                 C   s   | dkrt jjS t jjS )Nd   )r	   r   ptxas_blackwellptxas)r#   r   r   r   	get_ptxas"   s   r'   P   c                 C   s2   t jj}|d ur
|S tt| jdgd}|S )Nz	--versionutf-8)r	   r   mock_ptx_version
subprocesscheck_outputr'   pathdecode)r#   mock_verversionr   r   r   get_ptxas_version&   s
   r1   c                 C   s   t | tsJ tt| d\}}|dkr#|dk rd| S d| d S |dkr+d| S |dkr3d	| S |d
krCd}||d
 d  | S td|  )zK
    Get the highest PTX version supported by the current CUDA driver.
    .      r(   r      F   
   ?      Z   z?Triton only support CUDA 10.0 or higher, but got CUDA version: )
isinstancestrmapr!   splitRuntimeError)cuda_versionmajorminorbase_ptxr   r   r   ptx_get_version/   s   rD   c                 C   s$   | j }|d u rt|j}t|}|S N)ptx_versionr'   r0   rD   )optionsr#   rF   r@   r   r   r   get_ptx_version_from_optionsG   s
   
rH   c                 C   s"   t | |}td|}d| }|S )NV   z+ptx)rH   min)rG   r#   rF   llvm_ptx_versionfeaturesr   r   r   get_featuresO   s   


rM   c                 C   s@   t | d}t|  W  d    S 1 sw   Y  d S )Nrb)openhashlibsha256read	hexdigest)r-   fr   r   r   	file_hash]   s   $rU   
capabilityc                 C   s   | dkrdnd}d|  | S )Nr:   a sm_r   )rV   suffixr   r   r   sm_arch_from_capabilityc   s   r[   T)frozenc                   @   sN  e Zd ZU dZeed< dZeed< dZeed< dZeed< d	Z	e
e ed
< d	Zeed< ejjZe
e ed< d	Ze
e ed< dZeed< dZeed< dZeed< dZeed< dZee ed< dZee ed< dZeed< dZee ed< d	Zeed< d	Zeed< dZeed< dZ eed < dZ!eed!< d	Z"eed"< d#Z#eed$< d%d& Z$d'd( Z%d	S ))CUDAOptions   	num_warpsr   num_ctas   
num_stagesr   	warp_sizeNmaxnregrF   ptx_optionsir_overrideTenable_fp_fusionenable_reflect_ftzFlaunch_cooperative_grid
launch_pdl)fp8e5fp8e4b15supported_fp8_dtypesr   !deprecated_fp8_dot_operand_dtypestf32default_dot_input_precision)ro   tf32x3ieeebf16x3bf16x6allowed_dot_input_precisionsmax_num_imprecise_acc_defaultextern_libsdebugcudabackend_namesanitize_overflowr#   rX   instrumentation_modec                 C   s   t tjd }| jd u ri nt| j}|dd s%tjjp"t	|d |d< t
| dt|  | jdkr?| j| jd @ dksCJ dd S )Nlib	libdevicezlibdevice.10.bcrw   r   r   znum_warps must be a power of 2)r   __file__parentrw   dictgetr	   r   libdevice_pathr<   object__setattr__tupleitemsr_   )selfdefault_libdirrw   r   r   r   __post_init__   s    zCUDAOptions.__post_init__c                 C   sX   t | j}tdd t|d D |d< ddd t| D }t|d	 S )Nc                 s   s     | ]\}}|t |fV  qd S rE   )rU   ).0kvr   r   r   	<genexpr>   s    z#CUDAOptions.hash.<locals>.<genexpr>rw   _c                 S   s   g | ]\}}| d | qS )-r   )r   namevalr   r   r   
<listcomp>   s    z$CUDAOptions.hash.<locals>.<listcomp>r)   )
r   __dict__r   sortedjoinr   rP   rQ   encoderS   )r   	hash_dictkeyr   r   r   hash   s   
zCUDAOptions.hash)&__name__
__module____qualname__r_   r!   __annotations__r`   rb   rc   rd   r   rF   r	   r   ptxas_optionsre   r<   rf   rg   boolrh   ri   rj   rm   r   rn   rp   ru   rv   rw   r   rx   rz   r{   r#   r|   r   r   r   r   r   r   r]   i   s4   
 
r]   c                       s   e Zd ZdZedefddZdd Zdefdd	Z	deddf fd
dZ
defddZdd Zdd Zdeeef fddZdd Zedd Zedd Zdd Zdd Zdd Zd d! Zd"d# Ze d$d% Z  ZS )&CUDABackendNr   c                 C   s
   | j dkS )Nry   )backend)r   r   r   r   supports_target   s   
zCUDABackend.supports_targetc                 C   s0   d}t ||}|std| t|dS )Nz	^sm(\d+)$z(TRITON_OVERRIDE_ARCH must have the form r   )re	fullmatch
ValueErrorr!   group)r   r#   patternmatchr   r   r   _parse_arch   s
   zCUDABackend._parse_archr   c                 C   s   |  |j}d| S )Ncuda:)r   r#   )r   rG   rV   r   r   r   get_target_name      
zCUDABackend.get_target_namec                    s   t  | d| _d S )Ncubin)super__init__
binary_ext)r   r   	__class__r   r   r      r   zCUDABackend.__init__c                    s  d v r d dkrd d< dt jjpd| jj i}| fddtj D  t	| 
|d }|d	d
d
krG|dk rGtd| dd|vrattj}|dkrY|d tt||d< d|vrm|dkrmd|d< d|vrwt jj|d< |dkr}dnd|d< tdi |S )Nr|   consanTrx   r#   smc                    s*   i | ]}| v r | d ur| | qS rE   r   )r   r   optsr   r   
<dictcomp>   s   * z-CUDABackend.parse_options.<locals>.<dictcomp>r`   r   r:   zBnum_ctas > 1 requires NVIDIA SM90+ (Hopper). Current target is sm_zM. This configuration will fail. Please set num_ctas=1 or target an SM90+ GPU.rm   Y   fp8e4nvrn   )rl   rg   i   @r   rv   r   )r	   runtimeoverride_archr   r#   updater]   __dataclass_fields__keysr!   r   r   r   setrm   addr   r   languagedefault_fp_fusion)r   r   argsrV   rm   r   r   r   parse_options   s*   


zCUDABackend.parse_optionsc                 C   s   |j |j|jfS rE   )r_   r`   shared)r   metadatar   r   r   pack_metadata   s   zCUDABackend.pack_metadatac                 C   sL   dd l m  m  m} t| |j}|dkr|jn|jt	| j
d}|S )Nr   r(   )convert_custom_typesr"   )triton.language.extra.cudar   extrary   r!   r   r#   convert_custom_float8_sm80convert_custom_float8_sm70r"   r   )r   rG   ry   rV   codegen_fnsr   r   r   get_codegen_implementation   s   z&CUDABackend.get_codegen_implementationc                 C   s   ddl m} d|iS )Nr   )r~   ztriton.language.extra.libdevice)r   r~   )r   r~   r   r   r   get_module_map   s   zCUDABackend.get_module_mapc                 C   s$   t | tjrtj| d S d S rE   )r   load_dialectsr   instrumentation)r   ctxr   r   r   r      s   
zCUDABackend.load_dialectsc                 C   s   t | j}|  tj| tj| |d dk r"tj	| tj
| tj| tj| tj| tj| tj| || d | S )Nr7   	   	make_ttir)r   pass_managercontextenable_debugr   commonadd_inlinerttiradd_rewrite_tensor_pointer(add_rewrite_tensor_descriptor_to_pointeradd_canonicalizeradd_combineadd_reorder_broadcastadd_cseadd_symbol_dceadd_loop_unrollrun)modr   optrV   pmr   r   r   r      s   zCUDABackend.make_ttirc                 C   sb  |j d ur| dt| j|j  t| j}| }|d dk}tj	
|d| |jd|j tj| tj|| tjj| tj| tj| tj| tj| tj||dk tjj| tj	| |d dv rtj| tj| tj	| tj| tj| tjj||j | tj!||j  tj"| tj#||j | nm|d dkr!tj| tj| tj	| tj$| tj%|d tjj&| tj!||j  tj"| tj'||j  tj#||j | tj(| tj| tj%|d	 tjj)| ntj	| tj| tj	| tj*| tj||dk tj+| tjj,| |d d
kr]tjj-| tj| tjj.| tj/| tj0| tj	| tj1| tjj2|| tjj3| tj4| tj5| tj| |6| d | 7 |d< | S )Nzttg.maxnregr7   r   r   r   r(   )r   r   FTr   
make_ttgirtensordesc_meta)8rd   set_attrr   builderr   get_int32_attrr   r   r   r   add_convert_to_ttgpuirr_   r`   ttgpuiradd_coalesceadd_f32_dot_tcr   	ttnvgpuiradd_plan_ctaadd_remove_layout_conversionsadd_optimize_thread_localityadd_accelerate_matmuladd_optimize_dot_operands add_optimize_descriptor_encodingadd_loop_aware_cseadd_fuse_nested_loopsr   r   add_triton_licm add_combine_tensor_select_and_ifhopperadd_hopper_warpspecrb   add_assign_latenciesadd_schedule_loopsadd_pipelineadd_optimize_accumulator_initadd_hoist_tmem_allocadd_promote_lhs_to_tmemadd_warp_specializeadd_optimize_partition_warpsadd_remove_tmem_tokensadd_prefetchadd_coalesce_async_copyadd_optimize_tmem_layoutsadd_tma_loweringadd_interleave_tmemadd_reduce_data_duplicationadd_reorder_instructionsr   add_fence_insertionadd_lower_mmaadd_sccpr   r   get_tensordesc_metadata)r   r   r   rV   r   dump_enabledemuTF32r   r   r   r      s   
zCUDABackend.make_ttgirc                 C   s   |}t |j}|  tj| tj| tj| t	jj
| tj| tj| tj| tj| tj| ||d | |d< |S )Ngluon_to_ttgirr   )r   r   r   r   r   gluonr   add_infer_coalesced_encodingsadd_resolve_auto_encodingsr   r   r
  r   r   r  r   r   r   r   r   r  )r   srcr   rG   rV   r   r   r   r   r   r  @  s   zCUDABackend.gluon_to_ttgirc                 C   s*  t || jj}|}t|j}|  tj	| tj
| tj| tj| tjj||| tjj| tjj| tjjdkrNtj| tj| tjj|| tjrhtjd||j tjj||| tj| tj | tjj!| tjj"| tj| tj | tj#| tj$| tjj%stjj&stj'(| tjrtjd||j |)|d tjj&rtjj%st|j}|  tj'(| |)|d t|j}|  tj'*| |)|d t+,  t+ }tjj-rt.dt+/||}	t0|}
t1|| jj}d}t2  t+3|	||
| |j4r0t5|	 |j6rHt7|	rHd	d
 |j6D }t+8|	| t+9|	t+j: |;d}|d ur]||d< |;d|d< |;d|d< |;d|d< |;d|d< |;dpd|d< |;dpd|d< t<|	}~	~|S )Nr   ttgpuir_to_llvmirllvmir_to_llvm	make_llirzmake_llir.disable_line_infoz,make_llir.dump_ir_extract_di_local_variableszYAddress Sanitizer Error: Address sanitizer is currently only supported on the AMD backendnvptx64-nvidia-cudac                 S   s   g | ]\}}|qS r   r   )r   r   r-   r   r   r   r         z)CUDABackend.make_llir.<locals>.<listcomp>zttg.total-num-warpsr_   z
ttg.sharedr   zttg.tensor_memory_size	tmem_sizezttg.global_scratch_memory_sizeglobal_scratch_sizez#ttg.global_scratch_memory_alignmentglobal_scratch_alignzttg.profile_scratch_memory_sizer   profile_scratch_sizez$ttg.profile_scratch_memory_alignmentr   profile_scratch_align)=rH   r   r#   r   r   r   r   r   r   r   add_allocate_warp_groupsconvertadd_scf_to_cfr  r   r   add_allocate_shared_memory_nvr   add_allocate_tensor_memoryadd_check_matmul_two_ctar	   compilationr|   add_concurrency_sanitizer"add_allocate_global_scratch_memoryadd_proxy_fence_insertionr   r   patchadd_to_llvmirr   r   r   add_nvgpu_to_llvmadd_warp_specialize_to_llvmr   add_nvvm_to_llvmdisable_line_info"dump_ir_extract_di_local_variablesllvmiradd_di_scoper   add_di_local_variabler   init_targetsenable_asanr?   	to_moduler[   rM   set_short_ptrattach_datalayoutrh   set_nvvm_reflect_ftzrw   has_extern_depslink_extern_libsoptimize_moduleOPTIMIZE_O3get_int_attrr<   )r   r  r   rG   rV   rF   r   r   r   llvm_modprocrL   triplepathstotal_num_warpsretr   r   r   r  S  s   



zCUDABackend.make_llirc              	   C   s   t || jj}d}t|}t|| jj}dg}	t|||||	|jd}
t	d|
}t
|dks1J |d |d< |d  d	|d  }tjd
d| |
tjd}
tjdd| |
tjd}
tjjsgtdd|
}
tjjrstd t|
 |
S )Nr  znvptx-mad-wide-optFz(.visible .entry ([a-zA-Z_][a-zA-Z0-9_]*)r   r   r   r7   r2   z\.version \d+\.\d+z	.version )flagsz\.target sm_\d+z.target sm_z,\s*debug|debug,\s*rX   z // -----// NVPTX Dump //----- //)rH   r   r#   r[   rM   r   translate_to_asmrg   r   findalllensub	MULTILINEr	   r)  r3  r   
dump_nvptxprint)r   r  r   r   rV   rF   rD  rC  rL   rH  rG  namesr   r   r   make_ptx  s$   zCUDABackend.make_ptxc                 C   s   t | jjj}tjdddde}tjddddB}|| |  |jd }g }	t	j
jr6|	dd	g7 }	nt	jjr@|	d
g7 }	n|	dg7 }	|jrJg ndg}
t|}t	jjrYddgng }|jrd|jdng }|g|	|
d||d| |jd|}zDtj|dd|d t	jjrt|j}t|  W d    n1 sw   Y  tj|jrt|j tj|jrt|j W nk tjy. } z]t|j}| }W d    n1 sw   Y  tj|jrt|j |jdkrd}n|jdtj krd}nd|j }| d| dd| d}td| d| d t |d }~ww t|d}| }W d    n	1 sDw   Y  tj|rUt| W d    n1 s`w   Y  W d    |S W d    |S 1 syw   Y  |S ) NFwz.ptx)deletemoderZ   rz.logz.oz	-lineinfoz-suppress-debug-infoz-gz--fmad=falsez--opt-level0 z-vz--gpu-name=z-oT)check	close_fdsstderr   z!Internal Triton PTX codegen error   z`ptxas` raised SIGSEGVz`ptxas` failed with error code z
`ptxas` stderr:
z
Repro command: 
zC

================================================================
z

zy
================================================================
please share the reproducer above with Triton project.
rN   )!r'   r   r#   r-   tempfileNamedTemporaryFilewriteflushr   r	   r)  r2  r   disable_ptxas_optrg   r[   re   r>   r+   r   dump_ptxas_logrO   rO  rR   osexistsremoveCalledProcessError
returncodesignalSIGSEGVr   r
   )r   r  r   r   rV   r&   fsrcflogfbin
debug_infofmadr#   disable_optptx_extra_options	ptxas_cmdlog_fileelogerrorrT   r   r   r   r   
make_cubin  s   





	

*HHHzCUDABackend.make_cubinc                    s    j |tjkr  fdd|d<  fdd|d< n|tjkr/ fdd|d<  fdd|d< fd	d|d
< fdd|d< tjjd ur]tj||  d S d S )Nc                        | | S rE   )r   r  r   rV   rG   r   r   r   <lambda>      z(CUDABackend.add_stages.<locals>.<lambda>r   c                    rx  rE   )r   ry  rz  r   r   r{    r|  ttgirc                    rx  rE   )r  ry  rz  r   r   r{    r|  c                    rx  rE   )r  ry  rz  r   r   r{     r|  llirc                        | | jjS rE   )rQ  r   r#   ry  rG   r   r   r   r{  !  r  ptxc                    r  rE   )rw  r   r#   ry  r  r   r   r{  "  r  r   )r   r#   r   TRITONGLUONr	   r   add_stages_inspection_hook)r   stagesrG   r   r   rz  r   
add_stages  s   

zCUDABackend.add_stagesc                 C   s   t | jj}| d| jj S )Nr   )r1   r   r#   )r   r0   r   r   r   r   &  s   zCUDABackend.hash)r   r   r   r   staticmethodr   r   r   r<   r   r   r   r   r   r   r   r   r   r   r   r   r  r  rQ  rw  r  	functools	lru_cacher   __classcell__r   r   r   r   r      s.    


I`Lr   )r(   ),triton.backends.compilerr   r   r   triton._C.libtritonr   r   r   r   tritonr	   triton.runtime.errorsr
   dataclassesr   r  typingr   r   r   r   typesr   rP   r   r^  ri  rd  r+   pathlibr   r"   r!   
NvidiaToolr'   r  r1   rD   rH   rM   rU   r[   r]   r   r   r   r   r   <module>   s<    
,