o
    GۂiY                     @   s   d dl mZmZmZ d dlmZmZmZmZ d dl	m
Z
 d dlmZ d dlmZmZmZ d dlmZ d dlZd dlZd dlZd dlZd dlZd dlmZ d	efd
dZdd Zdd ZeddG dd dZG dd deZdS )    )BaseBackend	GPUTargetLanguage)irpassesllvmamd)knobs)	dataclass)AnyDictTuple)
ModuleTypeN)Pathtargetc                 C   s   dd S )Nc                 S   s   dS )N)   r   r    )lhs_typerhs_typer   r   _/home/ubuntu/maya3_transcribe/venv/lib/python3.10/site-packages/triton/backends/amd/compiler.py<lambda>   s    z"get_min_dot_size.<locals>.<lambda>r   r   r   r   r   get_min_dot_size   s   r   c                 C   s,   t jjd u r| dkp| dko|du S t jjS )Ngfx942gfx950T)r	   r   use_block_pingpong)archuse_async_copyr   r   r   is_pingpong_schedule_enabled   s   r   c                 C   s   t jjd u r
| dkS t jjS )Nr   )r	   r   use_in_thread_transposer   r   r   r   is_in_thread_transpose_enabled   s   r!   T)frozenc                   @   s&  e Zd ZU dZeed< dZeed< dZeed< dZeed< d	Z	e
ed
< dZeed< dZeed< d	Zeed< dZee ed< dZee ed< dZeed< dZee ed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< d Zeed!< d"Zeed#< d$d% Zd&d' Zd	S )(
HIPOptions   	num_warpsr   waves_per_eu   
num_stagesr   num_ctasNextern_libsFdebugTsanitize_overflowr   )fp8e4nvfp8e5fp8e5b16fp8e4b8supported_fp8_dtypesr   !deprecated_fp8_dot_operand_dtypesieeedefault_dot_input_precision)r3   bf16x3bf16x6allowed_dot_input_precisionsenable_fp_fusionlaunch_cooperative_gridmatrix_instr_nonkdimkpackallow_flush_denormmax_num_imprecise_acc_defaulthipbackend_name instrumentation_modenoneschedule_hintc                 C   s   t | jdd }|dkrdnd}t| d| | jdkr'| j| jd @ dks+J d	| jd
krF| jdkrFtd| j d t| dd tt	j
d }| jd u rTi nt| j}dD ]}t|| d ||< q[t| dt|  d S )N   
       @   	warp_sizer   r   znum_warps must be a power of 2r   zckpack is deprecated starting from gfx950 and will be removed in later releases. So for now kpack = z7 will be overwritten to 1 to make transitioning easier.r;   lib)ocmlocklz.bcr*   )intr   object__setattr__r%   r;   warningswarnr   __file__parentr*   dictstrtupleitems)self	gfx_majorrI   default_libdirr*   rJ   r   r   r   __post_init__N   s     zHIPOptions.__post_init__c                 C   s.   d dd | j D }t|d S )N_c                 S   s   g | ]\}}| d | qS )-r   ).0namevalr   r   r   
<listcomp>b   s    z#HIPOptions.hash.<locals>.<listcomp>utf-8)join__dict__rW   hashlibsha256encode	hexdigest)rX   keyr   r   r   hasha   s   zHIPOptions.hash) __name__
__module____qualname__r%   rM   __annotations__r&   r(   r)   r*   rT   r+   boolr,   r   rU   r1   r   r2   r4   r7   r8   r9   r:   r;   r<   r=   r?   rA   rC   r[   rj   r   r   r   r   r#      s0   
 r#   c                       s  e Zd ZdZdZedefddZdeddf fddZde	fd	d
Z
defddZdd Zdd Zdee	ef fddZdd Zedd Zedd Zedd Zedd Zedd Zedd  Zed!d" Zed#d$ Zed%d& Zd'd( Ze d)d* Z  Z S )+
HIPBackendNFr   c                 C   s
   | j dkS )Nr>   )backendr   r   r   r   supports_targetj   s   
zHIPBackend.supports_targetreturnc                    s&   t  | t|jtsJ d| _d S )Nhsaco)super__init__
isinstancer   rU   
binary_ext)rX   r   	__class__r   r   rv   n   s   
zHIPBackend.__init__c                 C   s   d|j  S )Nhip:r    rX   optionsr   r   r   get_target_names   s   zHIPBackend.get_target_namec                    s  dt jjp| jji} dddkr"t| jjs"td| jj | jjdkr;t	t
j}|dh tt||d< d vrHttt
j|d< | jjd	krbt	t
j}|d
dh tt||d< d vrlt jj|d< | fddt
j D  t
di |S )Nr   r)   r   znum_ctas > 1 not supported on r   tf32r7   r1   r   r/   r0   r2   r8   c                    s*   i | ]}| v r | d ur| | qS Nr   )r^   koptsr   r   
<dictcomp>   s   * z,HIPBackend.parse_options.<locals>.<dictcomp>r   )r	   runtimeoverride_archr   r   getr   supports_multi_cta_launch
ValueErrorsetr#   r7   updaterV   sortedr1   r2   languagedefault_fp_fusion__dataclass_fields__keys)rX   r   argsr7   r2   r   r   r   parse_optionsv   s"   

zHIPBackend.parse_optionsc                 C   s   |j |j|jfS r   )r%   r)   shared)rX   metadatar   r   r   pack_metadata   s   zHIPBackend.pack_metadatac                 C   s   dt | jiS )Nmin_dot_size)r   r   r|   r   r   r   get_codegen_implementation   s   z%HIPBackend.get_codegen_implementationc                 C   s   ddl m} d|iS )Nr   )	libdeviceztriton.language.extra.libdevice)triton.language.extra.hipr   )rX   r   r   r   r   get_module_map   s   zHIPBackend.get_module_mapc                 C   s$   t | tjrtj| d S d S r   )r   load_dialectsrp   instrumentation)rX   ctxr   r   r   r      s   
zHIPBackend.load_dialectsc                 C   sL   dd l }d}t| dr|  |kS t| |jr$t| dr$|   |kS dS )Nr   i	ptr_rangeuntyped_storageF)torchhasattrr   rw   Tensorr   size)argr   
MAX_INT_32r   r   r   is_within_2gb   s   
zHIPBackend.is_within_2gbc                 C   s$   t | }d| v r|ddgg7 }|S )NSztt.pointer_rangerG   )r   
parse_attr)descretr   r   r   r      s   
zHIPBackend.parse_attrc                 K   s0   t j| fi |}tjjrt| r|d7 }|S )Nr   )r   get_tensor_specializationr	   r   use_buffer_opsrp   r   )r   kwargsr   r   r   r   r      s   z$HIPBackend.get_tensor_specializationc                 C   s   t | j}|  tj| tj| tj	| tj
| tj| tj| tj| tj| tj| tj| || d | S )N	make_ttir)r   pass_managercontextenable_debugr   commonadd_inlinerttiradd_rewrite_tensor_pointer(add_rewrite_tensor_descriptor_to_pointeradd_canonicalizeradd_combineadd_reorder_broadcastadd_cseadd_triton_licmadd_symbol_dceadd_loop_unrollrun)modr   r}   pmr   r   r   r      s   zHIPBackend.make_ttirc                 C   s  t | j}|  tj|d|j |j|j	|j
 || d t | j}|  d}tj| tj|| tj| tj| tjj||j|j|j tj| tjj| tjj||j tjj| tj| tj| tj| tj| tjj}t|j|}tjj||j  tjj!||| |rtjj"||j tj| |j#$ dkr|j#%dD ]
}tjj&|| qtj| tj'| t(|jrtjj)| tj| tjj*| |r|j dkrtjj+||j  tjj,r%tjj-| tj| tjj.||jtjj/tjj0 tjj1| tj| tj2| tj3| || d | 4 |d< | S )	Nr{   make_ttgir_earlyFrB   ,r   
make_ttgirtensordesc_meta)5r   r   r   r   r   r   add_convert_to_ttgpuirr   r%   rI   r)   r   ttgpuiradd_coalesceadd_f32_dot_tcadd_remove_layout_conversionsadd_optimize_thread_localityr   add_accelerate_matmulr:   r;   add_optimize_epilogueadd_optimize_dot_operandsadd_hoist_layout_conversionsadd_fuse_nested_loopsr   r   r   r	   r   r   add_schedule_loopsr(   add_pipelineadd_coalesce_async_copyrC   lowersplitinsert_instruction_sched_hintsadd_reduce_data_duplicationr!   add_in_thread_transposeadd_reorder_instructionsadd_block_pingpongr   add_canonicalize_pointersadd_convert_to_buffer_opsuse_buffer_atomics%buffer_ops_analyze_small_tensor_rangeadd_fold_true_cmpir   r   get_tensordesc_metadata)r   r   r}   r   emuTF32r   r   hintr   r   r   r      sp   

zHIPBackend.make_ttgirc                 C   s|   | }t |j}|  tj| tj| tj	| tj
| tj| tj| ||d | |d< |S )Ngluon_to_ttgirr   )r   r   r   r   r   gluonr   add_resolve_auto_encodingsr   add_sccpr   add_loop_aware_cser   r    add_combine_tensor_select_and_ifr   r   )srcr   r}   r   r   r   r   r   r     s   zHIPBackend.gluon_to_ttgirc                    s  | }t |j}|  tjj||j d}tjj	||j| tj
| tj| tj
| tjj| tjrFtjd||j d}tjj||j| tj| tj| tj
| tj
| tj| tj| tj| |j dkrtjj||j|j tjrtjd||j tjj stjj!stj"#| tjj$|| |%|d tjj!rtjj st |j}|  tj"#| |%|d t |j}|  tj"&| |%|d t'(  t' }t')|| t*  d	}tjj+rd
}t', tj-|j| t. |j t/ d t0 dd t0 dd t0 dd t0 d|j1dk dd  2 D }	|	d 3tj4 |	d 5dd|j6|j1   d|j7dv rm|	d 5dd |	d 5dd |	d 5d|j8 d|j8  |j9rdnd}
|	d 5d |
 tjj+r|	d :d
 |	d ;  t<|	d  tjj+rt=t>j?d! }t@|d" t@|d# t@|d$ g}t'A | n|jBr fd%d|jBD }tC|dkrt'A | t'D t'jE|jd	g |jF tG|jr|	d Hd& |	d Hd' |	d Hd( tjjIr!tJ|	d  | Kd)|d*< | Kd+p/d|d,< | Kd-p9d.|d/< tL  tM  t@ S )0Nr   ttgpuir_to_llvmirTrB   llvmir_to_llvm	make_llirzmake_llir.disable_line_infoz,make_llir.dump_ir_extract_di_local_variablesr@   +xnacki  __oclc_finite_only_optF__oclc_correctly_rounded_sqrt32__oclc_unsafe_math_opt__oclc_wavefrontsize64rH   c                 S   s   g | ]}|  s|qS r   )is_declaration)r^   fnr   r   r   ra   x  s    z(HIPBackend.make_llir.<locals>.<listcomp>zamdgpu-flat-work-group-sizez1,zmemory-bound-attentionr   zamdgpu-sched-strategyziterative-ilpzuniform-work-group-sizetruezamdgpu-waves-per-euz, zpreserve-signr3   zdenormal-fp-math-f32rJ   z
asanrtl.bczocml.bczockl.bcc                    s    g | ]\}}t  |r|qS r   )r   need_extern_lib)r^   r_   pathllvm_modr   r   ra     s     zamdgpu-no-workgroup-id-xzamdgpu-no-workgroup-id-yzamdgpu-no-workgroup-id-zz
ttg.sharedr   zttg.profile_scratch_memory_sizeprofile_scratch_sizez$ttg.profile_scratch_memory_alignmentr   profile_scratch_align)Nr   r   r   r   r   r   r   add_update_async_wait_countr   add_optimize_lds_usageconvertadd_scf_to_cfr   r   add_index_to_llvmiradd_allocate_shared_memoryrp   r   patchadd_to_llvmirr   r   r   add_cf_to_llvmiradd_arith_to_llvmirr   rC   r   lower_instruction_sched_hintsr(   r	   compilationdisable_line_info"dump_ir_extract_di_local_variablesllvmiradd_di_scopeadd_builtin_func_to_llvmirr   add_di_local_variabler   init_targets	to_moduleattach_target_tripleenable_asanattach_datalayoutTARGET_TRIPLEset_isa_versionset_abi_versionset_bool_control_constantrI   get_functionsset_calling_convCALLING_CONV_AMDGPU_KERNELadd_fn_attrr%   r   r&   r<   add_fn_target_featureadd_fn_asan_attrset_all_fn_arg_inregr   rR   rS   rU   link_extern_libsr*   lenoptimize_moduleOPTIMIZE_O3r8   has_architected_sgprsremove_fn_attrscalarize_packed_fops#add_scalarize_packed_fops_llvm_passget_int_attrcleanup_bitcode_metadatadisable_print_inline)r   r   r}   r   r   custom_lds_size_HIPBackend__HIP_FTZr   target_featuresfnsdenormal_moderZ   pathsr   r   r   r     s   










zHIPBackend.make_llirc           
   	   C   s   t d| }t|dksJ |d |d< g }d|jv rdnd}t| d }|d d	 | }t	| t
j|j|||j|}t| t
j|j|||j| t| t
j|j|||jd
}	tj
jrgtd t|	 |	S )Nz3define amdgpu_kernel void @([a-zA-Z_][a-zA-Z0-9_]*)r   r   r_   gfx11z-real-true16r@   rb   r\   Fz!// -----// AMDGCN Dump //----- //)refindallr  r   re   rf   rg   rh   r   translate_to_mirr   r  r8   dump_sched_dagtranslate_to_asmr	   dump_amdgcnprint)
r   r   r}   namesflagsfeaturesir_hashdump_file_idr\   amdgcnr   r   r   make_amdgcn  s(   zHIPBackend.make_amdgcnc           
   
   C   s   d}t jjrd}t| |j|}t b}t +}t|j	d}|
| W d    n1 s0w   Y  t|j	|j	 W d    n1 sGw   Y  t|j	d}| }	W d    n1 saw   Y  W d    |	S W d    |	S 1 syw   Y  |	S )Nr@   r   wbrb)r	   r  r  r   assemble_amdgcnr   tempfileNamedTemporaryFileopenr_   write
link_hsacoread)
r   r   r}   r)  rt   tmp_outtmp_infd_infd_outr   r   r   r   
make_hsaco  s*   




zHIPBackend.make_hsacoc                    s   |t jkr fdd|d<  fdd|d< n|t jkr& fdd|d<  fdd|d<  fd	d|d
<  fdd|d< tjjd urStj| |d  d S d S )Nc                        | | S r   )r   r   r   r}   rX   r   r   r         z'HIPBackend.add_stages.<locals>.<lambda>r   c                    rJ  r   )r   rK  rL  r   r   r     rM  ttgirc                    rJ  r   )r   rK  rL  r   r   r     rM  c                    rJ  r   )r   rK  rL  r   r   r     rM  llirc                    rJ  r   )r;  rK  rL  r   r   r     rM  r:  c                    rJ  r   )rI  rK  rL  r   r   r     rM  rt   )r   TRITONGLUONr	   r   add_stages_inspection_hook)rX   stagesr}   r   r   rL  r   
add_stages  s   

zHIPBackend.add_stagesc                 C   s   | j  S r   r   )rX   r   r   r   rj     s   zHIPBackend.hash)!rk   rl   rm   r   %supports_native_tensor_specializationstaticmethodr   rr   rv   rU   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r;  rI  rT  	functools	lru_cacherj   __classcell__r   r   ry   r   rp   f   sD    





>

 

rp   )triton.backends.compilerr   r   r   triton._C.libtritonr   r   r   r   tritonr	   dataclassesr
   typingr   r   r   typesr   re   r?  r.  rW  rP   pathlibr   r   r   r!   r#   rp   r   r   r   r   <module>   s$    G