o
    ۗi7                    @  s4  d dl mZ d dlZd dlmZmZmZmZmZ d dl	Z	ddl
mZ ddlmZ ddlmZ ed	ZG d
d deZd6ddZd6ddZd7ddZd8dd Zd9d:d#d$Zd;d)d*Z	!	+d<d=d0d1Zd>d4d5Zd?d9d:Zd?d;d<Zd?d=d>Zd@d?d@Zd@dAdBZdAdDdEZ d@dFdGZ!dBdLdMZ"dBdNdOZ#dCdRdSZ$dDdTdUZ%dEdVdWZ&dEdXdYZ'dEdZd[Z(dEd\d]Z)dEd^d_Z*dFd`daZ+dEdbdcZ,dEdddeZ-dEdfdgZ.dGdhdiZ/dHdjdkZ0dIdldmZ1dJdpdqZ2dEdrdsZ3dEdtduZ4dEdvdwZ5dEdxdyZ6dEdzd{Z7dEd|d}Z8dKddZ9dLddZ:dMddZ;dNddZ<dOddZ=dPddZ>dQddZ?dRddZ@dSddZAdTddZBdUddZCdVddZDdWddZE	dXdYddZFdd ZGdd ZHdd ZIdd ZJdd ZKdd ZLdd ZMdd ZNdd ZOdZddǄZPd[ddʄZQd\dd̄ZRd]dd؄ZSd^ddڄZTdd܄ ZUddބ ZVd_ddZWd`ddZXdaddZYdbddZZdbddZ[dbddZ\dbddZ]dbddZ^dbddZ_dbddZ`dd Zadcdd ZbddddZcdeddZddfd
dZedd ZfdgddZgdhddZhdiddZidjddZjdjddZkdjddZldkd d!Zmdld%d&Zndmd)d*Zodkd+d,Zpd-d. Zqd9d/d0Zrdnd2d3Zsdnd4d5ZtdS (o      )annotationsN)ListOptionalSequenceTupleTypeVar   )ir   )core)mathTc                      s   e Zd Z fddZ  ZS )IncompatibleTypeErrorImplc                   s@   || _ || _d| j   d | j  | _tt| | j d S )Nzinvalid operands of type  and )type_atype_b__repr__messagesuperr   __init__)selfr   r   	__class__ V/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/triton/language/semantic.pyr      s   z"IncompatibleTypeErrorImpl.__init__)__name__
__module____qualname__r   __classcell__r   r   r   r   r      s    r   axisintbuilder
ir.builderreturn	tl.tensorc                 C  *   | dvrt d|  t|| tjS )Nr   r
   r   z+program_id axis must be 0, 1, or 2 but got )
ValueErrortltensorcreate_get_program_idint32r   r!   r   r   r   
program_id      r-   c                 C  r%   )Nr&   z-num_programs axis must be 0, 1, or 2 but got )r'   r(   r)   create_get_num_programsr+   r,   r   r   r   num_programs"   r.   r0   a_tytl.dtypeb_tyc                 C  s   | j }|j }| j}|j}||kr||kr| S |S |tjjjkr'||kr%| S |S |tjjjkr6||kr4|S | S td| d| )Nzunexpected signedness r   )int_bitwidthint_signednessr(   dtype
SIGNEDNESSUNSIGNED	TypeError)r1   r3   a_rankb_ranka_snb_snr   r   r   integer_promote_impl-   s   r>   a_is_scalarboolb_is_scalar
div_or_modc                 C  sT  ||kr)|r
| |fn|| f\}}|  j|  jkr)|r'|tjtjfv r'tjS |S |  s1| r4tjS |  s<| r?tjS | 	 sG|	 rO|rLtjS tjS | 
 sW|
 rj|r\tjS | 
 rg|
 rgtjS tjS |  r{| r{| |krx| S tjS |  r| std|  d| |r| j|jkrtd|   d |  d t| |S )Nunexpected type r   zCannot use /, #, or % with x because they have different signedness;this is unlikely to result in a useful answer. Cast them to the same signedness.)kindvaluer(   float16bfloat16float32is_fp64float64is_fp32is_fp16is_bf16is_fp8is_intr9   r5   r   r>   )r1   r?   r3   rA   rB   	scalar_ty	tensor_tyr   r   r   computation_type_impl=   s:   
rS   T
check_typec                 C  s  t | trt|| tjS t | trdd|   krdk r$n ntj}n8d|   kr.dk r4n ntj}n(d|   kr>dk rDn ntj	}nd|   krNdk rTn ntj
}ntd|  dtd	| ||d
S t | trd}ddd  }td | }|tdks|dks| | ks||  kr|krn ntj}ntj}td	| ||d
S t | tjrt| j|S t | tjr| S |rtd|  dt|  d| S )N           l                             l            zNonrepresentable integer .r   r6   r!   g      8g   ?r      absinf        zcannot convert z	 of type z
 to tensor)
isinstancer@   r(   r)   get_int1int1r    r+   uint32int64uint64r'   fullfloat__builtins__rI   rK   	constexpr	to_tensorrF   r9   type)xr!   rT   r6   min_float32max_float32abs_xr   r   r   ri   o   s>   


ri   r   r   allow_ptr_aNonec                 C  sJ   |   r!|st| ||  r| |krt| || r#t| |d S d S N)is_ptrr   is_floating)r   r   ro   r   r   r   check_ptr_type_impl   s   


rt   Flhstl.tensor | numbers.NumberrhsTuple[tl.tensor, tl.tensor]c                 C  s  t | tj}t |tj}|r| }	t| |} |r|}
t||}| jj}|jj}t||| t||| |r|| s|| s|t|||||}|rN|	dk rN|	 sX|r\|
dk r\|	 r\t
d|rftd|	||dnt| ||} |rvtd|
||dnt|||}t| ||\} }| |fS )Nr   z{Cannot perform a binary operation between an unsigned tensor and a negative scalar. Perform a explicit cast on one of them.r   rZ   )r_   numbersNumberri   rj   scalarrt   rr   rS   is_int_unsignedr'   re   castbroadcast_impl_value)ru   rw   r!   allow_lhs_ptrallow_rhs_ptrarithmetic_checkrB   lhs_is_scalarrhs_is_scalar
lhs_scalar
rhs_scalar
lhs_sca_ty
rhs_sca_ty
ret_sca_tyr   r   r   binary_op_type_checking_impl   sB   



r   	binary_opcallablec                 C  s   | j jjdks|jjsd S | j j}|j j}||ksJ | s!J t| tj|} t|tj|}|| |d|}|	 }t
||tj}| }t
||tj}tt|||t||||}	d|j d|j }
t|	|
| d S )N@   Fr    z! overflow detected for operation )rj   r{   r4   optionssanitize_overflowrP   r}   r(   rc   get_int_max_valuer)   	get_int64get_int_min_valueand_
less_equalgreater_equalr   device_assert)ru   rw   r!   r   r   r   ret	max_value	min_valuecondmsgr   r   r    binary_op_sanitize_overflow_impl   s    r   inputotherr   c                 C  s   t | ||dd\} }| jj}|jj}| r| rtd| r3| s3|| } }| jj}|jj}| rDt|| j|j| jS |	 rUt|
| j|j| jS | ro|rbt| ||t t|| j|j| jS td| )NTzcannot add pointers togetherrC   )r   rj   r{   rr   r9   r(   r)   create_addptrhandlers   create_faddrP   r   add
create_add)r   r   r   r!   input_scalar_tyother_scalar_tyr   r   r   r      s$   
r   c                 C  s   t | ||dd\} }| jj}| r"t|| jt||j| jS |	 r3t|
| j|j| jS | rM|r@t| ||t t|| j|j| jS td| )NTFrC   )r   rj   r{   rr   r(   r)   r   r   minusrs   create_fsubrP   r   sub
create_subr9   r   r   r   r!   rQ   r   r   r   r      s    r   c                 C  s|   t | ||\} }| jj}| rt|| j|j| jS | r7|r*t	| ||t
 t|| j|j| jS td| NrC   )r   rj   r{   rs   r(   r)   create_fmulr   rP   r   mul
create_mulr9   r   r   r   r   r     s   r   c                 C  s   t | ||dddd\} }| jj}|jj}| r#| r#t|||}nI| r2| r2t| ||} n:| rI| rIt| tj|} t|tj|}n#| re| re|j|jkr^t|||}nt| ||} nt	d| t
|| j|j| jS NFTrC   )r   rj   r{   rs   rP   r}   r(   rI   fp_mantissa_widthr9   r)   create_fdivr   )r   r   r!   r   r   r   r   r   truediv  s    r   c                 C  s   t | ||dddd\} }| jj}|jj}| rK| rKt||}t| ||} t|||}| r>t|	| j
|j
| jS t|| j
|j
| jS td| r   )r   rj   r{   rP   r>   r}   is_int_signedr(   r)   create_sdivr   create_udivr9   )r   r   r!   r   r   ret_tyr   r   r   floordiv0  s   
r   ieee_roundingc                 C  s^   | j j}|j j}| r| stdt| ||dddd\} }|| j|j}t|| j S )Nz4both operands of fdiv must have floating scalar typeFT)	rj   r{   rs   r9   r   r   r   r(   r)   )r   r   r   r!   r   r   r   r   r   r   fdiv?  s   r   c                 C  s   t | ||dddd\} }| jj}|jj}| r2tjt| |d||d}t| t||d|d|}|S |	 rj|j
|j
krLtd|  d |  d | r]t|| j|j| jS t|| j|j| jS td| )NFT_builderzCannot mod z by rD   rC   )r   rj   r{   rs   r   floorr   r   r   rP   r5   r9   r   r   r(   r)   create_sremr   create_urem)r   r   r!   rQ   r   r   r   r   r   r   modJ  s    r   rk   ypropagate_nantl.PropagateNanc                 C     t | ||\} }| j}| r<|tjjkr"t|| j|j| j	S |tjj
kr5t|| j|j| j	S td| | rMt|| j|j| j	S | r^t|| j|j| j	S td| NzUnexpected propagate_nan Unexpected dtype )r   r6   rs   r(   PropagateNanALLr)   create_minimumfr   rj   NONEcreate_minnumfr'   r   create_minsir|   create_minuir9   rk   r   r   r!   r6   r   r   r   minimumf     r   c                 C  r   r   )r   r6   rs   r(   r   r   r)   create_maximumfr   rj   r   create_maxnumfr'   r   create_maxsir|   create_maxuir9   r   r   r   r   maximumx  r   r   minmaxc                 C  sn   t |||\}}t | ||\} }t | ||\} }| j}| r/t|| j|j|j|| jS td| d)Nr   z(. Only floating point clamp is supported)	r   r6   rs   r(   r)   create_clampfr   rj   r9   )rk   r   r   r   r!   r6   r   r   r   clamp  s    r   c                 C  st   t | ||\} }| jj}|jj}| r| st||t||}||kr,t| ||} ||kr6t|||}| |fS rq   )r   rj   r{   rP   r   r>   r}   )r   r   r!   input_sca_tyother_sca_tyr   r   r   r   bitwise_op_type_checking_impl  s   

r   c                 C  *   t | ||\} }t|| j|j| jS rq   )r   r(   r)   
create_andr   rj   r   r   r!   r   r   r   r        r   c                 C  r   rq   )r   r(   r)   	create_orr   rj   r   r   r   r   or_  r   r   c                 C  r   rq   )r   r(   r)   
create_xorr   rj   r   r   r   r   xor_  r   r   c                 C  D   | j  st| td|} |j  st|td|}t| ||S Nra   )rj   is_int1bitcastr(   r6   r   r   r   r   r   logical_and  
   

r   c                 C  r   r   )rj   r   r   r(   r6   r   r   r   r   r   
logical_or  r   r   c                 C  s&   | j  st| td|} t| |S r   )rj   r   r   r(   r6   invert)r   r!   r   r   r   not_  s   

r   c                 C  r   rq   )r   r(   r)   create_lshrr   rj   r   r   r   r   lshr  r   r   c                 C  r   rq   )r   r(   r)   create_ashrr   rj   r   r   r   r   ashr  r   r   c                 C  r   rq   )r   r(   r)   
create_shlr   rj   r   r   r   r   shl  r   r   c                 C  s   | S rq   r   r   r   r   r   plus  s   r   c                 C  sJ   | j j}| rtd|  d t||||}t	|| d|S )Nz$wrong type argument to unary minus ()T)
rj   r{   rr   r'   r   r(   r)   get_null_valueto_irr   )r   r!   r   _0r   r   r   r     s
   r   c                 C  sP   | j j}| s| rtd|  d t||	||}t
| ||S )Nz%wrong type argument to unary invert (r   )rj   r{   rr   rs   r'   r   r(   r)   get_all_ones_valuer   r   )r   r!   r   _1r   r   r   r     s
   r   vtl.block_typec                 C  s&   | j  stjS | j j}ttj|S rq   )rj   is_blockr(   ra   shape
block_type)r   r   r   r   r   
_bool_like  s   
r   c                 C     t | ||\} }| jj}| rt|| j|jt| S |	 rB|
 r4t|| j|jt| S t|| j|jt| S td| r   )r   rj   r{   rs   r(   r)   create_fcmpOGTr   r   rP   r   create_icmpSGTcreate_icmpUGTr9   r   r   r!   rQ   r   r   r   greater_than     r  c                 C  r   r   )r   rj   r{   rs   r(   r)   create_fcmpOGEr   r   rP   r   create_icmpSGEcreate_icmpUGEr9   r  r   r   r   r     r  r   c                 C  r   r   )r   rj   r{   rs   r(   r)   create_fcmpOLTr   r   rP   r   create_icmpSLTcreate_icmpULTr9   r  r   r   r   	less_than  r  r  c                 C  r   r   )r   rj   r{   rs   r(   r)   create_fcmpOLEr   r   rP   r   create_icmpSLEcreate_icmpULEr9   r  r   r   r   r   .  r  r   c                 C  n   t | ||\} }| jj}| rt|| j|jt| S |	 r0t|
| j|jt| S td| r   )r   rj   r{   rs   r(   r)   create_fcmpOEQr   r   rP   create_icmpEQr9   r  r   r   r   equal=     r  c                 C  r  r   )r   rj   r{   rs   r(   r)   create_fcmpUNEr   r   rP   create_icmpNEr9   r  r   r   r   	not_equalI  r  r  startendc                 C  s   t | tr
t |tstdt| d? }t|d? }|s|r"td|| kr*td||  }||d @ dkr:td|g}ttj|}t|| ||S )Nz/arange's arguments must be of type tl.constexpr    zarange must fit in int32z=arange's end argument must be greater than the start argumentr
   r   z#arange's range must be a power of 2)	r_   r    r'   r@   r(   r   r+   r)   create_make_range)r  r  r!   is_start_int64is_end_int64ranger   r   r   r   r   arangeZ  s   r   r   	List[int]r6   c                 C  s   t |tjr|jjdksJ dt|||}n(|d u rtd|dkr,|||}nt	|d|j
 }||}t||}t|| |S )Nr
   zonly accepts size-1 tensorz2dtype must be specified when value is not a tensorr   get_)r_   r(   r)   numelrF   r}   r'   r   r   getattrnamesplat)r   rF   r6   r!   get_value_fnr   r   r   re   k  s   re   rF   c                 C  sF   | j  r	J dt|dkr| S t| j|}t|| j||S )NzCannot splat a block tensorr   )	rj   r   lenr(   r   r6   r)   create_splatr   )rF   r   r!   r   r   r   r   r&    s
   r&  	dst_shapecan_reorderc                 C  sR   d}|D ]}||9 }q| j j|krtdt| j j|}t|| j|||S )Nr
   z:reshape() cannot change total number of elements in tensor)	rj   r#  r'   r(   r   r{   r)   create_reshaper   )r   r*  r+  r!   r#  sr   r   r   r   reshape  s   
r.  c                 C  sZ   dd | j D }||d | j st| ||dS t| jj|}t|	| j
||S )Nc                 S  s   g | ]}t |qS r   r(   _constexpr_to_value.0rk   r   r   r   
<listcomp>      zexpand_dims.<locals>.<listcomp>r
   )r   r!   )r   insertrj   r   r&  r(   r   r{   r)   create_expand_dimsr   )r   r   r!   r*  r   r   r   r   expand_dims  s   
r7  c                 C  sX   |sJ dt | jdksJ t| jj| jd |jd  g}t|| j|j|S )Nz;current implementation of `cat` always may reorder elementsr
   r   )	r(  r   r(   r   rj   r{   r)   
create_catr   )ru   rw   r+  r!   ret_typer   r   r   cat  s   "r:  abc                 C  s   t | ||\} }| jg k}|rt| d|} t|d|}t| jd tjr*td}nd}| j|g }t| jj|}t	|
| j|j|}|rQt|dgd|d}|S )Nr   r   Fr+  r!   )r~   r   r7  r_   r(   rh   r   rj   r{   r)   create_joinr   r.  )r;  r<  r!   
was_rank_1two	new_shaper9  r   r   r   r   join  s   
rC  c                 C  sp   t | jdks	J t| jd dksJ | jd d }t| jj|}|| j\}}t	||t	||fS )Nr   r=  r   )
r(  r   r(   r0  r   rj   r{   create_splitr   r)   )r;  r!   rB  r9  outLHSoutRHSr   r   r   split  s   

rG  dims
Tuple[int]c                   s~   t  jt |krtdtdd |D ttt |kr%td| t jj	 fdd|D }t
| j||S )Nz5permute dims must have the same length as input shapec                 s  s    | ]}t |V  qd S rq   r/  r2  dr   r   r   	<genexpr>  s    zpermute.<locals>.<genexpr>z?permute dims must be a permutation of 0, 1, ..., n-1, but were c                   s   g | ]} j | qS r   r   rJ  r   r   r   r3    r4  zpermute.<locals>.<listcomp>)r(  r   r'   sortedlistr  r(   r   rj   r{   r)   create_transr   )r   rH  r!   r9  r   r   r   permute  s   "rQ  c                 C  s   | j  st| j |}t|| j||S | j  }t|t|kr.t	d| d| ||kr4| S t
|D ]#\}}|| |kr[|dkr[t	d||  d| d| d| d| 
q8t| j j|}t|| j||S )Nz!Cannot broadcast, rank mismatch: z, r
   z3Cannot broadcast, the expanded size of the tensor (z ) must match the existing size (z) at non-singleton dimension : )rj   r   r(   r   r)   r)  r   get_block_shapesr(  r'   	enumerater{   create_broadcast)r   r   r!   r   	src_shapeiitemr   r   r   broadcast_impl_shape  s,   

rY  c              	   C  sZ  | j }|j }| r'| s't|j|j}t||j|	 |}| |fS | sH| rHt|j|j}t|| j|	 |} | |fS | r)| r)|	 }|	 }t
|t
|k rtt
|t
|D ]}t|| jdt|jdg| } | j }|	 }qkn.t
|t
|k rtt
|t
|D ]}t||jdt|jdg| }|j }|	 }qt
|t
|ksJ g }t|D ]3\}	}
||	 }|
dkr|| q|dks||
kr||
 qtdt|	 d t|
 d t| ||krt|j|}t|| j||} ||kr)t|j|}t||j||}| |fS )Nr   r
   z?Cannot make_shape_compatible: incompatible dimensions at index rR  r   )rj   r   r(   r   r{   r   r)   r)  r   rS  r(  r  r6  rT  appendr'   strrU  )ru   rw   r!   lhs_tyrhs_ty	lhs_shape	rhs_shape_	ret_shaperW  leftrightr   r   r   r   r~     sl   +'



r~   rounding_modeOptional[str]c                 C  s<   | d u rd S | dkrt jjS | dkrt jjS td|  d)NrtnertzzInvalid rounding mode: z0. Supported rounding modes are 'rtne' and 'rtz'.)r	   ROUNDING_MODERTNERTZr'   )rd  r   r   r   _str_to_rounding_mode"  s   rk  dst_tyc                 C  s   | j }| rt|j| j  }||kr| S |j}|j}| s%| r+t| ||S |j}|j}||krCt	dt
| d t
| t|| j|||S )Nz!Cannot bitcast data-type of size z to data-type of size )rj   r   r(   r   r{   rS  rr   r}   primitive_bitwidthr'   r[  r)   create_bitcastr   r   )r   rl  r!   src_ty
src_sca_ty
dst_sca_tysrc_bitsdst_bitsr   r   r   r   ,  s    r   fp_downcast_roundingc                 C  sd  | j }t|tjr|j}t|tjr|j}| r#t|j| j  }||kr)| S |j}|j}t	|}d}|
 rU|
 rU|j|jk rU|d u rLtjj}n|tjjkrTd}n|d urgtdt| d t| | so| r|jdd us{J d|jd | |||dS | r|
 s|
 r| s|rt|| j||||S | r| r| r| stt| tj|||S |
 o|
 o|j|jk}|rt|| j|||S |
 o|
 o|j|jk }	|	rt|| j|||S |  rJ|  rJ|j!|j!ks|j"|j"krJ|# o|$  }
|$ r;| j%|}t|&|| j%}t'| ||S t|(| j|||
|S |) r|  r|$ ro| j%|}t|&|| j%}t'| ||S |# rt|*| j|||S t|+| j|||S |  r|) r|$ s|# st|,| j|||S t|-| j|||S |. r|  r|j!}|dkrt|/| j|||S |d	krt't| tj0|t|1d
tj0|S |  r|. rt|2| j|||S |. r(|. r(t|3| j|||S J d|  d| )NFTz]fp_downcast_rounding should be set only for truncating fp conversions. Source scalar type is z and destination type is convert_custom_typesz0target doesn't provide conversion for this type.r   r   r
   r   zcannot cast z to )4rj   r_   r(   rh   rF   r   r   r{   rS  rk  rs   rm  r	   rh  ri  r'   r[  is_fp8e4b15codegen_fnsgetrO   r)   create_fp_to_fpr   r   rM   rL   rN   r}   rI   create_fp_trunccreate_fp_extrP   r4   r5   r   is_boolr6   r   r  create_int_castis_standard_floatingcreate_fp_to_sicreate_fp_to_uicreate_ui_to_fpcreate_si_to_fprr   create_ptr_to_intrc   r   create_int_to_ptrrn  )r   rl  r!   rt  ro  rp  rq  use_custom_roundingtruncate_fpext_fpsign_extendtyr   bitwidthr   r   r   r}   ?  s   






&r}   c                 C  s\   t jj}| r,| dkrt jj}|S | dkrt jj}|S | dkr$t jj}|S td|  d|S )Nz.ca.cgz.cvCache modifier  not supported)r	   CACHE_MODIFIERr   CACGCVr'   cache_modifiercacher   r   r   _str_to_load_cache_modifier     r  c                 C  sp   t jj}| r6| dkrt jj}|S | dkrt jj}|S | dkr$t jj}|S | dkr.t jj}|S td|  d|S )Nz.wbr  z.csz.wtr  r  )r	   r  r   WBr  CSWTr'   r  r   r   r   _str_to_store_cache_modifier      	r  c                 C  sH   t jj}| r"| dkrt jj}|S | dkrt jj}|S td|  d|S )N
evict_lastevict_firstzEviction policy r  )r	   EVICTION_POLICYNORMAL
EVICT_LASTEVICT_FIRSTr'   )eviction_policyevictionr   r   r   _str_to_eviction_policy  s   r  c                 C  sD   d }| r | dkrt jj}|S | dkrt jj}|S td|  d|S )NzeronanzPadding option r  )r	   PADDING_OPTIONPAD_ZEROPAD_NANr'   )padding_optionpaddingr   r   r   _str_to_padding_option  s   r  c                 C  sp   t jj}| r6| dkrt jj}|S | dkrt jj}|S | dkr$t jj}|S | dkr.t jj}|S td|  d|S )Nacquirereleaseacq_relrelaxedMemory semantic r  )r	   MEM_SEMANTICACQUIRE_RELEASEACQUIRERELEASERELAXEDr'   )
sem_optionsemr   r   r   _str_to_sem  r  r  c                 C  s\   t jj}| r,| dkrt jj}|S | dkrt jj}|S | dkr$t jj}|S td|  d|S )Ngpuctasysr  r  )r	   MEM_SYNC_SCOPEGPUCTASYSTEMr'   )scope_optionscoper   r   r   _str_to_scope  r  r  c                 C  s   | rEt | ds
| g} dd | D } | D ]}t|tr(d|  kr't|k s*J  J qt| dks3J t| tt| ksAJ dt| S dS )N__iter__c                 S  "   g | ]}t |tjr|jn|qS r   r_   r(   rh   rF   r2  elemr   r   r   r3       " z0_canonicalize_boundary_check.<locals>.<listcomp>r   z'Duplicate dimension in `boundary_check`r   )hasattrr_   r    r(  setrN  )boundary_checkblock_shapedimr   r   r   _canonicalize_boundary_check  s   
,r  c	              
   C  s   |d us|d urt d| jjj}	|	tjksJ d|	 r(|tjjkr(t d| jj}
t	||

 }t|| j||||||
S )NK`mask` and `other` arguments cannot be specified for loading block pointers3`tl.int1` should be rewrited in `tl.make_block_ptr`z@Padding option `nan` is not supported for integer block pointers)r'   rj   
element_tyr(   ra   rP   r	   r  r  r  rS  r)   create_tensor_pointer_loadr   )ptrmaskr   r  r  r  r  is_volatiler!   elt_tyrl  r   r   r   _load_block_pointer  s   
r  c	              
   C  s  | j j std| j   d|d u r|d urtd|s!|r%td| j  s@|r5|j  r5td|r@|j  r@td| j  r_|d urRt|| j  |}|d ur_t|| j  |}| j j}	|	j}
|
t	j
k}|r}t	j}
t	|
|	j}	t| |	|} |d urt||
|}| j  r| j  }t	|
|}n|
}|d u rt	|| j||||}nt	|| j|j|r|jnd ||||}|rt|t	j
|}|S )NUnsupported ptr type z in `tl.load`z)`other` cannot be provided without `mask`z`padding_option` or `boundary_check` argument is not supported for loading a tensor ofpointers or loading a scalar. Because the compiler does not know the boundary; please use block pointers (defined by `make_block_ptr`) insteadEMask argument cannot be block type if pointer argument is not a blockzFOther argument cannot be block type if pointer argument is not a block)rj   r{   rr   r'   r   r   rY  rS  r  r(   ra   int8pointer_typeaddress_spacer}   r   r)   create_loadr   create_masked_load)r  r  r   r  r  r  r  r  r!   ptr_tyr  r|  r   rl  r   r   r   r   _load_legacy*  sP   




r  r  r  Optional[tl.tensor]r  r   r  r[  r  r  r  c	              
   C  s^   t |}	t|}
t|}| j r#| jj r#t| |||||	|
||	S t| |||||	|
||	S rq   )	r  r  r  rj   rr   r  r   r  r  )r  r  r   r  r  r  r  r  r!   r  r  r  r   r   r   loadh  s   r  desc_ptrc                 C  s<   t ||dd}|| j|||t|t|}t||S NFrequire_i64)_convert_to_ir_valuescreate_descriptor_loadr   r   r  r  r(   r)   )r  offsetsr  r  rj   r!   rk   r   r   r   descriptor_loadx  s   r  c                 C  s*   t ||dd}t|| j|j|tjS r  )r  r(   r)   create_descriptor_storer   void)r  rF   r  r!   r   r   r   descriptor_store  s   r  global_addressbox_dimList[tl.tensor]
global_dimglobal_strideelement_stride	elem_typeinterleave_layoutswizzle_mode	fill_modec                 C  sj   |r|d j tjksJ t|
| j|jdd |D dd |D dd |D dd |D ||||	
tjS )Nr   c                 S     g | ]}|j qS r   r   r1  r   r   r   r3        z$tensormap_create.<locals>.<listcomp>c                 S  r  r   r  r1  r   r   r   r3    r  c                 S  r  r   r  r1  r   r   r   r3    r  c                 S  r  r   r  r1  r   r   r   r3    r  )r6   r(   rc   r)   create_tensormap_creater   r  )r  r  r  r  r  r  r  r  r  r  r!   r   r   r   tensormap_create  s    r  c                 C     t || jt jS rq   )r(   r)   #create_tensormap_fenceproxy_acquirer   r  )r  r!   r   r   r   tensormap_fenceproxy_acquire     r   c           	   	   C  s   |d urt d| jj }|j st|||}|j s"J d||j ks7J d| d|j  d| jjj|jjksPJ d| jjj d|jj d| jjj}|tjks^J dt||}t	|||}t
|| j|j|||tjS )	Nr  z-Value argument must be block type or a scalarzBlock shape(z) and value shape(z
) mismatchzBlock element type(z) and value element type(r  )r'   rj   r  rS  r   rY  r(   ra   r  r}   r)   create_tensor_pointer_storer   r  )	r  valr  r  r  r  r!   r  r  r   r   r   _store_block_pointer  s"   
2

r  c           	   	   C  s2  | j j std| j   d|rtd| j  s0|j  r%td|r0|j  r0td| j  rKt|| j  |}|d urKt|| j  |}| j j}|j}|t	j
krgt	j}t	||j}t| ||} t|||}|s~t	|| j|j||t	jS |j j stdt	|| j|j|j||t	jS )Nr  z in `tl.store`z`boundary_check` argument is not supported for storing a tensor of pointers or storing a scalar. Because the compiler does not know the boundary; please use block pointers (defined by `make_block_ptr`) insteadzFValue argument cannot be block type if pointer argument is not a blockr  z"Mask must have boolean scalar type)rj   r{   rr   r'   r   r   rY  rS  r  r(   ra   r  r  r  r}   r)   create_storer   r  r|  create_masked_store)	r  r  r  r  r  r  r!   r  r  r   r   r   _store_legacy  s2   



"r  r  c           	      C  sl   t |}t|}| j s| jj rtd| j r,| jj r,t	| ||||||S t
| ||||||S )N"Cannot store to a constant pointer)r  r  rj   is_constr{   r'   rr   r  r   r  r  )	r  r  r  r  r  r  r!   r  r  r   r   r   store  s   r
  cmpr  r  c              	   C  sN   t |}t|}| jjj}|jdvrtdt|	| j
|j
|j
|||jS )N)   r  r   z9atomic_cas only supports elements with width {16, 32, 64})r  r  rj   r{   r  rm  r'   r(   r)   create_atomic_casr   )r  r  r  r  r  r!   r  r   r   r   
atomic_cas  s   

"r  op&Tuple[tl.tensor, tl.tensor, tl.tensor]c                 C  sB  | j j std| j   | j  s| j j rtd| j jj}|tju r4|dkr4td| d |tj	tj
tjtjfv rLtd| d t| | j  rk|d ur^t|| j  |}|d urkt|| j  |}t|| j jj|}|s|d}tj	}| j  r||| j  }ttj	| j  }t||}| ||fS )Nz)Pointer argument of store instruction is r  r   atomic_z does not support fp16z does not support T)rj   r{   rr   r'   r   r	  r  r(   rG   ra   r  int16rH   r[  r   rY  rS  r}   r`   r)  r   r)   )r  r  r  r  r!   r  mask_irmask_tyr   r   r   atom_red_typechecking_impl  s.   




r  c                 C  s  t | ||d|\} }}t|}t|}|jj}| rG| r3t|	t
jj| j|j|j|||jS t|	t
jj| j|j|j|||jS |tjtjhvrVtd| tg d||}|tjkretjntj}t|||}	t| t|d|}
|tjkrtjntj}t|||}t| t|d|}t|||}t|||}t|	t
jj|
j|	jt|||j|||	j}t|	t
jj|j|jt|||j|||j}t||||}t|||S )Nr   z#atomic_max not supported for dtype r^   r
   )r  r  r  rj   r{   rP   r   r(   r)   create_atomic_rmwr	   	ATOMIC_OPMAXr   UMAXrI   rK   r9   re   r+   rc   r   r  rb   rd   r   r  r   UMINwherer  r  r  r  r  r!   sca_tyr  i_typei_vali_ptrui_typeui_valui_ptrposnegpos_retneg_retr   r   r   r   
atomic_max,  J     r(  c                 C  s  t | ||d|\} }}t|}t|}|jj}| rG| r3t|	t
jj| j|j|j|||jS t|	t
jj| j|j|j|||jS |tjtjhvrVtd| tg d||}|tjkretjntj}t|||}	t| t|d|}
|tjkrtjntj}t|||}t| t|d|}t|||}t|||}t|	t
jj|
j|	jt|||j|||	j}t|	t
jj|j|jt|||j|||j}t||||}t|||S )Nr   z#atomic_min not supported for dtype r^   r
   )r  r  r  rj   r{   rP   r   r(   r)   r  r	   r  MINr   r  rI   rK   r9   re   r+   rc   r   r  rb   rd   r   r  r   r  r  r  r   r   r   
atomic_minS  r)  r+  c              
   C  sj   t | ||d|\} }}t|}t|}|jj}| rtjjntjj	}t
||| j|j|j|||jS )Nr   )r  r  r  rj   r{   rs   r	   r  FADDADDr(   r)   r  r   )r  r  r  r  r  r!   r  r  r   r   r   
atomic_addz  s   $r.  c              
   C  N   t | ||d|\} }}t|}t|}t|tjj| j	|j	|j	|||j
S )Nand)r  r  r  r(   r)   r  r	   r  ANDr   rj   r  r  r  r  r  r!   r   r   r   
atomic_and      r3  c              
   C  r/  )Nor)r  r  r  r(   r)   r  r	   r  ORr   rj   r2  r   r   r   	atomic_or  r4  r7  c              
   C  r/  )Nxor)r  r  r  r(   r)   r  r	   r  XORr   rj   r2  r   r   r   
atomic_xor  r4  r:  c              
   C  r/  )Nxchg)r  r  r  r(   r)   r  r	   r  XCHGr   rj   r2  r   r   r   atomic_xchg  s    r=  c                 C  sH   |   |jjv sJ d|jj d|  |  } | dkrd} ttj| S )Nzinput_precision must be one of z. Got TF32X3TF32x3)lowerr   allowed_dot_input_precisionsupperr$  r	   INPUT_PRECISION)input_precisionr!   r   r   r   _str_to_dot_input_precision  s   rE  accrD  max_num_imprecise_acc	out_dtypec              
   C  s  | j  r
|j  sJ | j r|j rn@| jtjtjtjtjtj	fv s.J d| j |jtjtjtjtjtj	fv sEJ d|j | j|jksWJ d| j d|j | j
 sa|j
 rot| tj|} t|tj|}|d u rw|jj}t||}t| j}t|j}||  krdksn ||  krdksn J d| j d|j d	| jd
 j|jd jksJ d| j d|j d| jd
 j d|jd j d		|jdd usJ d|jd | j |j }	| jd j|	d kr| jd
 j|	d kr|jd
 j|	d ksJ d|	d  d|	d  d|	d  | j j r6| j jtjks-J d|d}
tj}n1| r?td| j j sM| j j rV|d}
tj	}n| r`|dn|d}
|}| j jd }|j jd
 }| j jd
 }|dkr| j jd nd }t||r|||gn||g}|d u r||
|r|||gn||g}n|j }|j |ksJ |d u r| j r|j r|jj!}nd}n| j r|j r||krtd| d| d	t"|#| j |j ||||S )NzUnsupported lhs dtype zUnsupported rhs dtype z&Both operands must be same dtype. Got r   r      +Both inputs must be either 2D or 3D; (lhs: 	 vs rhs: r   r=  zFirst input shape (z) and second input shape z= are not compatible for matmul (second index of first shape (z0) must be equal to first index of second shape (min_dot_sizez2target doesn't provide lower shape bounds for dot.r   r
   zInput shapes should have M >= z, N >= z
 and K >= zonly int8 supported!zhout_dtype=bfloat16 is unsupported. Please use out_dtype=float32/float16 and cast with `.to(tl.bfloat16)`zmax_num_imprecise_acc (z) must be <= K ()$rj   r   r6   rO   r(   r  uint8rG   rH   rI   rv  r}   r   default_dot_input_precisionrE  r(  r   rF   rw  rx  r{   rP   	get_int32r+   rN   r'   rL   get_fp32rM   get_fp16r   r)  r   max_num_imprecise_acc_defaultr)   
create_dot)ru   rw   rF  rD  rG  rH  r!   lhs_rankrhs_rankrM  r   ret_scalar_tyMNKBr   
acc_handler   r   r   dot  s   

$


F0, 


 
"

"r]  float_formatc                 C  s`   | dkrt jjS | dkrt jjS | dkrt jjS | dkr t jjS | dkr(t jjS td|  d)Ne4m3e5m2e2m3e3m2e2m1zInvalid float format: rY   )r	   F8F6F4TYE4M3E5M2E2M3E3M2E2M1r'   )r^  r   r   r   _str_to_fp_type  s   rj  	lhs_scale	rhs_scaletl.tensor | Nonec	                 C  s  | j  r
|j  sJ t| j}	t|j}
|	|
  kr dks9n |	|
  kr+dks9n J d| j d|j dt|}t|}|dv sLJ d| |dv sWJ d	| t|tjoa|jd u }|shJ d
| j jd }|j jdd  \}}|dkr~dnd}||| j jd  ksJ d| j d|j d|dksJ d||	dkr| j jd nd }t	||r|||gn||g}|
d}|d u r|||r|||gn||g}n
|j}|j |ksJ |rd n|j}t|| j|j||j||||S )Nr   rI  rJ  rK  r   )rc  r_  r`  zNYI: lhs_format )r_  r`  zNYI: rhs_format zNYI: rhs_scale not supportedrL  rc  r
   r=  zCReduction dimension should pack the same number of elements; (lhs: r   z!scaled_dot NYI for K < 64. Got K=r   )rj   r   r(  r   rj  r_   r(   rh   rF   r   rQ  r)  r   r)   create_dot_scaled)ru   rk  
lhs_formatrw   rl  
rhs_formatrF  rH  r!   rU  rV  lhs_format_enumrhs_format_enumrhs_scale_is_nonerX  rZ  rY  PACKEDr[  r   r   r\  rhs_scale_handler   r   r   
dot_scaled  sB   

F


 rv  	conditionc                 C  s   | j tjkrtd| j   t| tj|} t|||dd\}}| j r6t	| ||\} }t	|||\}}nt	| ||\} }|j}t
|| j|j|j|S )Nzgtl.where with a non-boolean condition is deprecated and will error out in a future triton release. Got T)r6   r(   ra   warningswarnr}   r   rj   r   r~   r)   create_selectr   )rw  rk   r   r!   r`  r   r   r   r   r  0  s   

r  c                 C  s"   |r	t ||}n|}t | |S rq   )r(   r   r)   )rk   rQ   ra  res_tyr   r   r   wrap_tensorF  s   r|  inputsSequence[tl.tensor]Tuple[tl.tensor, ...]c                   s    d u rt fddD d d jjt} |k s'J d| d fddtD tfddD sAJ d	d
d D  |   t fddttD S )Nc                 3  s&    | ]}t ||jjgd  dV  qdS )Tr>  N)r.  r#  rF   r2  tr!   r   r   rL  Q  s   $ zreduction.<locals>.<genexpr>r   z&reduction axis must be < inputs rank (r   c                   s   g | ]
\}}| kr|qS r   r   )r2  rW  r-  )r   r   r   r3  W  s    zreduction.<locals>.<listcomp>c                 3  s    | ]	}|j j kV  qd S rq   )rj   r   r  rM  r   r   rL  X  s    z-all reduction inputs must have the same shapec                 S  r  r   r  r  r   r   r   r3  Z  r  c                 3  *    | ]}t | | jjV  qd S rq   r|  
get_resultrj   r{   r2  rW  )r}  	reduce_opra  r   r   rL  ^     ( )	tuplerj   r   r(  rT  allcreate_reduceverifyr  )r}  r   region_builder_fnr!   rankr   )r   r!   r}  r  ra  r   r   	reductionO  s   "r  reversec                   s    d j jt}| |  kr|k s!n J d| d| d|dk r)||7 } D ]}|j jks7J dq+|dd  D |||   t fdd	tt D S )
Nr   z
scan axis z must be < inputs rank (r   z(all scan inputs must have the same shapec                 S  r  r   r  r  r   r   r   r3  s  r  z$associative_scan.<locals>.<listcomp>c                 3  r  rq   r  r  r}  scan_opr   r   r   rL  w  r  z#associative_scan.<locals>.<genexpr>)rj   r   r(  create_scanr  r  r  )r}  r   r  r  r!   r  r  r   r  r   associative_scanf  s   ."r  num_binsc                 C  sJ   t | jdksJ d| j sJ dt|| j|ttj	|fS )Nr
   z histogram only supports 1D inputz%histogram only supports integer input)
r(  r   r6   rP   r(   r)   create_histogramr   r   r+   )r   r  r!   r   r   r   	histogram  s   "r  valuesc                 C  s@   t dt| jt|krtd| jdt|| j  | S )Nr
   zAShape of input to multiple_of does not match the length of valuesztt.divisibility)	r   r(  r   r'   r   set_attrr	   	make_attrget_contextrk   r  r   r   r   multiple_of  s   r  c                 C  :   t | jt |krtd| jdt|| j  | S )NzDShape of input to max_contiguous does not match the length of valuesztt.contiguityr(  r   r'   r   r  r	   r  r  r  r   r   r   max_contiguous     r  c                 C  r  )NzCShape of input to max_constancy does not match the length of valuesztt.constancyr  r  r   r   r   max_constancy  r  r  c                 C  s   t |  t jS rq   )r(   r)   create_barrierr  r  r   r   r   debug_barrier  s   r  prefixargshexc                 C  s   |  ds|r| d7 } |  ds|r| d d d } t| dkr)| ds)d|  } dd |D }dd |D }t|| |||tjS )N rR  r=  r   c                 S  r  r   r  r2  argr   r   r   r3    r  z device_print.<locals>.<listcomp>c                 S  s*   g | ]}|j tjtjtjtjtjfv qS r   )r6   r(   ra   r  r  r+   rc   r  r   r   r   r3    s   * )endswithr(  
startswithr(   r)   create_printr  )r  r  r  r!   new_args	is_signedr   r   r   device_print  s   r  r   r   c                 C  s$   |j jsd S t|| j|tjS rq   )r   debugr(   r)   create_assertr   r  )r   r   r!   r   r   r   r     s   r   c                 C  r  rq   )r(   r)   create_assumer   r  )r   r!   r   r   r   assume  r  r  c                 C  s  t |tr
t|}t |tjrH|r-d|j  krdk s'n J d|j d| |jS d|j  kr8dk sBn J d|j d| |jS t |tjr|jjdksXJ d	|j	
 saJ d
|j	tjkrv|rv| |j|  |j	 S |j	tjkr|sJ d|jS J dt| )NrW   rX   z@Block pointers only support 64 bit `shape/strides`, got a value z which is out of the rangerU   rV   zFBlock pointers only support 32 bit `offsets/block_shape`, got a value r
   z*Expected a scalar in shape/strides/offsetsz8Expected an integer scalar type in shape/strides/offsetsFzzBlock pointers only support 32 bit `offsets/block_shape`, add a `.to(tl.int32)` or use regular indexing for 64 bit supportz3Unsupported element type in shape/strides/offsets: )r_   r    r(   rh   rF   r   rP  r)   r#  r6   rP   rc   r}  r   get_int64_tyr   r+   rj   )r!   r  r  r   r   r   _convert_elem_to_ir_value  s*   



r  c                   s,   t |dr fdd|D S t |gS )Nr  c                   s   g | ]}t  |qS r   )r  r  r!   r  r   r   r3    s    z)_convert_to_ir_values.<locals>.<listcomp>)r  r  )r!   	list_liker  r   r  r   r    s   
r  basec              	     s8  t ||}t ||}t ||dd}| j r| jj r td| jjtjkr4t| t	tj
| jj|} t ds< g dd  D  tdd  D sPJ d	t|dsX|g}d
d |D }t|ttt|ksoJ dt fdd||||fD sJ d|| j||| |}t|t	t| jj S )NFr  zMExpected `base` to be a pointer type (but not a block pointer type or others)r  c                 S  r  r   r  r  r   r   r   r3    r  z"make_block_ptr.<locals>.<listcomp>c                 s  s2    | ]}t |tod |  kodk n  V  qdS )rU   rV   N)r_   r    r  r   r   r   rL    s   0 z!make_block_ptr.<locals>.<genexpr>zGExpected a list of constant integers (`int32_t` range) in `block_shape`c                 S  r  r   r  r  r   r   r   r3    r  z<Expected a permutation of (0, 1, ..., len(order)-1) in orderc                 3  s     | ]}t  t |kV  qd S rq   )r(  )r2  r  r  r   r   rL    s    zBExpected shape/strides/offsets/block_shape to have the same length)r  rj   rr   r  r   r'   r(   ra   r}   r  r  r  r  r  rN  rO  r  r(  create_make_block_ptrr   r)   r   )r  r   stridesr  r  orderr!   r   r   r  r   make_block_ptr  s,   



  r  c                 C  s&   t ||dd}t|| j|| jS r  )r  r(   r)   create_advancer   rj   )r  r  r!   r   r   r   advance  s   r  )r   r    r!   r"   r#   r$   )r1   r2   r3   r2   r#   r2   )r1   r2   r?   r@   r3   r2   rA   r@   rB   r@   r#   r2   )T)rT   r@   )r   r2   r   r2   ro   r@   r#   rp   )FFTF)ru   rv   rw   rv   r!   r"   r#   rx   )ru   r$   rw   r$   r!   r"   r   r   )
r   rv   r   rv   r   r@   r!   r"   r#   r$   )r   rv   r   rv   r!   r"   r#   r$   )
r   rv   r   rv   r   r@   r!   r"   r#   r$   )rk   r$   r   r$   r   r   r!   r"   )
rk   r$   r   r$   r   r$   r   r   r!   r"   )r   r$   r   r$   r!   r"   r#   rx   )r   r$   r   r$   r!   r"   r#   r$   )r   r$   r!   r"   )r   r$   r#   r$   )r   r$   r!   r"   r#   r$   )r   r$   r!   r$   r#   r$   )r   r$   r#   r   )r  r    r  r    r!   r"   r#   r$   )r   r!  r6   r2   r!   r"   r#   r$   )rF   r$   r   r!  r!   r"   r#   r$   )
r   r$   r*  r!  r+  r@   r!   r"   r#   r$   )r   r$   r   r    r!   r"   r#   r$   )
ru   r$   rw   r$   r+  r@   r!   r"   r#   r$   )r;  r$   r<  r$   r!   r"   r#   r$   )r;  r$   r!   r"   r#   rx   )r   r$   rH  rI  r!   r"   r#   r$   )r   r$   r   r!  r!   r"   r#   r$   )ru   r$   rw   r$   r!   r"   r#   r$   )rd  re  )r   r$   rl  r2   r!   r"   r#   r$   rq   )
r   r$   rl  r2   r!   r"   rt  re  r#   r$   )r  r$   r  r  r   r  r  r   r  r[  r  r[  r  r[  r  r@   r!   r"   r#   r$   )
r  r$   r  r[  r  r[  r!   r"   r#   r$   )r  r$   rF   r$   r!   r"   r#   r$   )r  r$   r  r$   r  r  r  r  r  r  r  r  r  r    r  r    r  r    r  r    r!   r"   r#   r$   )r  r$   r!   r"   r#   r$   )r  r$   r  r$   r  r  r  r[  r  r[  r!   r"   r#   r$   )r  r$   r  r$   r  r$   r  r[  r  r[  r!   r"   r#   r$   )r  r$   r  r$   r  r$   r  r[  r!   r"   r#   r  )r  r$   r  r$   r  r$   r  r[  r  r[  r!   r"   r#   r$   )ru   r$   rw   r$   rF  r$   rD  re  rG  r    rH  r2   r!   r"   r#   r$   )r^  re  )ru   r$   rk  r$   rw   r$   rl  r  rF  rm  rH  r2   r!   r"   r#   r$   )
rw  r$   rk   r$   r   r$   r!   r"   r#   r$   )r}  r~  r   r    r!   r"   r#   r  )
r}  r~  r   r    r  r@   r!   r"   r#   r  )r   r$   r  r    r!   r"   r#   r$   )rk   r$   r  r!  r#   r$   )r!   r"   r#   r$   )
r  r[  r  r  r  r@   r!   r"   r#   r$   )r   r$   r   r[  r!   r"   r#   r$   )r  r$   r!   r"   r#   r$   )u
__future__r   rx  typingr   r   r   r   r   ry   _C.libtritonr	    r   r(   r   r   	Exceptionr   r-   r0   r>   rS   ri   rt   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r   r  r   r  r  r   re   r&  r.  r7  r:  rC  rG  rQ  rY  r~   rk  r   r}   r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r
  r  r  r(  r+  r.  r3  r7  r:  r=  rE  r]  rj  rv  r  r|  r  r  r  r  r  r  r  r  r   r  r  r  r  r  r   r   r   r   <module>   s    2+!


:
t>	,	''		H(		'