o
    پi                     @   s  d dl Z d dlmZ d dlmZmZmZ d dlZd dlm	Z	 d dlm
Z
mZmZ d dlmZmZ d dlmZmZmZ ee	jjejjdZee	jjejjdZee	jjejjdZee	jjdejejjdZeddd	d
e	jde	jde	jfddZe	j d
e
e	jB de
fddZ!eddd	de	jdedefddZ"eddd	de#e
B eB ej$B de	jde	jde	jj%ddf
ddZ&eddd	dee#e
f dee#e
f de
fddZ'eddd	de#e
B de
fddZ(eddd	de#e
B defdd Z)eddd	de*eB de*eB d!e*eB defd"d#Z+e	j d$e	jd%ee	j d&e	j,ddfd'd(Z-eddd	de
de
dej$fd)d*Z.eddd	d!ej$dee
e
f fd+d,Z/e	j d5ded-ee defd.d/Z0eddd	de*eB d0e	jdefd1d2Z1eddd	de*eB d0e	jdefd3d4Z2dS )6    N)partial)OptionalTupleUnion)Float32Int32
const_expr)Tdsl_user_op)llvmnvvmvector)rnd)src_c	calc_funcr   locipxcoordreturnc                C   s   | j tj|| j||d S Nr   )iteratorcutecrd2idxlayout)r   r   r   r    r   ?/home/ubuntu/.local/lib/python3.10/site-packages/quack/utils.pyelem_pointer   s   r   c                 C   s<   t t| tjrtt| tdd S t| tsJ | S )N   r   )r   
isinstancer   Pointerr   make_tensormake_layout)r   r   r   r   load_scalar_or_pointer    s   r$   smem_ptrpeer_cta_rank_in_clusterc             
   C   s>   | j ||d }ttjt || gddddtjjdS )zMMap the given smem pointer to the address at another CTA rank in the cluster.r   z$mapa.shared::cluster.u32 $0, $1, $2;z=r,r,rFhas_side_effectsis_align_stackasm_dialect)	tointir_valuer   r   
inline_asmr	   i32
AsmDialectAD_ATT)r%   r&   r   r   smem_ptr_i32r   r   r   set_block_rank)   s   
r2   valmbar_ptrc          
   	   C   s   t ||||d }t ||||d }tt| trt| } t| tttjfs,J dtdtdtjdit	|  }tdtdtjdit	|  }	t
jd || j||d|gd	| d
d|	 dddt
jjd d S )Nr   z$val must be Float32, Int32, or Int64f32s32s64frlz6st.async.shared::cluster.mbarrier::complete_tx::bytes.z [$0], $1, [$2];zr,z,rTFr'   )r2   r,   r   r    floatr   r   cutlassInt64typer   r-   r/   r0   )
r3   r%   r4   r&   r   r   remote_smem_ptr_i32remote_mbar_ptr_i32suffix
constraintr   r   r   store_shared_remote<   s0   



rC   abc                C   s8   t tjt t | j||dt |j||d||dS r   )r   r   fminr	   r5   r,   )rD   rE   r   r   r   r   r   rF   \   s   rF   c             
   C   s4   t tjt t | j||dgddddtjjdS )Nr   zsqrt.approx.f32 $0, $1;z=f,fFr'   )r   r   r-   r	   r5   r,   r/   r0   rD   r   r   r   r   r   sqrti      rH   c             
   C   s4   t tjt t| j||dgddddtjjdS )Nr   zcvt.rpi.ftz.s32.f32 $0, $1;z=r,fFr'   )	r   r   r-   r	   r.   r   r,   r/   r0   rG   r   r   r   ceilx   rI   rJ   cc             
   C   sT   t tjt t | j||dt |j||dt |j||dgddddtjjdS )Nr   zprmt.b32 $0, $1, $2, $3;z=r,r,r,rFr'   )r   r   r-   r	   r.   r,   r/   r0   )rD   rE   rK   r   r   r   r   r   prmt   s   rL   tXsXtXpX
fill_valuec              	   C   s   t | d }|| t| jd d D ]5}t| jd D ]*}t|dur=||d|f s<t || d|fd|f  q t || d|fd|f  q qdS )zFill out-of-bounds values in shared memory tensor.

    Args:
        tXsX: Shared memory tensor to fill
        tXpX: Predicate tensor indicating valid elements
        fill_value: Value to fill OOB locations with
    ))Nr   Nr   r   r      N)r   make_fragment_likefillr<   range_constexprshaper   autovec_copy)rM   rN   rO   	tXrX_fillrest_vrest_kr   r   r   fill_oob   s   	
rY   c             	   C   sb   t jt dt |  | f||d}t t dt |}tt j	|g dg||d}|S )NrP   r   r   r   dynamic_positionstatic_positionr   r   )
r   from_elementsr	   r5   r,   bitcasti64r<   r=   extract)rD   rE   r   r   	vec_f32x2	vec_i64x1resr   r   r   f32x2_to_i64   s    rd   c                C   sx   t jt dt |  f||d}t t dt |}tt j|g dg||d}tt j|g dg||d}||fS )Nr   r   rP   r   rZ   )	r   r]   r	   r_   r,   r^   r5   r   r`   )rK   r   r   rb   ra   res0res1r   r   r   i64_to_f32x2   s   $rg   lanec                 C   sb   t |d u rtj }ttttjj	D ]}d|> }tjj
| |dd}||kr.| |7 } q| S )Nr   r   )offsetmask_and_clamp)r   r   archlane_idxr<   rS   intmathlog2	WARP_SIZEshuffle_sync_up)r3   rh   iri   partial_sumr   r   r   warp_prefix_sum   s   
rt   gmem_ptrc                C   $   t jt t jj|jt|  dS N)rc   opptrrD   )	r   	atomicrmwr	   r.   AtomicOpKindADDllvm_ptrr   r,   rD   ru   r   r   r   r   r   atomic_add_i32      r   c                C   rv   rw   )	r   rz   r	   r.   r{   INCr}   r   r,   r~   r   r   r   atomic_inc_i32   r   r   )N)3rn   	functoolsr   typingr   r   r   r<   cutlass.cuter   r   r   r   cutlass.cutlass_dslr	   r
   cutlass._mlir.dialectsr   r   r   rk   fma_packed_f32x2RoundingModeKindRNmul_packed_f32x2add_packed_f32x2calc_packed_f32x2_opsub_packed_f32x2TensorCoordr!   r   jitr$   r2   r;   r=   IntrC   rF   rH   rJ   rm   rL   NumericrY   rd   rg   rt   r   r   r   r   r   r   <module>   s   &0  0&"&&*