o
    c۷i'                     @   s  d dl Z d dlmZmZmZ d dlZd dlmZ d dlmZm	Z	m
Z
 d dlmZmZ d dlmZmZmZ eddddejdejd	ejfd
dZejdeejB d	efddZeddddejde	d	e	fddZeddddeeB e	B ejB dejdejdejjd	df
ddZeddddee	B dee	B dee	B dee	B dejdejdejjd	dfddZeddddeeef deeef d	efddZeddddeeB d	efd d!ZeddddeeB d	e	fd"d#Z ejd$ejd%eej d&ej!d	dfd'd(Z"eddddeded	ejfd)d*Z#edddd+ejd	eeef fd,d-Z$ejd9de	d.ee	 d	e	fd/d0Z%edddde&e	B d1ejd	e	fd2d3Z'edddde&e	B d1ejd	e	fd4d5Z(e		d:dejd6ejd	dfd7d8Z)dS );    N)OptionalTupleUnion)Float32Int32
const_expr)Tdsl_user_op)llvmnvvmvectorlocipxcoordreturnc                C   s   | j tj|| j||d S )Nr   )iteratorcutecrd2idxlayout)r   r   r   r    r   A/home/ubuntu/vllm_env/lib/python3.10/site-packages/quack/utils.pyelem_pointer   s   r   c                 C   s<   t t| tjrtt| tdd S t| tsJ | S )N   r   )r   
isinstancer   Pointerr   make_tensormake_layout)r   r   r   r   load_scalar_or_pointer   s   r   smem_ptrpeer_cta_rank_in_clusterc             
   C   s>   | j ||d }ttjt || gddddtjjdS )zMMap the given smem pointer to the address at another CTA rank in the cluster.r   z$mapa.shared::cluster.u32 $0, $1, $2;z=r,r,rFhas_side_effectsis_align_stackasm_dialect)	tointir_valuer   r
   
inline_asmr   i32
AsmDialectAD_ATT)r    r!   r   r   smem_ptr_i32r   r   r   set_block_rank   s   
r-   valmbar_ptrc          
   	   C   s   t ||||d }t ||||d }tt| trt| } t| tttjfs,J dtdtdtjdit	|  }tdtdtjdit	|  }	t
jd || j||d|gd	| d
d|	 dddt
jjd d S )Nr   z$val must be Float32, Int32, or Int64f32s32s64frlz6st.async.shared::cluster.mbarrier::complete_tx::bytes.z [$0], $1, [$2];zr,z,rTFr"   )r-   r'   r   r   floatr   r   cutlassInt64typer
   r(   r*   r+   )
r.   r    r/   r!   r   r   remote_smem_ptr_i32remote_mbar_ptr_i32suffix
constraintr   r   r   store_shared_remote/   s0   



r>   val0val1val2val3c                C   s  t ||||d }	t ||||d }
t| ttfsJ dt| tr&tnt}tdtdi| }tdtdi| }tjd |	|
|| j||d||j||d||j||d||j||dgd| d| d	| d
| d| d| dd| d| d| d| ddtjjd d S )Nr   zval must be Float32, or Int32r0   r1   r3   r4   z{
	.reg .v4 .z abcd;
	mov.z abcd.x, $2;
	mov.z abcd.y, $3;
	mov.z abcd.z, $4;
	mov.zG abcd.w, $5;
	st.async.shared::cluster.mbarrier::complete_tx::bytes.v4.z [$0], abcd, [$1];
	}
zr,r,,TFr"   )	r-   r'   r   r   r   r
   r(   r*   r+   )r?   r@   rA   rB   r    r/   r!   r   r   r:   r;   dtyper<   r=   r   r   r   store_shared_remote_x4O   sR   
rE   abc                C   s|   t t jjdkr%ttjt t| j	||dt|j	||d||dS ttjt| j	||dt|j	||d||dS )N   r   )
r7   r   CUDA_VERSIONmajorr   r   fminr   r0   r'   )rF   rG   r   r   r   r   r   rK      s$   	rK   c             
   C   s4   t tjt t | j||dgddddtjjdS )Nr   zsqrt.approx.f32 $0, $1;z=f,fFr"   )r   r
   r(   r   r0   r'   r*   r+   rF   r   r   r   r   r   sqrt      rM   c             
   C   s4   t tjt t| j||dgddddtjjdS )Nr   zcvt.rpi.ftz.s32.f32 $0, $1;z=r,fFr"   )	r   r
   r(   r   r)   r   r'   r*   r+   rL   r   r   r   ceil   rN   rO   tXsXtXpX
fill_valuec              	   C   s   t | d }|| t| jd d D ]5}t| jd D ]*}t|dur=||d|f s<t || d|fd|f  q t || d|fd|f  q qdS )zFill out-of-bounds values in shared memory tensor.

    Args:
        tXsX: Shared memory tensor to fill
        tXpX: Predicate tensor indicating valid elements
        fill_value: Value to fill OOB locations with
    ))Nr   Nr   r   r      N)r   make_fragment_likefillr7   range_constexprshaper   autovec_copy)rP   rQ   rR   	tXrX_fillrest_vrest_kr   r   r   fill_oob   s   	
r\   c             	   C   sb   t jt dt |  | f||d}t t dt |}tt j	|g dg||d}|S )NrS   r   r   r   dynamic_positionstatic_positionr   r   )
r   from_elementsr   r0   r'   bitcasti64r7   r8   extract)rF   rG   r   r   	vec_f32x2	vec_i64x1resr   r   r   f32x2_to_i64   s    rg   cc                C   sx   t jt dt |  f||d}t t dt |}tt j|g dg||d}tt j|g dg||d}||fS )Nr   r   rS   r   r]   )	r   r`   r   rb   r'   ra   r0   r   rc   )rh   r   r   re   rd   res0res1r   r   r   i64_to_f32x2   s   $rk   lanec                 C   sb   t |d u rtj }ttttjj	D ]}d|> }tjj
| |dd}||kr.| |7 } q| S )Nr   r   )offsetmask_and_clamp)r   r   archlane_idxr7   rV   intmathlog2	WARP_SIZEshuffle_sync_up)r.   rl   irm   partial_sumr   r   r   warp_prefix_sum   s   
rx   gmem_ptrc                C   b   ddl m} |jdkr"|jdkr"tjt tjj	|j
t|  dS tjtjj	|j
t|  dS Nr   )rI   rH   	   )rf   opptrrF   )r}   r~   rF   )r7   rI   rJ   minorr   	atomicrmwr   r)   AtomicOpKindINCllvm_ptrr   r'   rF   ry   r   r   rI   r   r   r   atomic_inc_i32      r   c                C   rz   r{   )r7   rI   rJ   r   r   r   r   r)   r   ADDr   r   r'   r   r   r   r   atomic_add_i32   r   r   clc_response_ptrc                 C   s"   | j }|j }tj||||d dS )aN  
    The clusterlaunchcontrol.try_cancel instruction requests atomically cancelling the launch
    of a cluster that has not started running yet. It asynchronously writes an opaque response
    to shared memory indicating whether the operation succeeded or failed. On success, the
    opaque response contains the ctaid of the first CTA of the canceled cluster.

    :param mbar_ptr: A pointer to the mbarrier address in SMEM
    :type mbar_ptr:  Pointer
    :param clc_response_ptr: A pointer to the cluster launch control response address in SMEM
    :type clc_response_ptr:  Pointer
    r   N)r   r   clusterlaunchcontrol_try_cancel)r/   r   r   r   mbar_llvm_ptrclc_response_llvm_ptrr   r   r   issue_clc_query_nomulticast  s   
r   )N)NN)*rr   typingr   r   r   r7   cutlass.cuter   r   r   r   cutlass.cutlass_dslr   r	   cutlass._mlir.dialectsr
   r   r   TensorCoordr   r   jitr   r-   r6   r8   Intr>   rE   rK   rM   rO   Numericr\   rg   rk   rx   rq   r   r   r   r   r   r   r   <module>   s   &
/0  &"&&&