o
    پi=                     @   s  d dl Z d dlZd dlZd dlZd dlmZmZmZmZm	Z	 d dl
mZ d dlZd dlmZ d dlmZmZ d dlmZmZ d dlmZmZ d dlmZ eejjejjdZeejjejjdZeejjejjdZeejjdej ejjd	Z ddede!fddZ"dd Z#ddej$fddZ%	ddej$fddZ&	ddej'dej(dej)e* dej+fddZ,	ddej'dej(dej)e* dej+fddZ-	ddej$d ej.j/dej)e* dej$fd!d"Z0	ddej$d ej.j/dej)e* dej$fd#d$Z1	dd%ej)e2 d&eej3 d'e*dej'fd(d)Z4ej5ejj6fd*ej7ej3B d+ed,ej)e2 dej7ej3B fd-d.Z8dd/ej9d'e*dej9fd0d1Z:dd2ej$d'e*dej$fd3d4Z;ej5d/ej9dej9fd5d6Z<d2ej$dej$fd7d8Z=d9ej$d:e>e2 dej$fd;d<Z?d9ej$dej$fd=d>Z@d?ejAdejBfd@dAZCej5dBej7eB dej7eB fdCdDZDedddEd9eEeB defdFdGZFedddEd9eEeB defdHdIZGe	ddddEd9eEeB dJeEeB dKeEeB dB defdLdMZHej5	NddBej7dOeEeB dB d%ej)e2 defdPdQZIej5	NddBej7dOeEeB dB d%ej)e2 defdRdSZJedddEd9eEeB dTejAddfdUdVZKedddEdBej$dWejLdejAfdXdYZMedddEdBej$dWejLdejAfdZd[ZNej5d\ej$d]ejOdej$fd^d_ZPdd`e*dejOfdadbZQej5ejj6fdcej3ddejjRd,ej)e2 dej3fdedfZSedddEd*ejTdgejTdejTfdhdiZUej5dd*ejOdjeejO dejOfdkdlZVedddEd9eEeB dJeEeB dmedejOfdndoZWe	dpej$dqej$ddfdrdsZXe	dpej$dteej3 dej$fdudsZXej5dpej$fdvdsZXeej5dddEdBedweedxf defdydzZYeej5dddEdBed{edweedxf deeef fd|d}ZZedddEdBeEeB d{eEeB defd~dZ[edddEdededefddZ\edddEdBedefddZ]edddEdBed{edeeef fddZ^edddEdBed{edeeef fddZ_edddEdWejLdej$dej$fddZ`edddEdWejLdej$dej$fddZaedddEdej$dejjRde2dej$fddZbej5d9ej3dej7fddZcdd ZddS )    N)TypeCallableOptionalTupleoverload)partial)Float32
const_expr)Tdsl_user_op)nvvmllvm)from_dlpackrnd)src_c	calc_funcr   Tfuncreturnc           	   	   C   s   t | dr| jS t | dr| j}t |dr|jS |} z	t|  }W n  ttfyC   t | dr;| jdur;| jj	}nt
|  }Y nw t|}t | dri| jdurit| jD ]\}}|j}|t
|  qX| }|rr|| _|S )a  Hash a callable based on the source code or bytecode and closure values.

    Fast-path: if the callable (or its __wrapped__ base) has a ``__cute_hash__``
    attribute, that value is returned immediately. Code-generation backends such
    as Inductor can set this attribute to avoid expensive runtime hashing.

    set_cute_hash: whether or not to set func.__cute_hash__ if not present
    __cute_hash____wrapped____code__N__closure__)hasattrr   r   inspect	getsourceencodeOSError	TypeErrorr   co_codereprhashlibsha256r   	enumeratecell_contentsupdate	hexdigest)	r   set_cute_hash	base_funcdatahasheridxcell
cell_valuehash r/   `/home/ubuntu/.local/lib/python3.10/site-packages/sglang/jit_kernel/flash_attention/cute/utils.pyhash_callable   s0   
	



r1   c                    s   d|   t j fdd}|S )N      ?c                    s   |   }|t jj|dd S )NT)fastmath)cutemathtanh)	acc_S_SSA	batch_idxhead_idxq_idxkv_idxaux_tensorsscoresinv_softcapr/   r0   scoremod_premask_fnL   s   z4create_softcap_scoremod.<locals>.scoremod_premask_fn)r4   jit)softcap_valr@   r/   r>   r0   create_softcap_scoremodI   s   rC         c                 C   s$   t | |dj|dj||  |dS )Nassumed_align)leading_dim)modestride_orderdivisibility)r   mark_layout_dynamicmark_compact_shape_dynamic	dim_order)xrH   	alignmentrK   r/   r/   r0   convert_from_dlpackT   s   

rQ   c                 C   sV   |d u r|   }t| |d}t| jD ]}||kr(|d u s!||vr(|j||d}q|S )NrF   )rI   rJ   )rN   r   rangendimrM   )rO   rH   rP   static_modesrJ   x_ir/   r/   r0   "convert_from_dlpack_leading_static^   s   rW   F	copy_atom	tiled_mmaswapABc                 C       t |r
t| |S t| |S N)r	   r4   make_tiled_copy_Bmake_tiled_copy_ArX   rY   rZ   r/   r/   r0   r^   j      r^   c                 C   r[   r\   )r	   r4   r^   r]   r_   r/   r/   r0   r]   s   r`   r]   smemthr_mmac                 C   "   t |r	t| |S ||| S r\   )r	   mma_make_fragment_Bmake_fragment_Apartition_Ara   rb   rZ   r/   r/   r0   mma_make_fragment_A|      
rh   c                 C   rc   r\   )r	   rh   make_fragment_Bpartition_Brg   r/   r/   r0   rd      ri   rd   archelement_type	transposec                 C   sL   t | dk p	|jdkrtjtj |d|j dS ttjjj|dd|S )NZ   rD      )num_bits_per_copy   )rn   num_matrices)r	   widthr4   make_copy_atomnvgpuCopyUniversalOpwarpStMatrix8x8x16bOp)rl   rm   rn   r/   r/   r0   get_smem_store_atom   s   rz   valoprt   c                 C   s   t t| tjr/t| j| j}||  t	t
| jD ]}t|| ||||< q| S t	tt|D ]}|| tjj| d|> d} q9| S )NrE   )offset)r	   
isinstancer4   	TensorSSAmake_fragmentshapedtypestorecutlassrange_constexprsizewarp_reduceloadintr5   log2rl   shuffle_sync_bfly)r{   r|   rt   resrV   r/   r/   r0   r      s   
r   
acc_layoutc                 C   s  t | j}|jd d |jd f|jd d g|jd dd |jd R g|jdd R }|jd d |jd f|jd d g|jd dd |jd R g|jdd R }t|r~|d |d g|dd R }|d |d g|dd R }t j||d}t | |S )z
    For Sm80, convert ((2, 2), MMA_M, MMA_N, ...) to ((2, MMA_M), (2, MMA_N), ...).
    For Sm90, convert ((2, 2, V), MMA_M, MMA_N, ...) to ((2, MMA_M), (2, V, MMA_N), ...).
    r   rE   rp   N   stride)r4   make_layoutr   r   r	   composition)r   rn   acc_layout_col_majorr   r   acc_layout_mnr/   r/   r0   convert_layout_acc_mn   s4   
	r   accc                 C   s   t | jt| j|dS )N)rn   )r4   make_tensoriteratorr   layout)r   rn   r/   r/   r0   make_acc_tensor_mn_view      r   c                 C   s@  t t| jd dkrgt| d}tj|jd d |jd d |jd d d f|jd |jd d d |jd ff|jd d |jd d |jd d d f|jd |jd d d |jd ffd}|S t| d}tj|jd |jd d f|jd |jd d f|jd |jd d f|jd |jd d fd}|S )Nr   r   )NNrp   NNrE   rp   r   r   )r	   r4   rankr   logical_divider   r   )r   lrA_mma_viewr/   r/   r0   convert_layout_acc_frgA   s6   **r   c                 C   s   t | jt| jS r\   )r4   r   r   r   r   )r   r/   r/   r0   make_acc_tensor_frgA_view   s   r   arI   c                 C   s   t | jt | j|S r\   )r4   r   r   selectr   )r   rI   r/   r/   r0   r      r   r   c                 C   sT   | j d | j d g| j dd R }ddgtdt| R }t| tj||dS )z7Transpose the first two dimensions of a tensor on smem.rE   r   rp   N)order)r   rR   r4   r   r   make_ordered_layout)r   r   r   r/   r/   r0   transpose_view  s   $r   ptrc                 C   sf   t | jj}td|}|r,t|dt|dt|d}}}t|||S t	d| )ai  Extract swizzle parameters from a pointer's swizzle_type.

    The swizzle_type string has the form '!cute.swizzle<"S<b,m,s>">' where
    b, m, s are the swizzle parameters (bits, base, shift).

    Returns:
        A cute.Swizzle object constructed from the extracted parameters

    Raises:
        ValueError: If the swizzle_type string cannot be parsed
    zS<(\d+),(\d+),(\d+)>rE   rp   r   zCould not parse swizzle_type: )
strtypeswizzle_typeresearchr   groupr4   make_swizzle
ValueError)r   swizzle_strmatchbmsr/   r/   r0   parse_swizzle_from_pointer  s   .r   rO   c                 C   sh   t t| tjr.t| jt}||  t	t
| jD ]}tj|| ||< q| S tj| S )zexp2f calculation for both vector and scalar.
    :param x: input value
    :type x: cute.TensorSSA or Float32
    :return: exp2 value
    :rtype: cute.TensorSSA or Float32
    )r	   r~   r4   r   r   r   r   r   r   r   r   rl   exp2r   )rO   r   rV   r/   r/   r0   exp2f$  s   
r   locipc             
   C   s4   t tjt t | j||dgddddtjjdS )Nr   zlg2.approx.ftz.f32 $0, $1;z=f,fFhas_side_effectsis_align_stackasm_dialect)r   r   
inline_asmr
   f32ir_value
AsmDialectAD_ATTr   r   r   r/   r/   r0   log2f6  s   r   c                C   s   t | ||dtd S )Nr   g       @)r   r5   logr   r/   r/   r0   logfE  s   r   r   cc             	   C   sT   t tjt t | j||dt |j||d|d ur#t |j||dnd ||dS )Nr   )r   r   r   )r   r   fmaxr
   r   r   )r   r   r   r   r   r/   r/   r0   r   J  s   r   P   init_valc                 C   s  t |dk pt| jd dkrt| jt}||  |d |d |d |d g}tdt| jdD ]6}t	|d ||d  |d< t	|d ||d  |d< t	|d ||d  |d< t	|d ||d  |d< q5t	|d |d |d< t	|d |d |d< t	|d |d |d< t |d u r|d S t	|d |S t| jt}||  t |d urt	||d |d nt	|d |d }|t	|d |d t	|d |d t	|d	 |d
 g}tdt| jdD ]H}t	|d || ||d  |d< t	|d ||d  ||d  |d< t	|d ||d  ||d  |d< t	|d ||d	  ||d
  |d< qt	|d |d |d< t	|d |d |d S )Nd      r   rE   rp   r   rr            )
r	   r4   r   r   r   r   r   r   r   r   )rO   r   rl   r   	local_maxrV   local_max_0r/   r/   r0   fmax_reduceZ  s>    
"

 $$&r   c                 C   s  t |dk pt| jd dkr"t |d u rtj}| tjj|dS t	| jt}|
|  t |d urAt|df|d |d fn|d |d f}||d |d f|d |d	 f|d
 |d fg}tdt| jdD ]N}t|d ||d  ||d  f|d< t|d ||d  ||d  f|d< t|d ||d  ||d	  f|d< t|d ||d
  ||d  f|d< qlt|d |d |d< t|d |d |d< t|d |d |d< |d d |d d  S )Nr   r   r   g        rE   rp   r   rr   r   r   r   )r	   r4   r   r   r   zeroreduceReductionOpADDr   r   add_packed_f32x2r   r   )rO   r   rl   r   local_sum_0	local_sumrV   r/   r/   r0   fadd_reduce  s(    

0&&&(r   gmem_ptrc                C   s(   t jt t jj|jt|  d d S )N)r   r|   r   r   )	r   	atomicrmwr
   r   AtomicOpKindFADDllvm_ptrr   r   )r   r   r   r   r/   r/   r0   atomic_add_fp32  s   
r   coordc                C   s   | j tj|| j||d S )Nr   )r   r4   crd2idxr   )rO   r   r   r   r/   r/   r0   elem_pointer  s   r   c                C   s   t dd t|D }t| j}t|t|ksJ dtdd t||D }|| jj	 d }tj
| j| j | | j| jjdS )Nc                 s       | ]}t |V  qd S r\   r   Int64.0r   r/   r/   r0   	<genexpr>      z#elem_pointer_i64.<locals>.<genexpr>/Coordinate and stride must have the same lengthc                 s       | ]	\}}|| V  qd S r\   r/   r   r   r   r/   r/   r0   r         r   rF   )tupler4   flattenflatten_to_tupler   lensumziprm   rt   make_ptrr   tointmemspacerP   )rO   r   r   r   flat_coord_i64flat_strider}   byte_offsetr/   r/   r0   elem_pointer_i64  s   r   tAcAlimitc              	   C   s   t t jt j| ddgdt j| dgdt j| dgdft j| dgdddfdtj}t|jd D ]!}t|jd D ]}t | d|fd|f d |||d|f< q=q3|S )Nr   rE   )rI   rp   r   )	r4   r   r   r   r   Booleanr   r   	elem_less)r   r   tApArest_vrest_kr/   r/   r0   predicate_k  s   .*r  syncc                 C   s*   t j d d }t| rt j|}|S )Nr      )r4   rl   
thread_idxr	   make_warp_uniform)r  warp_group_idxr/   r/   r0   canonical_warp_group_idx  s   r  valuer}   c           	      C   s   | j d dksJ dtjj| }tjjd }|d> |B }ttjdddt| }| |d< t|tj	}t
t|D ]}tjj|| ||d||< q=|d S )	N    r   z(value type must be a multiple of 32 bitsrE   r   )rE   r   )mask_and_clamp)rt   r4   rl   	WARP_SIZEmake_rmem_tensorr   r   recast_tensorr   Int32r   r   shuffle_sync)	r  r}   rt   maskclampr  r{   val_i32rV   r/   r/   r0   r    s   r  shiftc                C   sJ   t tjt t | j||dt |j||dgddddtjjdS )Nr   zshr.s32 $0, $1, $2;z=r,r,rFr   )	r   Uint32r   r   r
   i32r   r   r   )r{   r  r   r   r/   r/   r0   shr_u32"  s   r  lanec                 C   sb   t |d u rtj }ttttjj	D ]}d|> }tjj
| |dd}||kr.| |7 } q| S )NrE   r   )r}   r  )r	   r4   rl   lane_idxr   r   r   r5   r   r  shuffle_sync_up)r{   r  rV   r}   partial_sumr/   r/   r0   warp_prefix_sum4  s   
r  to_dtypec                C   st   |t jt jfv sJ dt tjt t| j	||dt|j	||dgd|t ju r,dnd ddddtj
jd	S )
Nz$to_dtype must be BFloat16 or Float16r   zcvt.rn.bf16x2f16x2z.f32 $0, $2, $1;z=r,f,fFr   )r   BFloat16Float16r  r   r   r
   r  r   r   r   r   )r   r   r   r   r   r/   r/   r0   cvt_f16x2_f32C  s   "r%  srcdstc                 C      d S r\   r/   )r&  r'  r/   r/   r0   cvt_f16U     r)  r   c                 C   r(  r\   r/   )r&  r   r/   r/   r0   r)  Y  r*  c                 C   s  t t|tr|}t| j|}t| | |S |}t|jt| jks)J dt| jd dks7J d|jt	j
t	jfv sDJ d| jtu sMJ dt|t	j}t|jd t| jksdJ t	t|D ]}t| d|  | d| d  |j||< qldS )	a  Convert Float32 tensor to Float16/BFloat16.

    Args:
        src: Source tensor with Float32 element type
        dst_or_dtype: Either a destination tensor or a dtype (Float16/BFloat16)

    Returns:
        None if dst is a tensor, or a new tensor if dtype is provided
    z#dst and src must have the same sizerp   r   z(src must have an even number of elementszdst must be BFloat16 or Float16zsrc must be Float32rE   N)r	   r~   r   r4   r   r   r)  r   rm   r   r#  r$  r   r  r  r   r%  )r&  dst_or_dtyper   r'  dst_i32rV   r/   r/   r0   r)  ]  s"   
  (poly.c                C   s@   t |d }|| }t|d ddD ]
}||  ||  }q|S NrE   )r   r   r   )rO   r-  r   r   degoutrV   r/   r/   r0   evaluate_polynomial}  s
   r2  yc                C   sT   t |d }|| || f}t|d ddD ]}t|| |f|| || f}q|S r.  )r   r   r   fma_packed_f32x2)rO   r3  r-  r   r   r0  r1  rV   r/   r/   r0   evaluate_polynomial_2  s
   r5  c                C   F   t tjt t| j||dt|j||dgddddtjjdS )Nr   zadd.rm.ftz.f32 $0, $1, $2;=f,f,fFr   	r   r   r   r   r
   r   r   r   r   )rO   r3  r   r   r/   r/   r0   add_round_down  s   "r9  	x_roundedfrac_ex2c                C   r6  )Nr   z{
	.reg .s32 x_rounded_i, frac_ex_i, x_rounded_e, out_i;
	mov.b32 x_rounded_i, $1;
	mov.b32 frac_ex_i, $2;
	shl.b32 x_rounded_e, x_rounded_i, 23;
	add.s32 out_i, x_rounded_e, frac_ex_i;
	mov.b32 $0, out_i;
	}
r7  Fr   r8  )r:  r;  r   r   r/   r/   r0   combine_int_frac_ex2  s   
r<  c          
      C   sZ   d}t d}tj| d}t||||d}|| }|| }t||||d}	t||	||dS )Nr2   g    >?g    ?g   ?        _r   )floatr4   rl   r   r9  r2  r<  )
rO   r   r   poly_ex2_deg3fp32_round_int	x_clampedr:  x_rounded_backx_frac
x_frac_ex2r/   r/   r0   ex2_emulation  s   rG  c                C   s   d}t d}tj| dtj|df}tjj|||ftjjd}t|||f}t||}	t	g |	|R ||d}
t
|d |
d ||d}t
|d |
d ||d}||fS )Nr=  r>  r?  r   r   r   rE   )r@  r4   rl   r   r   r   RoundingModeKindRMsub_packed_f32x2r5  r<  )rO   r3  r   r   rA  rB  
xy_clamped
xy_roundedxy_rounded_backxy_fracxy_frac_ex2x_outy_outr/   r/   r0   ex2_emulation_2  s   
rR  c             	   C   s   t jt jt t gt| j||dt|||d gddddt jj	d}tt j
t |dg||d}tt j
t |dg||d}||fS )Nr   a>  {
	.reg .f32 f1, f2, f3, f4, f5, f6, f7;
	.reg .b64 l1, l2, l3, l4, l5, l6, l7, l8, l9, l10;
	.reg .s32 r1, r2, r3, r4, r5, r6, r7, r8;
	max.ftz.f32 f1, $2, 0fC2FE0000;
	max.ftz.f32 f2, $3, 0fC2FE0000;
	mov.b64 l1, {f1, f2};
	mov.f32 f3, 0f4B400000;
	mov.b64 l2, {f3, f3};
	add.rm.ftz.f32x2 l7, l1, l2;
	sub.rn.ftz.f32x2 l8, l7, l2;
	sub.rn.ftz.f32x2 l9, l1, l8;
	mov.f32 f7, 0f3D9DF09D;
	mov.b64 l6, {f7, f7};
	mov.f32 f6, 0f3E6906A4;
	mov.b64 l5, {f6, f6};
	mov.f32 f5, 0f3F31F519;
	mov.b64 l4, {f5, f5};
	mov.f32 f4, 0f3F800000;
	mov.b64 l3, {f4, f4};
	fma.rn.ftz.f32x2 l10, l9, l6, l5;
	fma.rn.ftz.f32x2 l10, l10, l9, l4;
	fma.rn.ftz.f32x2 l10, l10, l9, l3;
	mov.b64 {r1, r2}, l7;
	mov.b64 {r3, r4}, l10;
	shl.b32 r5, r1, 23;
	add.s32 r7, r5, r3;
	shl.b32 r6, r2, 23;
	add.s32 r8, r6, r4;
	mov.b32 $0, r7;
	mov.b32 $1, r8;
	}
z	=r,=r,f,fFr   r   rE   )r   r   
StructTypeget_literalr
   r   r   r   r   r   extractvalue)rO   r3  r   r   	out_f32x2out0out1r/   r/   r0   e2e_asm2  s   " (rY  tensorc                C   sD   t |jtjs	J tj|jt||  |j|jj	d}t
||jS )NrF   )r~   r   r4   Pointerr   rm   r   r   r   rP   r   r   )r   rZ  r   r   new_ptrr/   r/   r0   domain_offset_aligned  s   r]  c                C   s   t dd t| D }t|j}t|t|ksJ dtdd t||D }t|j	tj
s3J tj|j|j	 ||jj d  |j|j	jd}t||jS )Nc                 s   r   r\   r   r   r/   r/   r0   r   -  r   z$domain_offset_i64.<locals>.<genexpr>r   c                 s   r   r\   r/   r   r/   r/   r0   r   2  r   r   rF   )r   r4   r   r   r   r   r   r   r~   r   r[  r   rm   r   rt   r   max_alignmentr   r   )r   rZ  r   r   r   r   r}   r\  r/   r/   r0   domain_offset_i64+  s   r_  r+   dimc                C   s   t |t| j|  }t| jtjsJ tj| j	| j
 || j	j d  | j| jjd}t| jg d g| dd gt| | d  R }t||S )Nr   rF   r   rE   )r   r   r4   r   r   r~   r   r[  r   rm   r   rt   r   r^  slice_r   r   r   )rZ  r+   r`  r   r   r}   r\  
new_layoutr/   r/   r0   coord_offset_i64>  s   .rc  c                 C   s   t d|}| |d< | S )zBConvert a scalar to a cute TensorSSA of shape (1,) and given dtyperE   r   )r4   r   r   )r   r   vecr/   r/   r0   scalar_to_ssaQ  s   re  c                 C   s   | d S )z2Could inline but nice for reflecting the above apir   r/   )r{   r/   r/   r0   ssa_to_scalarY  s   rf  )T)rD   rE   )rD   NN)Fr\   )Nr   )er5   r!   r   r   typingr   r   r   r   r   	functoolsr   r   cutlass.cuter4   r   r	   cutlass.cutlass_dslr
   r   cutlass._mlir.dialectsr   r   cutlass.cute.runtimer   rl   r4  rH  RNmul_packed_f32x2r   calc_packed_f32x2_oprJ  r   r1   rC   TensorrQ   rW   CopyAtomTiledMma	Constexprbool	TiledCopyr^   r]   coreThrMmarh   rd   r   Numericrz   rA   r  r   r   Layoutr   r   r   r   listr   r   r[  Swizzler   r   r@  r   r   r   r   r   r   Coordr   r   r  r  r  Intr  r  r  r  r%  r)  r2  r5  r9  r<  rG  rR  rY  r]  r_  rc  re  rf  r/   r/   r/   r0   <module>   s  *











'	   

2
(&&&&$"*


( ((.&