o
    پi~                     @   s  d dl Z d dlZd dlZd dlZd dlmZmZmZmZm	Z	 d dl
mZ d dlZd dlmZ d dlmZmZ d dlmZmZ d dlmZmZ d dlmZ eejjejjdZeejjejjdZeejjejjdZeejjdej ejjd	Z d
ede!fddZ"dd Z#ddej$fddZ%	ddej$fddZ&	ddej'dej(dej)e* dej+fddZ,	ddej'dej(dej)e* dej+fddZ-	ddej$dej.j/dej)e* dej$fd d!Z0	ddej$dej.j/dej)e* dej$fd"d#Z1	dd$ej)e2 d%eej3 d&e*dej'fd'd(Z4ej5ejj6fd)ej7ej3B d*ed+ej)e2 dej7ej3B fd,d-Z8dd.ej9d&e*dej9fd/d0Z:dd1ej$d&e*dej$fd2d3Z;ej5d.ej9dej9fd4d5Z<d1ej$dej$fd6d7Z=d8ej$d9e>e2 dej$fd:d;Z?d8ej$dej$fd<d=Z@d>ejAdejBfd?d@ZCej5dAej7eB dej7eB fdBdCZDedddDd8eEeB defdEdFZFedddDd8eEeB defdGdHZGe	ddddDd8eEeB dIeEeB dJeEeB dB defdKdLZHej5	MddAej7dNeEeB dB d$ej)e2 defdOdPZIej5	MddAej7dNeEeB dB d$ej)e2 defdQdRZJedddDd8eEeB dSejAddfdTdUZKedddDdAej$dVejLdejAfdWdXZMedddDdAej$dVejLdejAfdYdZZNej5d[ej$d\ejOdej$fd]d^ZPdd`e*dejOfdadbZQej5ejj6fdcej3ddejjRd+ej)e2 dej3fdedfZSedddDd)ejTdgejTdejTfdhdiZUej5dd)ejOdjeejO dejOfdkdlZVedddDd8eEeB dIeEeB dmedejOfdndoZWe	dpej$dqej$ddfdrdsZXe	dpej$dteej3 dej$fdudsZXej5dpej$fdvdsZXeej5dddDdAedweedxf defdydzZYeej5dddDdAed{edweedxf deeef fd|d}ZZedddDdAeEeB d{eEeB defd~dZ[edddDdededefddZ\edddDdAedefddZ]edddDdAed{edeeef fddZ^edddDdAed{edeeef fddZ_edddDdVejLdej$dej$fddZ`edddDdVejLdej$dej$fddZaedddDdej$dejjRde2dej$fddZbej5d8ej3dej7fddZcdd ZddS )    N)TypeCallableOptionalTupleoverload)partial)Float32
const_expr)Tdsl_user_op)nvvmllvm)from_dlpackrnd)src_c	calc_funcr   funcreturnc              	   C   s   t | dr| jS t | dr| j}t |dr|jS |} z	t|  }W n  ttfyC   t | dr;| jdur;| jj	}nt
|  }Y nw t|}t | dri| jdurit| jD ]\}}|j}|t
|  qX| S )a;  Hash a callable based on the source code or bytecode and closure values.

    Fast-path: if the callable (or its __wrapped__ base) has a ``__cute_hash__``
    attribute, that value is returned immediately. Code-generation backends such
    as Inductor can set this attribute to avoid expensive runtime hashing.
    __cute_hash____wrapped____code__N__closure__)hasattrr   r   inspect	getsourceencodeOSError	TypeErrorr   co_codereprhashlibsha256r   	enumeratecell_contentsupdate	hexdigest)r   	base_funcdatahasheridxcell
cell_value r-   P/home/ubuntu/.local/lib/python3.10/site-packages/flash_attn_origin/cute/utils.pyhash_callable   s*   




r/   c                    s   d|   t j fdd}|S )N      ?c                    s   |   }|t jj|dd S )NT)fastmath)cutemathtanh)	acc_S_SSA	batch_idxhead_idxq_idxkv_idxaux_tensorsscoresinv_softcapr-   r.   scoremod_premask_fnE   s   z4create_softcap_scoremod.<locals>.scoremod_premask_fn)r2   jit)softcap_valr>   r-   r<   r.   create_softcap_scoremodB   s   rA         c                 C   s$   t | |dj|dj||  |dS )Nassumed_align)leading_dim)modestride_orderdivisibility)r   mark_layout_dynamicmark_compact_shape_dynamic	dim_order)xrF   	alignmentrI   r-   r-   r.   convert_from_dlpackM   s   

rO   c                 C   sV   |d u r|   }t| |d}t| jD ]}||kr(|d u s!||vr(|j||d}q|S )NrD   )rG   rH   )rL   r   rangendimrK   )rM   rF   rN   static_modesrH   x_ir-   r-   r.   "convert_from_dlpack_leading_staticW   s   rU   F	copy_atom	tiled_mmaswapABc                 C       t |r
t| |S t| |S N)r	   r2   make_tiled_copy_Bmake_tiled_copy_ArV   rW   rX   r-   r-   r.   r\   c      r\   c                 C   rY   rZ   )r	   r2   r\   r[   r]   r-   r-   r.   r[   l   r^   r[   smemthr_mmac                 C   "   t |r	t| |S ||| S rZ   )r	   mma_make_fragment_Bmake_fragment_Apartition_Ar_   r`   rX   r-   r-   r.   mma_make_fragment_Au      
rf   c                 C   ra   rZ   )r	   rf   make_fragment_Bpartition_Bre   r-   r-   r.   rb   ~   rg   rb   archelement_type	transposec                 C   sL   t | dk p	|jdkrtjtj |d|j dS ttjjj|dd|S )NZ   rB      )num_bits_per_copy   )rl   num_matrices)r	   widthr2   make_copy_atomnvgpuCopyUniversalOpwarpStMatrix8x8x16bOp)rj   rk   rl   r-   r-   r.   get_smem_store_atom   s   rx   valoprr   c                 C   s   t t| tjr/t| j| j}||  t	t
| jD ]}t|| ||||< q| S t	tt|D ]}|| tjj| d|> d} q9| S )NrC   )offset)r	   
isinstancer2   	TensorSSAmake_fragmentshapedtypestorecutlassrange_constexprsizewarp_reduceloadintr3   log2rj   shuffle_sync_bfly)ry   rz   rr   resrT   r-   r-   r.   r      s   
r   
acc_layoutc                 C   s  t | j}|jd d |jd f|jd d g|jd dd |jd R g|jdd R }|jd d |jd f|jd d g|jd dd |jd R g|jdd R }t|r~|d |d g|dd R }|d |d g|dd R }t j||d}t | |S )z
    For Sm80, convert ((2, 2), MMA_M, MMA_N, ...) to ((2, MMA_M), (2, MMA_N), ...).
    For Sm90, convert ((2, 2, V), MMA_M, MMA_N, ...) to ((2, MMA_M), (2, V, MMA_N), ...).
    r   rC   rn   N   stride)r2   make_layoutr   r   r	   composition)r   rl   acc_layout_col_majorr   r   acc_layout_mnr-   r-   r.   convert_layout_acc_mn   s4   
	r   accc                 C   s   t | jt| j|dS )N)rl   )r2   make_tensoriteratorr   layout)r   rl   r-   r-   r.   make_acc_tensor_mn_view      r   c                 C   s@  t t| jd dkrgt| d}tj|jd d |jd d |jd d d f|jd |jd d d |jd ff|jd d |jd d |jd d d f|jd |jd d d |jd ffd}|S t| d}tj|jd |jd d f|jd |jd d f|jd |jd d f|jd |jd d fd}|S )Nr   r   )NNrn   NNrC   rn   r   r   )r	   r2   rankr   logical_divider   r   )r   lrA_mma_viewr-   r-   r.   convert_layout_acc_frgA   s6   **r   c                 C   s   t | jt| jS rZ   )r2   r   r   r   r   )r   r-   r-   r.   make_acc_tensor_frgA_view   s   r   arG   c                 C   s   t | jt | j|S rZ   )r2   r   r   selectr   )r   rG   r-   r-   r.   r      r   r   c                 C   sT   | j d | j d g| j dd R }ddgtdt| R }t| tj||dS )z7Transpose the first two dimensions of a tensor on smem.rC   r   rn   N)order)r   rP   r2   r   r   make_ordered_layout)r   r   r   r-   r-   r.   transpose_view   s   $r   ptrc                 C   sf   t | jj}td|}|r,t|dt|dt|d}}}t|||S t	d| )ai  Extract swizzle parameters from a pointer's swizzle_type.

    The swizzle_type string has the form '!cute.swizzle<"S<b,m,s>">' where
    b, m, s are the swizzle parameters (bits, base, shift).

    Returns:
        A cute.Swizzle object constructed from the extracted parameters

    Raises:
        ValueError: If the swizzle_type string cannot be parsed
    zS<(\d+),(\d+),(\d+)>rC   rn   r   zCould not parse swizzle_type: )
strtypeswizzle_typeresearchr   groupr2   make_swizzle
ValueError)r   swizzle_strmatchbmsr-   r-   r.   parse_swizzle_from_pointer  s   .r   rM   c                 C   sh   t t| tjr.t| jt}||  t	t
| jD ]}tj|| ||< q| S tj| S )zexp2f calculation for both vector and scalar.
    :param x: input value
    :type x: cute.TensorSSA or Float32
    :return: exp2 value
    :rtype: cute.TensorSSA or Float32
    )r	   r|   r2   r}   r~   r   r   r   r   r   r   rj   exp2r   )rM   r   rT   r-   r-   r.   exp2f  s   
r   locipc             
   C   s4   t tjt t | j||dgddddtjjdS )Nr   zlg2.approx.ftz.f32 $0, $1;z=f,fFhas_side_effectsis_align_stackasm_dialect)r   r   
inline_asmr
   f32ir_value
AsmDialectAD_ATTr   r   r   r-   r-   r.   log2f/  s   r   c                C   s   t | ||dtd S )Nr   g       @)r   r3   logr   r-   r-   r.   logf>  s   r   r   cc             	   C   sT   t tjt t | j||dt |j||d|d ur#t |j||dnd ||dS )Nr   )r   r   r   )r   r   fmaxr
   r   r   )r   r   r   r   r   r-   r-   r.   r   C  s   r   P   init_valc                 C   s  t |dk pt| jd dkrt| jt}||  |d |d |d |d g}tdt| jdD ]6}t	|d ||d  |d< t	|d ||d  |d< t	|d ||d  |d< t	|d ||d  |d< q5t	|d |d |d< t	|d |d |d< t	|d |d |d< t |d u r|d S t	|d |S t| jt}||  t |d urt	||d |d nt	|d |d }|t	|d |d t	|d |d t	|d	 |d
 g}tdt| jdD ]H}t	|d || ||d  |d< t	|d ||d  ||d  |d< t	|d ||d  ||d  |d< t	|d ||d	  ||d
  |d< qt	|d |d |d< t	|d |d |d S )Nd      r   rC   rn   r   rp            )
r	   r2   r   r   r~   r   r   r   r   r   )rM   r   rj   r   	local_maxrT   local_max_0r-   r-   r.   fmax_reduceS  s>    
"

 $$&r   c                 C   s  t |dk pt| jd dkr"t |d u rtj}| tjj|dS t	| jt}|
|  t |d urAt|df|d |d fn|d |d f}||d |d f|d |d	 f|d
 |d fg}tdt| jdD ]N}t|d ||d  ||d  f|d< t|d ||d  ||d  f|d< t|d ||d  ||d	  f|d< t|d ||d
  ||d  f|d< qlt|d |d |d< t|d |d |d< t|d |d |d< |d d |d d  S )Nr   r   r   g        rC   rn   r   rp   r   r   r   )r	   r2   r   r   r   zeroreduceReductionOpADDr~   r   add_packed_f32x2r   r   )rM   r   rj   r   local_sum_0	local_sumrT   r-   r-   r.   fadd_reduce  s(    

0&&&(r   gmem_ptrc                C   s(   t jt t jj|jt|  d d S )N)r   rz   r   r   )	r   	atomicrmwr
   r   AtomicOpKindFADDllvm_ptrr   r   )r   r   r   r   r-   r-   r.   atomic_add_fp32  s   
r   coordc                C   s   | j tj|| j||d S )Nr   )r   r2   crd2idxr   )rM   r   r   r   r-   r-   r.   elem_pointer  s   r   c                C   s   t dd t|D }t| j}t|t|ksJ dtdd t||D }|| jj	 d }tj
| j| j | | j| jjdS )Nc                 s       | ]}t |V  qd S rZ   r   Int64.0r   r-   r-   r.   	<genexpr>      z#elem_pointer_i64.<locals>.<genexpr>/Coordinate and stride must have the same lengthc                 s       | ]	\}}|| V  qd S rZ   r-   r   r   r   r-   r-   r.   r         r   rD   )tupler2   flattenflatten_to_tupler   lensumziprk   rr   make_ptrr   tointmemspacerN   )rM   r   r   r   flat_coord_i64flat_strider{   byte_offsetr-   r-   r.   elem_pointer_i64  s   r   tAcAlimitc              	   C   s   t t jt j| ddgdt j| dgdt j| dgdft j| dgdddfdtj}t|jd D ]!}t|jd D ]}t | d|fd|f d |||d|f< q=q3|S )Nr   rC   )rG   rn   r   )	r2   r~   r   r   r   Booleanr   r   	elem_less)r   r   tApArest_vrest_kr-   r-   r.   predicate_k  s   .*r  Tsyncc                 C   s*   t j d d }t| rt j|}|S )Nr      )r2   rj   
thread_idxr	   make_warp_uniform)r  warp_group_idxr-   r-   r.   canonical_warp_group_idx  s   r	  valuer{   c           	      C   s   | j d dksJ dtjj| }tjjd }|d> |B }ttjdddt| }| |d< t|tj	}t
t|D ]}tjj|| ||d||< q=|d S )	N    r   z(value type must be a multiple of 32 bitsrC   r   )rC   r   )mask_and_clamp)rr   r2   rj   	WARP_SIZEmake_rmem_tensorr   r   recast_tensorr   Int32r   r   shuffle_sync)	r
  r{   rr   maskclampr  ry   val_i32rT   r-   r-   r.   r    s   r  shiftc                C   sJ   t tjt t | j||dt |j||dgddddtjjdS )Nr   zshr.s32 $0, $1, $2;z=r,r,rFr   )	r   Uint32r   r   r
   i32r   r   r   )ry   r  r   r   r-   r-   r.   shr_u32  s   r  lanec                 C   sb   t |d u rtj }ttttjj	D ]}d|> }tjj
| |dd}||kr.| |7 } q| S )NrC   r   )r{   r  )r	   r2   rj   lane_idxr   r   r   r3   r   r  shuffle_sync_up)ry   r  rT   r{   partial_sumr-   r-   r.   warp_prefix_sum-  s   
r  to_dtypec                C   st   |t jt jfv sJ dt tjt t| j	||dt|j	||dgd|t ju r,dnd ddddtj
jd	S )
Nz$to_dtype must be BFloat16 or Float16r   zcvt.rn.bf16x2f16x2z.f32 $0, $2, $1;z=r,f,fFr   )r   BFloat16Float16r  r   r   r
   r  r   r   r   r   )r   r   r  r   r   r-   r-   r.   cvt_f16x2_f32<  s   "r#  srcdstc                 C      d S rZ   r-   )r$  r%  r-   r-   r.   cvt_f16N     r'  r   c                 C   r&  rZ   r-   )r$  r   r-   r-   r.   r'  R  r(  c                 C   s  t t|tr|}t| j|}t| | |S |}t|jt| jks)J dt| jd dks7J d|jt	j
t	jfv sDJ d| jtu sMJ dt|t	j}t|jd t| jksdJ t	t|D ]}t| d|  | d| d  |j||< qldS )	a  Convert Float32 tensor to Float16/BFloat16.

    Args:
        src: Source tensor with Float32 element type
        dst_or_dtype: Either a destination tensor or a dtype (Float16/BFloat16)

    Returns:
        None if dst is a tensor, or a new tensor if dtype is provided
    z#dst and src must have the same sizern   r   z(src must have an even number of elementszdst must be BFloat16 or Float16zsrc must be Float32rC   N)r	   r|   r   r2   r~   r   r'  r   rk   r   r!  r"  r   r  r  r   r#  )r$  dst_or_dtyper   r%  dst_i32rT   r-   r-   r.   r'  V  s"   
  (poly.c                C   s@   t |d }|| }t|d ddD ]
}||  ||  }q|S NrC   )r   r   r   )rM   r+  r   r   degoutrT   r-   r-   r.   evaluate_polynomialv  s
   r0  yc                C   sT   t |d }|| || f}t|d ddD ]}t|| |f|| || f}q|S r,  )r   r   r   fma_packed_f32x2)rM   r1  r+  r   r   r.  r/  rT   r-   r-   r.   evaluate_polynomial_2  s
   r3  c                C   F   t tjt t| j||dt|j||dgddddtjjdS )Nr   zadd.rm.ftz.f32 $0, $1, $2;=f,f,fFr   	r   r   r   r   r
   r   r   r   r   )rM   r1  r   r   r-   r-   r.   add_round_down  s   "r7  	x_roundedfrac_ex2c                C   r4  )Nr   z{
	.reg .s32 x_rounded_i, frac_ex_i, x_rounded_e, out_i;
	mov.b32 x_rounded_i, $1;
	mov.b32 frac_ex_i, $2;
	shl.b32 x_rounded_e, x_rounded_i, 23;
	add.s32 out_i, x_rounded_e, frac_ex_i;
	mov.b32 $0, out_i;
	}
r5  Fr   r6  )r8  r9  r   r   r-   r-   r.   combine_int_frac_ex2  s   
r:  c          
      C   sZ   d}t d}tj| d}t||||d}|| }|| }t||||d}	t||	||dS )Nr0   g    >?g    ?g   ?        _r   )floatr2   rj   r   r7  r0  r:  )
rM   r   r   poly_ex2_deg3fp32_round_int	x_clampedr8  x_rounded_backx_frac
x_frac_ex2r-   r-   r.   ex2_emulation  s   rE  c                C   s   d}t d}tj| dtj|df}tjj|||ftjjd}t|||f}t||}	t	g |	|R ||d}
t
|d |
d ||d}t
|d |
d ||d}||fS )Nr;  r<  r=  r   r   r   rC   )r>  r2   rj   r   r   r   RoundingModeKindRMsub_packed_f32x2r3  r:  )rM   r1  r   r   r?  r@  
xy_clamped
xy_roundedxy_rounded_backxy_fracxy_frac_ex2x_outy_outr-   r-   r.   ex2_emulation_2  s   
rP  c             	   C   s   t jt jt t gt| j||dt|||d gddddt jj	d}tt j
t |dg||d}tt j
t |dg||d}||fS )Nr   a>  {
	.reg .f32 f1, f2, f3, f4, f5, f6, f7;
	.reg .b64 l1, l2, l3, l4, l5, l6, l7, l8, l9, l10;
	.reg .s32 r1, r2, r3, r4, r5, r6, r7, r8;
	max.ftz.f32 f1, $2, 0fC2FE0000;
	max.ftz.f32 f2, $3, 0fC2FE0000;
	mov.b64 l1, {f1, f2};
	mov.f32 f3, 0f4B400000;
	mov.b64 l2, {f3, f3};
	add.rm.ftz.f32x2 l7, l1, l2;
	sub.rn.ftz.f32x2 l8, l7, l2;
	sub.rn.ftz.f32x2 l9, l1, l8;
	mov.f32 f7, 0f3D9DF09D;
	mov.b64 l6, {f7, f7};
	mov.f32 f6, 0f3E6906A4;
	mov.b64 l5, {f6, f6};
	mov.f32 f5, 0f3F31F519;
	mov.b64 l4, {f5, f5};
	mov.f32 f4, 0f3F800000;
	mov.b64 l3, {f4, f4};
	fma.rn.ftz.f32x2 l10, l9, l6, l5;
	fma.rn.ftz.f32x2 l10, l10, l9, l4;
	fma.rn.ftz.f32x2 l10, l10, l9, l3;
	mov.b64 {r1, r2}, l7;
	mov.b64 {r3, r4}, l10;
	shl.b32 r5, r1, 23;
	add.s32 r7, r5, r3;
	shl.b32 r6, r2, 23;
	add.s32 r8, r6, r4;
	mov.b32 $0, r7;
	mov.b32 $1, r8;
	}
z	=r,=r,f,fFr   r   rC   )r   r   
StructTypeget_literalr
   r   r   r   r   r   extractvalue)rM   r1  r   r   	out_f32x2out0out1r-   r-   r.   e2e_asm2  s   " (rW  tensorc                C   sD   t |jtjs	J tj|jt||  |j|jj	d}t
||jS )NrD   )r|   r   r2   Pointerr   rk   r   r   r   rN   r   r   )r   rX  r   r   new_ptrr-   r-   r.   domain_offset_aligned  s   r[  c                C   s   t dd t| D }t|j}t|t|ksJ dtdd t||D }t|j	tj
s3J tj|j|j	 ||jj d  |j|j	jd}t||jS )Nc                 s   r   rZ   r   r   r-   r-   r.   r   &  r   z$domain_offset_i64.<locals>.<genexpr>r   c                 s   r   rZ   r-   r   r-   r-   r.   r   +  r   r   rD   )r   r2   r   r   r   r   r   r   r|   r   rY  r   rk   r   rr   r   max_alignmentr   r   )r   rX  r   r   r   r   r{   rZ  r-   r-   r.   domain_offset_i64$  s   r]  r*   dimc                C   s   t |t| j|  }t| jtjsJ tj| j	| j
 || j	j d  | j| jjd}t| jg d g| dd gt| | d  R }t||S )Nr   rD   r   rC   )r   r   r2   r   r   r|   r   rY  r   rk   r   rr   r   r\  slice_r   r   r   )rX  r*   r^  r   r   r{   rZ  
new_layoutr-   r-   r.   coord_offset_i647  s   .ra  c                 C   s   t d|}| |d< | S )zBConvert a scalar to a cute TensorSSA of shape (1,) and given dtyperC   r   )r2   r~   r   )r   r   vecr-   r-   r.   scalar_to_ssaJ  s   rc  c                 C   s   | d S )z2Could inline but nice for reflecting the above apir   r-   )ry   r-   r-   r.   ssa_to_scalarR  s   rd  )rB   rC   )rB   NN)FrZ   )Nr   )T)er3   r!   r   r   typingr   r   r   r   r   	functoolsr   r   cutlass.cuter2   r   r	   cutlass.cutlass_dslr
   r   cutlass._mlir.dialectsr   r   cutlass.cute.runtimer   rj   r2  rF  RNmul_packed_f32x2r   calc_packed_f32x2_oprH  r   r/   rA   TensorrO   rU   CopyAtomTiledMma	Constexprbool	TiledCopyr\   r[   coreThrMmarf   rb   r   Numericrx   r?   r  r}   r   Layoutr   r   r   r   listr   r   rY  Swizzler   r   r>  r   r   r   r   r   r   Coordr   r   r  r  r	  Intr  r  r  r  r#  r'  r0  r3  r7  r:  rE  rP  rW  r[  r]  ra  rc  rd  r-   r-   r-   r.   <module>   s  #











'	   

2
(&&&&$"*


( ((.&