o
    ijP                     @   s  d dl Z d dlmZmZmZmZ d dlZd dlmZ d dlm	Z	m
Z
 d dlmZmZ d dlmZmZmZmZ d dlmZ dfd	ejfd
dZ	dgdejdejdeje d	ejfddZ	dgdejdejdeje d	ejfddZ	dgdejdejjdeje d	ejfddZ 	dgdejdejjdeje d	ejfddZ!deje" deej# d	ejfddZ$ej%ej&j'fdej(ej#B ded eje" d	ej(ej#B fd!d"Z)d#ej*d	ej*fd$d%Z+d&ejd	ejfd'd(Z,ej%d#ej*d	ej*fd)d*Z-d+ejd	ejfd,d-Z.eddd.d+e/e	B d	e	fd/d0Z0ej%d1ej(e	B d	ej(e	B fd2d3Z1eddd.d+e/e	B d	e	fd4d5Z2e	dhddd.d+e/e	B d6e/e	B d7e/e	B dB d	e	fd8d9Z3ej%	:did1ej(d;e/e	B dB deje" d	e	fd<d=Z4ej%	:did1ej(d;e/e	B dB deje" d	e	fd>d?Z5eddd.d+e/e	B d@ej6d	dfdAdBZ7eddd.d1ejdCej8d	ej6fdDdEZ9ej%dFejdGej
d	ejfdHdIZ:e	dgddd.dJej6dKed	dfdLdMZ;djdOed	ej
fdPdQZ<ej%ej&j'fdRej#dSejj=d eje" d	ej#fdTdUZ>eddd.dej?dVej?d	ej?fdWdXZ@ej%dhdej
dYeej
 d	ej
fdZd[ZAeddd.d+e/e	B d6e/e	B d\ed	ej
fd]d^ZBej%d_ejd`ejfdadbZCeddd.d1e	dce	d	ee	e	f fdddeZDdS )k    N)TypeCallableOptionalTuple)Float32Int32)Tdsl_user_op)nvvmllvmarithvector)from_dlpack      returnc                 C   s$   t | |dj|dj||  |dS )N)assumed_align)leading_dim)modestride_orderdivisibility)r   mark_layout_dynamicmark_compact_shape_dynamic	dim_order)xr   	alignmentr    r   K/home/ubuntu/vllm_env/lib/python3.10/site-packages/flash_attn/cute/utils.pyconvert_from_dlpack   s   

r   F	copy_atom	tiled_mmaswapABc                 C   "   t |rt| |S t| |S N)cutlass
const_exprcutemake_tiled_copy_Bmake_tiled_copy_Ar   r    r!   r   r   r   r(         
r(   c                 C   r"   r#   )r$   r%   r&   r(   r'   r)   r   r   r   r'   "   r*   r'   smemthr_mmac                 C   $   t |r
t| |S ||| S r#   )r$   r%   mma_make_fragment_Bmake_fragment_Apartition_Ar+   r,   r!   r   r   r   mma_make_fragment_A+      

r2   c                 C   r-   r#   )r$   r%   r2   make_fragment_Bpartition_Br1   r   r   r   r.   4   r3   r.   archelement_typec                 C   sD   t | dk rtjtj |d|j dS ttjjjddd|S )NZ      )num_bits_per_copyF   )	transposenum_matrices)	r$   r%   r&   make_copy_atomnvgpuCopyUniversalOpwidthwarpStMatrix8x8x16bOp)r6   r7   r   r   r   get_smem_store_atom=   s   rD   valoprA   c                 C   s   t t| tjr0t| j| j}||  t 	t
| jD ]}t|| ||||< q| S t 	tt|D ]}|| tjj| d|> d} q:| S )Nr   )offset)r$   r%   
isinstancer&   	TensorSSAmake_fragmentshapedtypestorerange_constexprsizewarp_reduceloadintmathlog2r6   shuffle_sync_bfly)rE   rF   rA   resir   r   r   rP   M   s   
rP   
acc_layoutc                 C   s   t | j}t j|jd d |jd f|jd d g|jd dd |jd R g|jdd R |jd d |jd f|jd d g|jd dd |jd R g|jdd R d}t | |S )z
    For Sm80, convert ((2, 2), MMA_M, MMA_N, ...) to ((2, MMA_M), (2, MMA_N), ...).
    For Sm90, convert ((2, 2, V), MMA_M, MMA_N, ...) to ((2, MMA_M), (2, V, MMA_N), ...).
    r   r   r9   N   stride)r&   make_layoutrK   r[   composition)rX   acc_layout_col_majoracc_layout_mnr   r   r   convert_layout_acc_mn_   s0   
r`   accc                 C   s   t | jt| jS r#   )r&   make_tensoriteratorr`   layout)ra   r   r   r   make_acc_tensor_mn_view|   s   re   c                 C   sB  t t| jd dkrht| d}tj|jd d |jd d |jd d d f|jd |jd d d |jd ff|jd d |jd d |jd d d f|jd |jd d d |jd ffd}|S t| d}tj|jd |jd d f|jd |jd d f|jd |jd d f|jd |jd d fd}|S )Nr   rY   )NNr9   NNr   r9   rZ   rf   )r$   r%   r&   rankrK   logical_divider\   r[   )rX   lrA_mma_viewr   r   r   convert_layout_acc_frgA   s6   **rk   ac                 C   sT   | j d | j d g| j dd R }ddgtdt| R }t| tj||dS )z7Transpose the first two dimensions of a tensor on smem.r   r   r9   N)order)rK   ranger&   rg   r]   make_ordered_layout)rl   rK   rm   r   r   r   transpose_view   s   $rp   locipc             
   C   4   t tjt t | j||dgddddtjjdS )Nrq   zex2.approx.ftz.f32 $0, $1;=f,fFhas_side_effectsis_align_stackasm_dialectr   r   
inline_asmr   f32ir_value
AsmDialectAD_ATTrl   rr   rs   r   r   r   	exp2f_asm      r   r   c                 C   sj   t t| tjr/t| jt}||  t 	t
| jD ]}tj|| ||< q| S tj| S )zexp2f calculation for both vector and scalar.
    :param x: input value
    :type x: cute.TensorSSA or Float32
    :return: exp2 value
    :rtype: cute.TensorSSA or Float32
    )r$   r%   rH   r&   rI   rJ   rK   r   rM   rN   rO   r6   exp2rQ   )r   rV   rW   r   r   r   exp2f   s   
r   c             
   C   rt   )Nrq   zlg2.approx.ftz.f32 $0, $1;ru   Frv   rz   r   r   r   r   log2f   r   r   bcc             	   C   sT   t tjt t | j||dt |j||d|d ur#t |j||dnd ||dS )Nrq   )r   rr   rs   )r   r
   fmaxr   r|   r}   )rl   r   r   rr   rs   r   r   r   r      s   r   P   init_valc                 C   s  t |dk pt| jd dkrt| jt}||  |d |d |d |d g}t dt| jdD ]6}t	|d ||d  |d< t	|d ||d  |d< t	|d ||d  |d< t	|d ||d  |d< q6t	|d |d |d< t	|d |d |d< t	|d |d |d< t |d u r|d S t	|d |S t| jt}||  t |d urt	||d |d nt	|d |d t	|d |d t	|d |d t	|d	 |d
 g}t dt| jdD ]H}t	|d || ||d  |d< t	|d ||d  ||d  |d< t	|d ||d  ||d  |d< t	|d ||d	  ||d
  |d< qt	|d |d |d< t	|d |d |d S )Nd      r   r   r9   rY   r;            )
r$   r%   r&   rO   rK   rJ   r   rM   rN   r   )r   r   r6   rV   	local_maxrW   r   r   r   fmax_reduce   s:   "
$
 $$&r   c                 C   s  t |dk pt| jd dkr$t |d u rtj}| tjj	|dS t
| jt}||  t |d urFtj|df|d |d fn|d |d f}||d |d f|d |d	 f|d
 |d fg}t dt| jdD ]V}tj|d ||d  ||d  f|d< tj|d ||d  ||d  f|d< tj|d ||d  ||d	  f|d< tj|d ||d
  ||d  f|d< qqtj|d |d |d< tj|d |d |d< tj|d |d |d< |d d |d d  S )Nr   r   r   g        r   r9   rY   r;   r   r   r   )r$   r%   r&   rO   rK   r   zeroreduceReductionOpADDrJ   rM   r6   add_packed_f32x2rN   )r   r   r6   rV   local_sum_0	local_sumrW   r   r   r   fadd_reduce  s(   "
 0***,r   gmem_ptrc                C   s(   t jt t jj|jt|  d d S )N)rV   rF   ptrrl   )	r
   	atomicrmwr   r|   AtomicOpKindFADDllvm_ptrr   r}   )rl   r   rr   rs   r   r   r   atomic_add_fp32H  s   
r   coordc                C   s   | j tj|| j||d S )Nrq   )rc   r&   crd2idxrd   )r   r   rr   rs   r   r   r   elem_pointer^  s   r   tAcAlimitc              	   C   s   t t jt j| ddgdt j| dgdt j| dgdft j| dgdddfdtj}t|jd D ]!}t|jd D ]}t | d|fd|f d |||d|f< q=q3|S )Nr   r   )r   r9   rZ   )	r&   rJ   r\   rO   r$   BooleanrN   rK   	elem_less)r   r   tApArest_vrest_kr   r   r   predicate_kc  s   .*r   mbar_ptrnoincc                C   s   t j| j|||d d S )N)r   rr   rs   )r
   cp_async_mbarrier_arrive_sharedr   )r   r   rr   rs   r   r   r   r   s  s   
r   Tsyncc                 C   s,   t j d d }t| rt j|}|S )Nr      )r&   r6   
thread_idxr$   r%   make_warp_uniform)r   warp_group_idxr   r   r   canonical_warp_group_idx  s   
r   valuerG   c           	      C   s   | j d dksJ dtjj| }tjjd }|d> |B }tdt| }| |d< t|tj}t	t
|D ]}tjj|| ||d||< q8|d S )N    r   z(value type must be a multiple of 32 bitsr   r   )mask_and_clamp)rA   r&   r6   	WARP_SIZErJ   typerecast_tensorr$   r   rN   rO   shuffle_sync)	r   rG   rA   maskclampr   rE   val_i32rW   r   r   r   r     s   r   shiftc                C   sJ   t tjt t | j||dt |j||dgddddtjjdS )Nrq   zshr.s32 $0, $1, $2;z=r,r,rFrv   )	r$   Uint32r   r{   r   i32r}   r~   r   )rE   r   rr   rs   r   r   r   shr_u32  s   &r   lanec                 C   sd   t |d u rtj }t tttjj	D ]}d|> }tjj
| |dd}||kr/| |7 } q| S )Nr   r   )rG   r   )r$   r%   r&   r6   lane_idxrN   rR   rS   rT   r   shuffle_sync_up)rE   r   rW   rG   partial_sumr   r   r   warp_prefix_sum  s   
r   to_dtypec                C   st   |t jt jfv sJ dt tjt t| j	||dt|j	||dgd|t ju r,dnd ddddtj
jd	S )
Nz$to_dtype must be BFloat16 or Float16rq   zcvt.rn.bf16x2f16x2z.f32 $0, $2, $1;z=r,f,fFrv   )r$   BFloat16Float16r   r   r{   r   r   r   r}   r~   r   )rl   r   r   rr   rs   r   r   r   cvt_f16x2_f32  s   "r   srcdstc                 C   s   t |jt | jksJ dt | jd dksJ d|jtjtjfv s+J d| jtu s4J dt |tj	}t |jd t | jksKJ t
t |D ]}t| d|  | d| d  |j||< qSd S )Nz#dst and src must have the same sizer9   r   z(src must have an even number of elementszdst must be BFloat16 or Float16zsrc must be Float32r   )r&   rO   rK   r7   r$   r   r   r   r   r   rN   r   )r   r   dst_i32rW   r   r   r   cvt_f16  s     (r   yc             	   C   s   t jt jt t gt| j||dt|||d gddddt jj	d}tt j
t |dg||d}tt j
t |dg||d}||fS )Nrq   a>  {
	.reg .f32 f1, f2, f3, f4, f5, f6, f7;
	.reg .b64 l1, l2, l3, l4, l5, l6, l7, l8, l9, l10;
	.reg .s32 r1, r2, r3, r4, r5, r6, r7, r8;
	max.ftz.f32 f1, $2, 0fC2FE0000;
	max.ftz.f32 f2, $3, 0fC2FE0000;
	mov.b64 l1, {f1, f2};
	mov.f32 f3, 0f4B400000;
	mov.b64 l2, {f3, f3};
	add.rm.ftz.f32x2 l7, l1, l2;
	sub.rn.ftz.f32x2 l8, l7, l2;
	sub.rn.ftz.f32x2 l9, l1, l8;
	mov.f32 f7, 0f3D9DF09D;
	mov.b64 l6, {f7, f7};
	mov.f32 f6, 0f3E6906A4;
	mov.b64 l5, {f6, f6};
	mov.f32 f5, 0f3F31F519;
	mov.b64 l4, {f5, f5};
	mov.f32 f4, 0f3F800000;
	mov.b64 l3, {f4, f4};
	fma.rn.ftz.f32x2 l10, l9, l6, l5;
	fma.rn.ftz.f32x2 l10, l10, l9, l4;
	fma.rn.ftz.f32x2 l10, l10, l9, l3;
	mov.b64 {r1, r2}, l7;
	mov.b64 {r3, r4}, l10;
	shl.b32 r5, r1, 23;
	add.s32 r7, r5, r3;
	shl.b32 r6, r2, 23;
	add.s32 r8, r6, r4;
	mov.b32 $0, r7;
	mov.b32 $1, r8;
	}
z	=r,=r,f,fFrv   r   r   )r   r{   
StructTypeget_literalr   r|   r   r}   r~   r   extractvalue)r   r   rr   rs   	out_f32x2out0out1r   r   r   e2e_asm2  s   " (r   )r   r   )Fr#   )Nr   )T)ErS   typingr   r   r   r   r$   cutlass.cuter&   r   r   cutlass.cutlass_dslr   r	   cutlass._mlir.dialectsr
   r   r   r   cutlass.cute.runtimer   Tensorr   CopyAtomTiledMma	Constexprbool	TiledCopyr(   r'   coreThrMmar2   r.   rR   NumericrD   jitr6   r   rI   rP   Layoutr`   re   rk   rp   floatr   r   r   r   r   r   Pointerr   Coordr   r   r   r   Intr   r   r   r   r   r   r   r   r   r   r   <module>   s0  






	


'   

/
(&&&$.,