o
    iFv                  "   @   s  d dl mZmZ d dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlm  mZ ej	d*dejdejd	ejd
ejdeejB ddfddZdedeeef fddZej	d*dejjjjdejd	ejd
ejdeej dejdeej dejdeejB ddfddZej	d*dejjjjdejd	ejd
ejdeej dejdeej dejdeejB ddfddZej			d+dejjjjdeje d	ejd
ejdeej dejdeej dejdeej deej deejB ddfddZ ej	d*dejjjjdeje d	ejd
ejd ejd!eje d"ejd#ejd$eje d%ejd&eej! d'eej! deej dejdeejB ddf d(d)Z"dS ),    )OptionalTupleN)tcgen05)T)llvmF	tiled_mmaacctCrAtCrB	zero_initreturnc              
   C   s`   t t|jd D ]"}| tjj| p|dk t	| ||d d |f |d d |f | qd S )N   r   )
cutlassrange_constexprcutesizeshapesetr   Field
ACCUMULATEgemm)r   r   r	   r
   r   k r   W/home/ubuntu/vllm_env/lib/python3.10/site-packages/flash_attn/cute/blackwell_helpers.pyr      s   (r   ic                 C   s   | d@ | d? d@ fS )z;Convert a 64-bit integer to a tuple of two 32-bit integers.l        r   )r   r   r   r   i64_to_i32x2   s   r   opsAsB
sA_swizzle
sB_swizzlec	                 C   s0  | j tjjjjk}	t|	 r|d usJ d|d usJ d|d ur&|jnd }
|j}tt	
| }t|	 rptt	td| jj|
d |t| jtjjjjjkrYt	jjnt	jj}t|\}}t|}t|}nd }d\}}tt	td| jj|d |t| jtjjjjjkrt	jjnt	jj}t|\}}t|}t|}t|	 rt|t	|d jB }nd }t|t	|d jB }tt|jd D ]}t|	 r|tdd|f|
|jj d d	?  }|tdd|f||jj d d	?  }tj !  t|	 rLt"j#d |j$ % |% |% t| p-|dk% gd
t&| dt&| dt&| ddddt"j'j(d n9t"j#d |j$ % |d d |f j$ % |% t| pl|dk% gdt&| dt&| ddddt"j'j(d W d    n	1 sw   Y  qd S )N*sA must be provided when a_src is not TMEM2sA_swizzle must be provided when a_src is not TMEM   r   NNNNr   r         zX{
	.reg .pred p;
	.reg .b64 smem_desc_a, smem_desc_b;
	.reg .b32 idesc;
	mov.b32 idesc, z;
	mov.b64 smem_desc_a, {$1, z};
	mov.b64 smem_desc_b, {$2, zk};
	setp.ne.b32 p, $3, 0;
	tcgen05.mma.cta_group::1.kind::f16 [$0], smem_desc_a, smem_desc_b, idesc, p;
	}
r,r,r,rTFhas_side_effectsis_align_stackasm_dialectzD{
	.reg .pred p;
	.reg .b64 smem_desc_b;
	mov.b64 smem_desc_b, {$2, zW};
	setp.ne.b32 p, $3, 0;
	tcgen05.mma.cta_group::1.kind::f16 [$0], [$1], smem_desc_b, z, p;
	}
))a_srcr   nvgpur   OperandSourceTMEMr   
const_exprlayout
sm100_descmma_op_to_idescmake_smem_desc_baserecast_layouta_dtypewidtha_major_modemmaOperandMajorModeKMajorMNr   b_dtypeb_major_modeInt32make_smem_desc_start_addriteratorr   r   r   crd2idxelement_typearch	elect_oner   
inline_asmtointir_valuehex
AsmDialectAD_ATT)r   r   r	   r
   r   r   r    r!   r   is_ts	sA_layout	sB_layoutidescsmem_desc_base_asmem_desc_base_a_losmem_desc_a_hismem_desc_base_bsmem_desc_base_b_losmem_desc_b_hismem_desc_start_a_losmem_desc_start_b_lor   smem_desc_a_losmem_desc_b_lor   r   r   gemm_ptx   s   (
(

&&
r]   c	                    s   j tjjjjk}	t|	 rd usJ d|d usJ dd ur&jn|jjtt	
}
t|	 rqtt	tdjjd |tjtjjjjjkrZt	jjnt	jj}t|\}}t|}t|}nd }d\}}tt	tdjjd |tjtjjjjjkrt	jjnt	jj}t|\}}t|}t|}t|	 rǇfddtt|jd D  nfd	dtt|jd D   fd
dtdt|jd D fddtt|jd D fddtdt|jd D t|	 r(t|t	d jB }nd }t|t	d jB }t|tjr@dn|rEdnd}t|	 rt j!d |j" # ttj$%|# ttj$%|# t| # gdt&|
 dt&| dt&| d| d	d'fddtdt|jd D  d dddt j(j)d d S t j!d |j" # t|d j" # t|# t| # gdt&|
 d t&| d!| dd' fd"dtdt|jd D  d dddt j(j)d d S )#Nr"   r#   r$   r   r%   c                    .   g | ]}t d d |f jj d d? qS r   r'   r(   r   rE   rF   r9   .0r   )r   rP   r   r   
<listcomp>       &z!gemm_ptx_loop.<locals>.<listcomp>r   c                    *   g | ]}t d d |f jj d qS r   r   r   rE   r8   r9   ra   r   rP   r   r   rc          "c                        g | ]} |  |d    qS    r   ra   offset_ar   r   rc           rl   c                    r^   r_   r`   ra   )r   rQ   r   r   rc      rd   c                    rj   rk   r   ra   offset_br   r   rc      ro   r&   p01z{
	.reg .pred leader_thread;
	.reg .pred p;
	.reg .b32 idesc;
	.reg .b32 smem_desc_a_lo, smem_desc_b_lo;
	.reg .b32 smem_desc_a_hi, smem_desc_b_hi;
	.reg .b64 smem_desc_a, smem_desc_b;
	elect.sync _|leader_thread, -1;
	mov.b32 idesc, zU;
	mov.b32 smem_desc_a_lo, $1;
	mov.b32 smem_desc_b_lo, $2;
	mov.b32 smem_desc_a_hi, ;
	mov.b32 smem_desc_b_hi, z;
	mov.b64 smem_desc_a, {smem_desc_a_lo, smem_desc_a_hi};
	mov.b64 smem_desc_b, {smem_desc_b_lo, smem_desc_b_hi};
	setp.ne.b32 p, $3, 0;
	@leader_thread tcgen05.mma.cta_group::1.kind::f16 [$0], smem_desc_a, smem_desc_b, idesc, ;
	 c                 3   :    | ]}d t  |d   dt |d   dV  qdS )(add.u32 smem_desc_a_lo, smem_desc_a_lo, rl   +;
	add.u32 smem_desc_b_lo, smem_desc_b_lo, z;
	mov.b64 smem_desc_a, {smem_desc_a_lo, smem_desc_a_hi};
	mov.b64 smem_desc_b, {smem_desc_b_lo, smem_desc_b_hi};
	@leader_thread tcgen05.mma.cta_group::1.kind::f16 [$0], smem_desc_a, smem_desc_b, idesc, 1;
	NrL   ra   offset_a_diffoffset_b_diffr   r   	<genexpr>       
z gemm_ptx_loop.<locals>.<genexpr>}
r)   TFr*   {
	.reg .pred leader_thread;
	.reg .pred p;
	.reg .b32 idesc;
	.reg .b32 tmem_a;
	.reg .b32 smem_desc_b_lo;
	.reg .b32 smem_desc_b_hi;
	.reg .b64 smem_desc_b;
	elect.sync _|leader_thread, -1;
	mov.b32 idesc, M;
	mov.b32 tmem_a, $1;
	mov.b32 smem_desc_b_lo, $2;
	mov.b32 smem_desc_b_hi, z;
	mov.b64 smem_desc_b, {smem_desc_b_lo, smem_desc_b_hi};
	setp.ne.b32 p, $3, 0;
	@leader_thread tcgen05.mma.cta_group::1.kind::f16 [$0], [tmem_a], smem_desc_b, idesc, c                 3   6    | ]}d t |d   dt  |  dV  qdS )(add.u32 smem_desc_b_lo, smem_desc_b_lo, rl   z};
	mov.b64 smem_desc_b, {smem_desc_b_lo, smem_desc_b_hi};
	@leader_thread tcgen05.mma.cta_group::1.kind::f16 [$0], [tmem_a + ], smem_desc_b, idesc, 1;
	Nr{   ra   rn   r~   r   r   r          

)*r.   r   r/   r   r0   r1   r   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r   r@   rA   r   r   r   rB   rC   rD   
isinstanceBooleanr   rI   rJ   rK   rG   make_warp_uniformrL   joinrM   rN   )r   r   r	   r
   r   r   r    r!   r   rO   rR   rS   rT   rU   rV   rW   rX   rY   rZ   pred_strr   )	rn   r}   rq   r~   r   r   rP   r   rQ   r   gemm_ptx_loop   s   (
(

&& 
*	
r   acc_tmem_addrmbar_ptr
mbar_phasec                    s  | j tjjjjk}t| r|d usJ d|d usJ d|d ur&|jn|j}|j}tt	
| }t| rqtt	td| jj|d |t| jtjjjjjkrZt	jjnt	jj}t|\}}t|}t|}nd }d\}}tt	td| jj|d |t| jtjjjjjkrt	jjnt	jj}t|\}}t|}t|}t| r|jn	td|jj|jfddtt|jd	 D   fd
dtdt|jd	 D fddttjd	 D fddtdtjd	 D t| rt|t	|d jB }nd }t|t	|d jB }t|
tj r3dn|
r8dnd}t| r|d u sJJ dt!j"d t|# t|# t|
 # gdt$| dt$| dt$| dt$| d| dd%fddtdt|jd	 D  d dddt!j&j'd  d S t|d j( # t|# t|
 # g}t|d ur|	d usJ d!|)|( #  |)t|	#  d"}nd}t!j"d |d#t$| dt$| d$t$| d%| d	d% fd&dtdt|d u rt|jd	 nt|jd	 d' d( D  | t|d urLd% fd)dtt|jd	 d' d( t|jd	 D nd d t|d u rZdnd*ddt!j&j'd  d S )+Nr"   r#   r$   r   r%   r   c                    s   g | ]}t d d |f qS r   )r   rE   ra   )tCrA_layoutr   r   rc   :  s    z$gemm_ptx_partial.<locals>.<listcomp>r   c                    rj   rk   r   ra   rm   r   r   rc   ;  ro   rl   c                    s    g | ]}t d d |f jqS r   )r   rE   r3   ra   )r
   r   r   rc   <  ro   c                    rj   rk   r   ra   rp   r   r   rc   =  ro   r&   rr   rs   rt   z,mbar_ptr must be None when a_src is not TMEM{
	.reg .pred leader_thread;
	.reg .pred p;
	.reg .b32 idesc;
	.reg .b32 tmem_acc;
	.reg .b32 smem_desc_a_lo, smem_desc_b_lo;
	.reg .b32 smem_desc_a_hi, smem_desc_b_hi;
	.reg .b64 smem_desc_a, smem_desc_b;
	elect.sync _|leader_thread, -1;
	mov.b32 idesc, ;
	mov.b32 tmem_acc, zU;
	mov.b32 smem_desc_a_lo, $0;
	mov.b32 smem_desc_b_lo, $1;
	mov.b32 smem_desc_a_hi, ru   z;
	mov.b64 smem_desc_a, {smem_desc_a_lo, smem_desc_a_hi};
	mov.b64 smem_desc_b, {smem_desc_b_lo, smem_desc_b_hi};
	setp.ne.b32 p, $2, 0;
	@leader_thread tcgen05.mma.cta_group::1.kind::f16 [tmem_acc], smem_desc_a, smem_desc_b, idesc, rv   rw   c                 3   rx   )ry   rl   rz   z;
	mov.b64 smem_desc_a, {smem_desc_a_lo, smem_desc_a_hi};
	mov.b64 smem_desc_b, {smem_desc_b_lo, smem_desc_b_hi};
	@leader_thread tcgen05.mma.cta_group::1.kind::f16 [tmem_acc], smem_desc_a, smem_desc_b, idesc, 1;
	Nr{   ra   r|   r   r   r   b  r   z#gemm_ptx_partial.<locals>.<genexpr>r   zr,r,rTFr*   z5mbar_phase must be provided when mbar_ptr is not Nonez.reg .pred P1; 
	LAB_WAIT: 
	mbarrier.try_wait.parity.shared::cta.b64 P1, [$3], $4, 10000000; 
	@P1 bra DONE; 
	bra     LAB_WAIT; 
	DONE: 
	z{
	.reg .pred leader_thread;
	.reg .pred p;
	.reg .b32 idesc;
	.reg .b32 tmem_acc;
	.reg .b32 tmem_a;
	.reg .b32 smem_desc_b_lo;
	.reg .b32 smem_desc_b_hi;
	.reg .b64 smem_desc_b;
	elect.sync _|leader_thread, -1;
	mov.b32 idesc, zM;
	mov.b32 tmem_a, $0;
	mov.b32 smem_desc_b_lo, $1;
	mov.b32 smem_desc_b_hi, z;
	mov.b64 smem_desc_b, {smem_desc_b_lo, smem_desc_b_hi};
	setp.ne.b32 p, $2, 0;
	@leader_thread tcgen05.mma.cta_group::1.kind::f16 [tmem_acc], [tmem_a], smem_desc_b, idesc, c                 3   r   r   rl   z;
	mov.b64 smem_desc_b, {smem_desc_b_lo, smem_desc_b_hi};
	@leader_thread tcgen05.mma.cta_group::1.kind::f16 [tmem_acc], [tmem_a + r   Nr{   ra   r   r   r   r     r   r(      c                 3   r   r   r{   ra   r   r   r   r     s    

z	r,r,r,r,r)*r.   r   r/   r   r0   r1   r   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r   r@   rA   rF   ranger   r   rB   rC   rD   r   r   r   rI   rK   rL   r   rM   rN   rJ   append)r   r   r	   r
   r   r   r    r!   r   r   r   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r   
input_argsmbar_wait_strr   )rn   r}   rq   r~   r   r
   r   gemm_ptx_partial  s   (
(

&"$"$ 	

-	
<$(%'
r   sA_base_addr_for_descsA_addr_offset_for_descsA_stagesB_base_addr_for_descsB_addr_offset_for_descsB_stagerP   rQ   c                    s&  j tjjjjk}t| rd usJ d|d usJ dtt	}t| rdtt
tdjjd |tjtjjjjjkrMtjjntjj}t|\}}t|}t|}nd }d\}}tt
tdjjd |tjtjjjjjkrtjjntjj}t|\}}t|}t|}tdgd }t| rfddtt|jd	 D  nfd
dtt|jd	 D   fddtdt|jd	 D fddtt|jd	 D fddtdt|jd	 D t| rt|}nd }t|}t|tjr&dn|r+dnd}t| rtjd t| t| t| t|	 t|  |d  |d  |d	  |d  g	dt | dt | dt | dt | dt | dt | d| dd!fddtdt|jd	 D  d dd d!tj"j#d" d S tjd t|d# j$%  t| t|  |d  |d  |d	  |d  gd$t | d%t | d&| dd!fd'dtdt|jd	 D  d d(d d!tj"j#d" d S ))Nz1sA_layout must be provided when a_src is not TMEMr#   r$   r   r%   r(   c                    r^   r_   rg   ra   rh   r   r   rc     rd   z%gemm_ptx_partial1.<locals>.<listcomp>r   c                    re   rf   rg   ra   rh   r   r   rc     ri   c                    rj   rk   r   ra   rm   r   r   rc     ro   rl   c                    r^   r_   )r   rE   r@   r9   ra   )r   rQ   r   r   rc     rd   c                    rj   rk   r   ra   rp   r   r   rc     ro   rr   rs   rt   r   r   r   z";
	mad.lo.u32 smem_desc_a_lo, $1, z&, $0;
	mad.lo.u32 smem_desc_b_lo, $3, z, $2;
	mov.b32 smem_desc_a_hi, ru   z;
	mov.b64 smem_desc_a, {smem_desc_a_lo, smem_desc_a_hi};
	mov.b64 smem_desc_b, {smem_desc_b_lo, smem_desc_b_hi};
	setp.ne.b32 p, $4, 0;
	@leader_thread tcgen05.mma.cta_group::1.kind::f16 [tmem_acc], smem_desc_a, smem_desc_b, idesc, {$5, $6, $7, $8}, rv   rw   c                 3   rx   )ry   rl   rz   z;
	mov.b64 smem_desc_a, {smem_desc_a_lo, smem_desc_a_hi};
	mov.b64 smem_desc_b, {smem_desc_b_lo, smem_desc_b_hi};
	@leader_thread tcgen05.mma.cta_group::1.kind::f16 [tmem_acc], smem_desc_a, smem_desc_b, idesc, {$5, $6, $7, $8}, 1;
	Nr{   ra   r|   r   r   r   #  r   z$gemm_ptx_partial1.<locals>.<genexpr>r   zr,r,r,r,r,r,r,r,rTFr*   r&   r   r   z;
	mov.b64 smem_desc_b, {smem_desc_b_lo, smem_desc_b_hi};
	setp.ne.b32 p, $3, 0;
	@leader_thread tcgen05.mma.cta_group::1.kind::f16 [$0], [tmem_a], smem_desc_b, idesc, {$4, $5, $6, $7}, c                 3   rx   )zadd.u32 tmem_a, tmem_a, rl   rz   z;
	mov.b64 smem_desc_b, {smem_desc_b_lo, smem_desc_b_hi};
	@leader_thread tcgen05.mma.cta_group::1.kind::f16 [$0], [tmem_a], smem_desc_b, idesc, {$4, $5, $6, $7}, 1;
	Nr{   ra   r|   r   r   r   P  s    
zr,r,r,r,r,r,r,r)&r.   r   r/   r   r0   r1   r   r2   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r   r@   rA   rB   r   r   r   r   r   r   rI   rK   rL   r   rM   rN   rD   rJ   )r   r   r	   r
   r   r   r   r   r   r   rP   rQ   r    r!   r   rO   rR   rS   rT   rU   rV   rW   rX   maskrY   rZ   r   r   )rn   r}   rq   r~   r   rP   rQ   r   gemm_ptx_partial1  s   (
(

$$
 



	
 !
7




	
r   )F)NNF)#typingr   r   r   cutlass.cuter   cutlass.cute.nvgpur   cutlass.cutlass_dslr   cutlass._mlir.dialectsr   flash_attn.cute.mma_sm100_descmma_sm100_descr4   jitTiledMmaTensorboolr   r   intr   r/   r;   MmaOpSwizzler]   r   	ConstexprPointerrB   r   Layoutr   r   r   r   r   <module>   s  

	
b

	
 

	
 -
	
