o
    پiY                  "   @   s  d dl mZmZ d dlZd dlmZ d dlmZmZmZ d dl	m
Z
 d dlmZ d dlm  mZ d dlmZ ej				d3dejd	ejd
ejdejdee dee deeB deddfddZej			d4dejd	ejd
ejdejdeej dejdee dee deeB ddfddZej	d5dejd	ejd
ejdejdeeB dejfddZdedeeef fddZej	d5dejj
jjd	ejd
ejdejdeej dejdeeB ddfddZej	d5dejj
jjd	ejd
ejdejdeej dejdeeB ddfdd Zej				d6dejj
jjd!ed
ejdejdeej dejd"eej  d#ee deeB d$ee ddfd%d&Z!ej	d5dejj
jjd!ej"e d
ejdejd'ed(ej"e d)ed*ed+ej"e d,ed-eej# d.eej# d/eej$ d0ej$deeB ddf d1d2Z%dS )7    )OptionalTupleN)Int32Boolean
const_expr)tcgen05)llvm)parse_swizzle_from_pointerF	tiled_mmaacctCrAtCrBA_idxB_idx	zero_initswap_ABreturnc              
   C   s   t |rt| ||||||ddS t |d u r|n|d d d |f }t |d u r(|n|d d d |f }	t| j}
tt|jd D ]"}|
	t
jj| pN|dk t|
||d d |f |	d d |f | qAd S )NF)r   r      r   )r   
gemm_w_idxcutemake_mma_atomopcutlassrange_constexprsizeshapesetr   Field
ACCUMULATEgemm)r
   r   r   r   r   r   r   r   rArBmma_atomk r$   \/home/ubuntu/.local/lib/python3.10/site-packages/flash_attn_origin/cute/blackwell_helpers.pyr      s     (r   sAsBc	                 K   s   t |d u r|n|d d d |f }
t |d u r|n|d d d |f }d }t |d ur8t |d u r0|n|d d d |f }t |d u r@|n|d d d |f }t| j}|j }t|j||
|||fd|i|	 d S )Nr   )r   r   r   r   iteratortointgemm_ptx_partial)r
   r   r   r   r&   r'   r   r   r   kwargsr    r!   sA_cursB_curr"   acc_tmem_addrr$   r$   r%   gemm_ptx_w_idx&   s       

r/   c              
   C   s`   t t|jd D ]"}| tjj| p|dk t	| ||d d |f |d d |f | q| S )Nr   r   )
r   r   r   r   r   r   r   r   r   r   )r
   r   r   r   r   r#   r$   r$   r%   r   @   s   (r   ic                 C   s   | d@ | d? d@ fS )z;Convert a 64-bit integer to a tuple of two 32-bit integers.l        r$   )r0   r$   r$   r%   i64_to_i32x2N   s   r2   r   c                 C   s  | j tjjjjk}t| r|d usJ d|d ur|jnd }|j}	tt	| }
t| rft
|j}tttd| jj|d |t| jtjjjjjkrQtjjntjj}t|\}}t|}t|}nd }d\}}t
|j}tttd| jj|	d |t| jtjjjjjkrtjjntjj}t|\}}t|}t|}t| rt|t|d jB }nd }t|t|d jB }tt|jd D ]}t| r|tdd|f||j j d d?  }|tdd|f|	|j j d d?  }tj!" | t| r=t#j$d |j% & |& |& t| p|dk& gd	t'|
 d
t'| dt'| ddddt#j(j)d n8t#j$d |j% & |d d |f j% & |& t| p\|dk& gdt'| dt'|
 ddddt#j(j)d W d    n	1 sw   Y  qd S )N*sA must be provided when a_src is not TMEM   r   NNNNr   r         zX{
	.reg .pred p;
	.reg .b64 smem_desc_a, smem_desc_b;
	.reg .b32 idesc;
	mov.b32 idesc, z;
	mov.b64 smem_desc_a, {$1, z};
	mov.b64 smem_desc_b, {$2, zk};
	setp.ne.b32 p, $3, 0;
	tcgen05.mma.cta_group::1.kind::f16 [$0], smem_desc_a, smem_desc_b, idesc, p;
	}
r,r,r,rTFhas_side_effectsis_align_stackasm_dialectzD{
	.reg .pred p;
	.reg .b64 smem_desc_b;
	mov.b64 smem_desc_b, {$2, zW};
	setp.ne.b32 p, $3, 0;
	tcgen05.mma.cta_group::1.kind::f16 [$0], [$1], smem_desc_b, z, p;
	}
)*a_srcr   nvgpur   OperandSourceTMEMr   layout
sm100_descmma_op_to_idescr	   r(   make_smem_desc_baserecast_layouta_dtypewidtha_major_modemmaOperandMajorModeKMajorMNr2   b_dtypeb_major_moder   make_smem_desc_start_addrr   r   r   r   crd2idxelement_typearch	elect_oner   
inline_asmr)   ir_valuehex
AsmDialectAD_ATT)r   r   r   r   r&   r'   r   is_ts	sA_layout	sB_layoutidesc
sA_swizzlesmem_desc_base_asmem_desc_base_a_losmem_desc_a_hi
sB_swizzlesmem_desc_base_bsmem_desc_base_b_losmem_desc_b_hismem_desc_start_a_losmem_desc_start_b_lor#   smem_desc_a_losmem_desc_b_lor$   r$   r%   gemm_ptxS   s   




	


	



  
rk   c                    s  j tjjjjk}t| rd usJ dd urjn|jjtt	}t| rgt
j}	tttdjjd |	tjtjjjjjkrRtjjntjj}
t|
\}}t|}t|}nd }
d\}}t
j}tttdjjd |tjtjjjjjkrtjjntjj}t|\}}t|}t|}t| rfddtt|jd D  nfddtt|jd D   fd	dtd
t|jd D fddtt|jd D fddtd
t|jd D t| rt|td jB }nd }t|td jB }t|t r2dn|r7dnd}t| rt!j"d |j# $ ttj%&|$ ttj%&|$ t| $ gdt'| dt'| dt'| d| d	d(fddtd
t|jd D  d dddt!j)j*d d S t!j"d |j# $ t|d j# $ t|$ t| $ gdt'| dt'| d | dd( fd!dtd
t|jd D  d dddt!j)j*d d S )"Nr3   r4   r   r5   c                    .   g | ]}t d d |f jj d d? qS r   r7   r8   r   rR   rS   rH   .0r#   )r&   r\   r$   r%   
<listcomp>        z!gemm_ptx_loop.<locals>.<listcomp>r   c                    *   g | ]}t d d |f jj d qS r   r1   r   rR   rG   rH   ro   r   r\   r$   r%   rq          c                        g | ]} |  |d    qS    r$   ro   offset_ar$   r%   rq          rz   c                    rl   rm   rn   ro   )r'   r]   r$   r%   rq     rr   c                    rx   ry   r$   ro   offset_br$   r%   rq     r}   r6   p01z{
	.reg .pred leader_thread;
	.reg .pred p;
	.reg .b32 idesc;
	.reg .b32 smem_desc_a_lo, smem_desc_b_lo;
	.reg .b32 smem_desc_a_hi, smem_desc_b_hi;
	.reg .b64 smem_desc_a, smem_desc_b;
	elect.sync _|leader_thread, -1;
	mov.b32 idesc, zU;
	mov.b32 smem_desc_a_lo, $1;
	mov.b32 smem_desc_b_lo, $2;
	mov.b32 smem_desc_a_hi, ;
	mov.b32 smem_desc_b_hi, z;
	mov.b64 smem_desc_a, {smem_desc_a_lo, smem_desc_a_hi};
	mov.b64 smem_desc_b, {smem_desc_b_lo, smem_desc_b_hi};
	setp.ne.b32 p, $3, 0;
	@leader_thread tcgen05.mma.cta_group::1.kind::f16 [$0], smem_desc_a, smem_desc_b, idesc, ;
	 c                 3   :    | ]}d t  |d   dt |d   dV  qdS )(add.u32 smem_desc_a_lo, smem_desc_a_lo, rz   +;
	add.u32 smem_desc_b_lo, smem_desc_b_lo, z;
	mov.b64 smem_desc_a, {smem_desc_a_lo, smem_desc_a_hi};
	mov.b64 smem_desc_b, {smem_desc_b_lo, smem_desc_b_hi};
	@leader_thread tcgen05.mma.cta_group::1.kind::f16 [$0], smem_desc_a, smem_desc_b, idesc, 1;
	NrX   ro   offset_a_diffoffset_b_diffr$   r%   	<genexpr>.      
z gemm_ptx_loop.<locals>.<genexpr>}
r9   TFr:   {
	.reg .pred leader_thread;
	.reg .pred p;
	.reg .b32 idesc;
	.reg .b32 tmem_a;
	.reg .b32 smem_desc_b_lo;
	.reg .b32 smem_desc_b_hi;
	.reg .b64 smem_desc_b;
	elect.sync _|leader_thread, -1;
	mov.b32 idesc, M;
	mov.b32 tmem_a, $1;
	mov.b32 smem_desc_b_lo, $2;
	mov.b32 smem_desc_b_hi, z;
	mov.b64 smem_desc_b, {smem_desc_b_lo, smem_desc_b_hi};
	setp.ne.b32 p, $3, 0;
	@leader_thread tcgen05.mma.cta_group::1.kind::f16 [$0], [tmem_a], smem_desc_b, idesc, c                 3   6    | ]}d t |d   dt  |  dV  qdS )(add.u32 smem_desc_b_lo, smem_desc_b_lo, rz   z};
	mov.b64 smem_desc_b, {smem_desc_b_lo, smem_desc_b_hi};
	@leader_thread tcgen05.mma.cta_group::1.kind::f16 [$0], [tmem_a + ], smem_desc_b, idesc, 1;
	Nr   ro   r|   r   r$   r%   r   W  s    

)+r>   r   r?   r   r@   rA   r   rB   rC   rD   r	   r(   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   r2   rO   rP   r   r   r   r   r   rQ   
isinstancer   r   rV   r)   rW   rT   make_warp_uniformrX   joinrY   rZ   )r   r   r   r   r&   r'   r   r[   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   pred_strr$   )	r|   r   r   r   r   r&   r\   r'   r]   r%   gemm_ptx_loop   s   




	


	



*
	
r   r.   mbar_ptr
mbar_phasetA_addrc
                    s  | j tjjjjk}
t|
 r|d usJ d|d ur|jn|j}|j}tt	| }t|
 rgt
|j}tttd| jj|d |t| jtjjjjjkrRtjjntjj}t|\}}t|}t|}nd }d\}}t
|j}tttd| jj|d |t| jtjjjjjkrtjjntjj}t|\}}t|}t|}t|
 r|jn	td|jj|jfddtt|jd D   fd	dtd
t|jd D }fddttjd D fddtd
tjd D t|
 rt|t|d jB }nd }t|t|d jB }t|t r%dn|r*dnd}t|
 r|d u s;J dt!j"d ttj#$|% ttj#$|% t| % ttj#$|% gdt&| dt&| dt&| d| d	d' fddtd
t|jd D  d dddt!j(j)d d S |	d u r|d j* n|	}	ttj#$|	% ttj#$|% t| % ttj#$|% g}t|d ur|d usJ d|+|* %  |+t|%  d }nd}t!j"d |d!t&| d"t&| d#| dd' fd$dtd
t|d u rt|jd nt|jd d% d& D  | t|d urTd' fd'dtt|jd d% d& t|jd D nd d t|d u radnd(ddt!j(j)d d S ))Nr3   r4   r   r5   r1   c                    s   g | ]}t d d |f qS r   )r   rR   ro   )tCrA_layoutr$   r%   rq     s    z$gemm_ptx_partial.<locals>.<listcomp>r   c                    rx   ry   r$   ro   r{   r$   r%   rq          rz   c                    s    g | ]}t d d |f jqS r   )r   rR   rB   ro   )r   r$   r%   rq     r   c                    rx   ry   r$   ro   r~   r$   r%   rq     r   r6   r   r   r   z,mbar_ptr must be None when a_src is not TMEMa6  {
	.reg .pred leader_thread;
	.reg .pred p;
	.reg .b32 idesc;
	.reg .b32 tmem_acc;
	.reg .b32 smem_desc_a_lo_start, smem_desc_b_lo_start;
	.reg .b32 smem_desc_a_lo, smem_desc_b_lo;
	.reg .b32 smem_desc_a_hi, smem_desc_b_hi;
	.reg .b64 smem_desc_a, smem_desc_b;
	elect.sync _|leader_thread, -1;
	mov.b32 idesc, zx;
	mov.b32 tmem_acc, $3;
	mov.b32 smem_desc_a_lo_start, $0;
	mov.b32 smem_desc_b_lo_start, $1;
	mov.b32 smem_desc_a_hi, r   z;
	mov.b64 smem_desc_a, {smem_desc_a_lo_start, smem_desc_a_hi};
	mov.b64 smem_desc_b, {smem_desc_b_lo_start, smem_desc_b_hi};
	setp.ne.b32 p, $2, 0;
	@leader_thread tcgen05.mma.cta_group::1.kind::f16 [tmem_acc], smem_desc_a, smem_desc_b, idesc, r   r   c                 3   s2    | ]}d t  |  dt |  dV  qdS )z.add.u32 smem_desc_a_lo, smem_desc_a_lo_start, z1;
	add.u32 smem_desc_b_lo, smem_desc_b_lo_start, z;
	mov.b64 smem_desc_a, {smem_desc_a_lo, smem_desc_a_hi};
	mov.b64 smem_desc_b, {smem_desc_b_lo, smem_desc_b_hi};
	@leader_thread tcgen05.mma.cta_group::1.kind::f16 [tmem_acc], smem_desc_a, smem_desc_b, idesc, 1;
	Nr   ro   r|   r   r$   r%   r     s    


z#gemm_ptx_partial.<locals>.<genexpr>r   r9   TFr:   z5mbar_phase must be provided when mbar_ptr is not Nonez.reg .pred P1; 
	LAB_WAIT: 
	mbarrier.try_wait.parity.shared::cta.b64 P1, [$4], $5, 10000000; 
	@P1 bra DONE; 
	bra     LAB_WAIT; 
	DONE: 
	a  {
	.reg .pred leader_thread;
	.reg .pred p;
	.reg .b32 idesc;
	.reg .b32 tmem_acc;
	.reg .b32 tmem_a;
	.reg .b32 smem_desc_b_lo_start;
	.reg .b32 smem_desc_b_lo;
	.reg .b32 smem_desc_b_hi;
	.reg .b64 smem_desc_b;
	elect.sync _|leader_thread, -1;
	mov.b32 idesc, zj;
	mov.b32 tmem_acc, $3;
	mov.b32 tmem_a, $0;
	mov.b32 smem_desc_b_lo_start, $1;
	mov.b32 smem_desc_b_hi, z;
	mov.b64 smem_desc_b, {smem_desc_b_lo_start, smem_desc_b_hi};
	setp.ne.b32 p, $2, 0;
	@leader_thread tcgen05.mma.cta_group::1.kind::f16 [tmem_acc], [tmem_a], smem_desc_b, idesc, c                 3   s2    | ]}d t |  dt  |  dV  qdS )z.add.u32 smem_desc_b_lo, smem_desc_b_lo_start, ;
	mov.b64 smem_desc_b, {smem_desc_b_lo, smem_desc_b_hi};
	@leader_thread tcgen05.mma.cta_group::1.kind::f16 [tmem_acc], [tmem_a + r   Nr   ro   r   r$   r%   r     s    	

r8      c                 3   r   )r   rz   r   r   Nr   ro   r   r$   r%   r   .  s    

zr,r,r,r,r,r),r>   r   r?   r   r@   rA   r   rB   rC   rD   r	   r(   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   r2   rO   rP   rS   ranger   r   r   rQ   r   r   r   rV   rT   r   rW   rX   r   rY   rZ   r)   append)r   r.   r   r   r&   r'   r   r   r   r   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   r   rg   rh   r   
input_argsmbar_wait_strr$   )r|   r   r   r   r   r%   r*   i  s   



	


	"$"$

!#
4		
$
.(	12
r*   sA_base_addr_for_descsA_addr_offset_for_descsA_stagesB_base_addr_for_descsB_addr_offset_for_descsB_stager\   r]   r_   rc   c                    s  j tjjjjk}t| rd usJ d|d usJ dtt}t| r]tt	t
djjd |tjtjjjjjkrHtjjntjj}t|\}}t|}t|}nd }d\}}tt	t
djjd |tjtjjjjjkrtjjntjj}t|\}}t|}t|}tdgd }t| rfddtt|jd	 D  nfd
dtt|jd	 D   fddtdt|jd	 D fddtt|jd	 D fddtdt|jd	 D t| rt|}nd }t|}t|trdn|rdnd}t| rtjd t| t| t| t|	 t|  |d  |d  |d	  |d  g	dt| dt| dt| dt| dt| dt| d| dd fddtdt|jd	 D  d dd d!tj!j"d" d S tjd t|d# j#$  t| t|  |d  |d  |d	  |d  gd$t| d%t| d&| dd fd'dtdt|jd	 D  d d(d d!tj!j"d" d S ))Nz1sA_layout must be provided when a_src is not TMEMz2sA_swizzle must be provided when a_src is not TMEMr4   r   r5   r8   c                    rl   rm   ru   ro   rv   r$   r%   rq   w  rr   z%gemm_ptx_partial1.<locals>.<listcomp>r   c                    rs   rt   ru   ro   rv   r$   r%   rq   |  rw   c                    rx   ry   r$   ro   r{   r$   r%   rq     r   rz   c                    rl   rm   )r   rR   rO   rH   ro   )r   r]   r$   r%   rq     rr   c                    rx   ry   r$   ro   r~   r$   r%   rq     r   r   r   r   r   z{
	.reg .pred leader_thread;
	.reg .pred p;
	.reg .b32 idesc;
	.reg .b32 tmem_acc;
	.reg .b32 smem_desc_a_lo, smem_desc_b_lo;
	.reg .b32 smem_desc_a_hi, smem_desc_b_hi;
	.reg .b64 smem_desc_a, smem_desc_b;
	elect.sync _|leader_thread, -1;
	mov.b32 idesc, z;
	mov.b32 tmem_acc, z";
	mad.lo.u32 smem_desc_a_lo, $1, z&, $0;
	mad.lo.u32 smem_desc_b_lo, $3, z, $2;
	mov.b32 smem_desc_a_hi, r   z;
	mov.b64 smem_desc_a, {smem_desc_a_lo, smem_desc_a_hi};
	mov.b64 smem_desc_b, {smem_desc_b_lo, smem_desc_b_hi};
	setp.ne.b32 p, $4, 0;
	@leader_thread tcgen05.mma.cta_group::1.kind::f16 [tmem_acc], smem_desc_a, smem_desc_b, idesc, {$5, $6, $7, $8}, r   r   c                 3   r   )r   rz   r   z;
	mov.b64 smem_desc_a, {smem_desc_a_lo, smem_desc_a_hi};
	mov.b64 smem_desc_b, {smem_desc_b_lo, smem_desc_b_hi};
	@leader_thread tcgen05.mma.cta_group::1.kind::f16 [tmem_acc], smem_desc_a, smem_desc_b, idesc, {$5, $6, $7, $8}, 1;
	Nr   ro   r   r$   r%   r     r   z$gemm_ptx_partial1.<locals>.<genexpr>r   zr,r,r,r,r,r,r,r,rTFr:   r6   r   r   z;
	mov.b64 smem_desc_b, {smem_desc_b_lo, smem_desc_b_hi};
	setp.ne.b32 p, $3, 0;
	@leader_thread tcgen05.mma.cta_group::1.kind::f16 [$0], [tmem_a], smem_desc_b, idesc, {$4, $5, $6, $7}, c                 3   r   )zadd.u32 tmem_a, tmem_a, rz   r   z;
	mov.b64 smem_desc_b, {smem_desc_b_lo, smem_desc_b_hi};
	@leader_thread tcgen05.mma.cta_group::1.kind::f16 [$0], [tmem_a], smem_desc_b, idesc, {$4, $5, $6, $7}, 1;
	Nr   ro   r   r$   r%   r     s    
zr,r,r,r,r,r,r,r)%r>   r   r?   r   r@   rA   r   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   r2   rO   rP   r   r   r   r   r   r   r   rV   rW   rX   r   rY   rZ   r(   r)   )r   r.   r   r   r   r   r   r   r   r   r\   r]   r_   rc   r   r[   r^   r`   ra   rb   rd   re   rf   maskrg   rh   r   r$   )r|   r   r   r   r   r\   r]   r%   gemm_ptx_partial1A  s   


	

	
$$








	
 !
7





	
r   )NNFF)NNF)F)NNFN)&typingr   r   r   cutlass.cuter   r   r   r   cutlass.cute.nvgpur   cutlass._mlir.dialectsr   %flash_attn_origin.cute.mma_sm100_descmma_sm100_descrC   flash_attn_origin.cute.utilsr	   jitTiledMmaTensorboolr   r/   r   intr2   r?   rJ   MmaOprk   r   Pointerr*   	ConstexprLayoutSwizzler   r$   r$   r$   r%   <module>   s`  		
r
 #
	 X
	
