o
    پi-                     @   s,  d dl Z d dlmZmZmZ d dlZd dlmZ d dlmZm	Z	m
Z
 d dlmZ d dlm  mZ d dlmZmZ d dlmZ d dlZedddddejd	ejd
ejdeej ddf
ddZedddd	ejdejfddZe	dHddddeej dededejfddZe	dIddddejdedejfddZeddddddd	ejd
ejdeej dededdfddZ 	dJdeej d edededej!f
d!d"Z"	dHdeej d#ed ededej!f
d$d%Z#edddd&ed'ed(ed)ed*ej$ddfd+d,Z%edddd-ej$d.e	de	fd/d0Z&edddd&ed'ed(ed)ed-ej$d1ej$d.e	ddfd2d3Z'edddd*ej$d-ej$d4ej$d5ee	B fd6d7Z(edddd-ej$d*ej$d8ee	B fd9d:Z)	dHd;ejd<ejd=edefd>d?Z*		dKdejd@ej+dAej,d;ejd<ejdBed=edefdCdDZ-dedEej.j/fdFdGZ0dS )L    N)OptionalTypeCallable)Float32Int32
const_expr)cpasync)Tdsl_user_op)llvmpredlocipatomsrcdstr   returnc                K   s   t |jtjr|jtjjksJ t|j|jkr/tj	||j||d}|
| |j |}tj| ||f|||d| d S )Nr   r   r   )
isinstanceiteratorcutePointermemspaceAddressSpacermemr   element_typemake_fragment_likestoreloadtocopy)r   r   r   r   r   r   kwargssrc_cvt r$   U/home/ubuntu/.local/lib/python3.10/site-packages/flash_attn_origin/cute/copy_utils.pycvt_copy   s    "r&   r   c                C   s*   t j| | j||d}t j| |||d |S )Nr   )r   r   r   autovec_copy)r   r   r   r   r$   r$   r%   load_s2r#   s   r(   Fdtypenum_copy_elemsis_asyncc                C   s:   t td|| j }|rt ntj }tj|| |dS )N   num_bits_per_copy)	r   minwidthr   	CopyG2SOpr   nvgpuCopyUniversalOpmake_copy_atom)r)   r*   r+   r   r   num_copy_bitscopy_opr$   r$   r%   get_copy_atom*   s   r7      tmem_copy_atomnum_wgc          
      C   s   t | \}}}}|dksJ |dksJ tjd| | d dfddf}tjdd|f|dffddd| fdd| | ffd}	t| |	|S )N    r,   )r;   r8   )stride   r   r8   )sm100_utilsget_tmem_copy_propertiesr   make_layoutmake_tiled_copy)
r9   r:   r   r   num_dpnum_bitsnum_rep_tiler_mn	layout_tvr$   r$   r%   make_tmem_copy3   s    ,rH   )r   r*   r+   r   r   c          	      K   s0   t | j||}tj|| |f|||d| d S )Nr   )r7   r   r   r!   )	r   r   r   r*   r+   r   r   r"   	copy_atomr$   r$   r%   r!   A   s   "r!   num_threadsc           	      C   sR   || j  }|rt ntj }tj|| |d}t|}t|}t|||S )Nr-   )	r0   r   r1   r   r2   r3   r4   r@   make_tiled_copy_tv)	r)   rJ   r*   r+   r5   r6   rI   
thr_layout
val_layoutr$   r$   r%   tiled_copy_1dQ   s   


rN   major_mode_sizec                 C   s   t |d| j | j }|| j }|rt ntj }tj|| |d}|| }|| dks0J tj	|| |fdd}	t
d|f}
t||	|
S )Nr,   r-   r   )r8   r   )orderr8   )mathgcdr0   r   r1   r   r2   r3   r4   make_ordered_layoutr@   rK   )r)   rO   rJ   r+   r5   
copy_elemsr6   rI   gmem_threads_per_rowrL   rM   r$   r$   r%   tiled_copy_2d\   s   

rV   abcdgmem_ptrc             
   C   st   |j ||d }tjd |t| j||dt|j||dt|j||dt|j||dgddddtjjd d S )Nr   z{
	.reg .v4 .f32 abcd;
	mov.f32 abcd.x, $1;
	mov.f32 abcd.y, $2;
	mov.f32 abcd.z, $3;
	mov.f32 abcd.w, $4;
	red.global.add.v4.f32 [$0], abcd;
	}
z	l,f,f,f,fTFhas_side_effectsis_align_stackasm_dialect)tointir_valuer   
inline_asmr   
AsmDialectAD_ATT)rW   rX   rY   rZ   r[   r   r   gmem_ptr_i64r$   r$   r%   atomic_add_fp32x4m   s   
rf   smem_ptrpeer_cta_rank_in_clusterc             
   C   s>   | j ||d }ttjt || gddddtjjdS )zMMap the given smem pointer to the address at another CTA rank in the cluster.r   z$mapa.shared::cluster.u32 $0, $1, $2;z=r,r,rFr\   )	r`   ra   r   r   rb   r	   i32rc   rd   )rg   rh   r   r   smem_ptr_i32r$   r$   r%   set_block_rank   s   
rk   mbar_ptrc                C   s   t ||||d }	t ||||d }
tjd |	|
t| j||dt|j||dt|j||dt|j||dgddddtjjd d S )Nr   z{
	.reg .v4 .f32 abcd;
	mov.f32 abcd.x, $2;
	mov.f32 abcd.y, $3;
	mov.f32 abcd.z, $4;
	mov.f32 abcd.w, $5;
	st.async.shared::cluster.mbarrier::complete_tx::bytes.v4.f32 [$0], abcd, [$1];
	}
zr,r,f,f,f,fTFr\   )rk   ra   r   rb   r   rc   rd   )rW   rX   rY   rZ   rg   rl   rh   r   r   remote_smem_ptr_i32remote_mbar_ptr_i32r$   r$   r%   store_shared_remote_fp32x4   s2   
ro   tma_bar_ptrsizec          	   	   C   sf   | j ||d }|j ||d }|j ||d }tjd |||t| gddddtjjd d S )Nr   zScp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes [$1], [$0], $3, [$2];zl,r,r,rTFr\   )r`   ra   r   rb   r   rc   rd   )	r[   rg   rp   rq   r   r   re   rj   mbar_ptr_i32r$   r$   r%   cpasync_bulk_g2s   s   

rs   store_bytesc             	   C   sB   | j ||d }tjd |j|t| gddddtjjd d S )Nr   zJcp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 [$0], [$1], $2;zl,r,rTFr\   )r`   ra   r   rb   llvm_ptrr   rc   rd   )rg   r[   rt   r   r   rj   r$   r$   r%   cpasync_reduce_bulk_add_f32   s   	
rv   
src_tensor
dst_tensorsingle_stagec                    s   t t| |s	dnd }t t||sdnd }t| d|t|d|  fdd} fdd}t | r?|S |S )Nr8   r   c                    sV   t tjd d jj d }td | f j d |f jfd|i| d S )N   rq   r   r   rq   shaper   r0   rs   r   )src_idxdst_idx
new_kwargsrq   r   r"   r   r$   r%   	copy_bulk  s   $
z+cpasync_bulk_get_copy_fn.<locals>.copy_bulkc                     s>   t tjjj d }tj jfd|i|  d S )Nr{   rq   r|   )r   rq   r   r$   r%   copy_bulk_single_stage  s   "z8cpasync_bulk_get_copy_fn.<locals>.copy_bulk_single_stage)r   r   rankgroup_modes)rw   rx   ry   r"   group_rank_srcgroup_rank_dstr   r   r$   r   r%   cpasync_bulk_get_copy_fn  s   

r   	cta_coord
cta_layoutfilter_zerosc                    s  t t|jtjo|jtjjk}|r||fn||f\}	}
t t|	|s%dnd }t t|
|s2dnd }t	
 ||t|	d|t|
d|\}}t |rYt|}t|}|r_||fn||f\ fdd} fdd}t | r|||fS |||fS )Nr8   r   c                    s.   t j d | f d |f fi | d S Nr   r!   )r~   r   r   r   r   r"   r   r$   r%   copy_tmaB  s   .z!tma_get_copy_fn.<locals>.copy_tmac                     s   t j fi |  d S r   r   )r   r   r$   r%   copy_tma_single_stageE  s   z.tma_get_copy_fn.<locals>.copy_tma_single_stage)r   r   r   r   r   r   r   smemr   r   tma_partitionr   r   )r   r   r   rw   rx   r   ry   r"   src_is_smemsmem_tensorgmem_tensorgroup_rank_smemgroup_rank_gmemsgr   r   r$   r   r%   tma_get_copy_fn$  s*   


r   pipelinec                    s   dt jjf fdd}|S )Nproducer_statec                    s"    d| |j |d| d S )N)r~   r   rp   r$   )indexproducer_get_barrier)r~   r   r   r!   r   r$   r%   copy_fnL  s   
z%tma_producer_copy_fn.<locals>.copy_fn)cutlassr   PipelineState)r!   r   r   r$   r   r%   tma_producer_copy_fnK  s   r   )F)r8   )r8   F)FF)1rQ   typingr   r   r   r   cutlass.cuter   r   r   r   cutlass.cute.nvgpur   cutlass.utils.blackwell_helpersutilsblackwell_helpersr>   cutlass.cutlass_dslr	   r
   cutlass._mlir.dialectsr   cutlass.pipelineCopyAtomTensorr&   r(   Numericintboolr7   rH   r!   	TiledCopyrN   rV   r   rf   rk   ro   rs   rv   r   CoordLayoutr   r   PipelineAsyncr   r$   r$   r$   r%   <module>   s|  	 


%
+
'	
'