o
    پi>                  T   @   sB  d dl Z d dlmZ d dlmZmZmZ d dlZd dlm	  m
Z d dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlm Z  d dl!m"Z" d dl#m$Z$m%Z%m&Z&m'Z'm(Z( edddd Z)dd Z*dd Z+ej,ej-ej.ej/ej0ej1iZ2dd Z3																										dkdej4dej4dej4deej4 d eej4 d!eej4 d"eej4 d#ee5 d$ee5 d%eej4 d&ee6 d'e7d(ee6 d)ee5 d*ee5 d+eej4 d,e5d-e5d.e5d/e5d0ee7 d1ee5 d2ee d3ee d4ee$ d5e7d6eej4 d7eej4 d8ee8ej4  d9eej4ej4f f<d:d;Z9i e9_:			<			=		>		?	?				?	?	?																dldej4dej4dej4d6ej4d@ej4d7ej4d&ee6 d'e7d(e6d)ee5 d*ee5 d,e5d-e5d.e5d0e7dAe5dBe5dCe7dDe7dEe7dFe5dGe5dHe5dIe7deej4 d eej4 d!eej4 d"eej4 d#ee5 d$ee5 dJe7dKeej4 dLeej4 dMeej4 d2ee dNee d3ee d8ee8ej4  d4ee$ d9eej4ej4ej4f fPdOdPZ;i e;_<i e;_:i e;_=G dQdR dRej>j?Z@G dSdT dTej>j?ZA			U		<								dmdej4dej4dej4d&ee6 d'e7dVeee5 ee5 f d+eej4 d(e6d/e5d0ee7 dJe7d3ee dWeej4 dXeej4 dYeej4 dZeej4 f d[d\ZB										U		<					dndej4dej4dej4deej4 d eej4 d#ee5 d$ee5 d!eej4 d"eej4 d%eej4 d&ee6 d'e7dVeee5 ee5 f d+eej4 d(e6d/e5d0ee7 dJe7d2ee d8ee8 f(d]d^ZC					dod_ej4d`ej4d6ej4d7eej4 daeej4 dbeej4 dceej4 ddeej4 d9dfdedfZDi eD_:					gdpd_ej4d`ej4d6eej4 dheejE daeej4 dbeej4 d5e7d9eej4eej4 f fdidjZFdS )q    N)	lru_cache)OptionalTupleCallable)utilsto_cute_tensor)FlashAttentionForwardSm90)FlashAttentionForwardSm100) FlashAttentionBackwardPreprocess)FlashAttentionBackwardSm80)FlashAttentionBackwardSm90)FlashAttentionBackwardSm100)!FlashAttentionBackwardPostprocess)FlashAttentionForwardCombine)BlockSparseTensorsTorchto_cute_block_sparse_tensorsnormalize_block_sparse_tensors get_block_sparse_expected_shapes$get_block_sparse_expected_shapes_bwd)maxsizec                   C   s   t j d S )zCached device capability check.r   )torchcudaget_device_capability r   r   T/home/ubuntu/.local/lib/python3.10/site-packages/flash_attn_origin/cute/interface.py_get_device_capability5   s   r   c                 C   s"   | d ur|  ddkr|  S | S )N   )stride
contiguous)xr   r   r   maybe_contiguous:   s   "r"   c                 C   s   | j |ksJ | d| j  d| | j|ks$J | d| j d| | j|ks6J | d| j d| | js@J | dd S )Nz shape z != expected z dtype z device z must be on CUDA)shapedtypedeviceis_cuda)tnameexpected_shapeexpected_dtypeexpected_devicer   r   r   _validate_tensor>   s   $$$r,   c                 C   s   |dkrdS t ||  ||S )N   r   )min)total_mblocksnum_SMsnum_n_blocks
max_splitsr   r   r   num_splits_heuristicL   s   r3   F     r   qkvcu_seqlens_qcu_seqlens_k	seqused_q	seqused_kmax_seqlen_qmax_seqlen_k
page_tablesoftmax_scalecausalsoftcapwindow_size_leftwindow_size_rightlearnable_sinkm_block_sizen_block_sizenum_threads
num_splitspack_gqa_compute_capability	score_modmask_modblock_sparse_tensors
return_lseoutlseaux_tensorsreturnc           U      C   s  dd | ||fD \} }}| j dd \}}|du r(| j dd \}} ||  }!n|j d d }d} | j d }!|	durs|du sBJ d|	jtjksLJ d	|	d
dksWJ d|	j d }"|	j ||"fkseJ |j dd \}#}$|#|$ }%n	d\}#}$|j d }%|j d }&|j d
 }'|du r|	du r|j ||%|&|fksJ |j ||%|&|'fksJ n7|j |#|$|&|fksJ |j |#|$|&|'fksJ n |j |%|&|fksJ |j |%|&|'fksJ |j |d fksJ d|dur|j |d fksJ d|du s|j |fksJ d|du s
|j |fks
J d| jtjtjfv sJ d| j|j  kr*|jks/J d J d||||fD ]}(|(durS|(jtjksGJ d|(ddksSJ dq5|durn|j |fkscJ |jtjksnJ dtdd | |||||||	|f	D sJ d||& dksJ d|dksJ dd|   })||) dksJ d|) |'|) dksJ d|) |
du rd t	
| }
|d!krd}||& }*|du r|*dk}| j}+| j},|du r|| fn|!f}-|du r||| fn||!f}.| jp|jp|j}/|du rtjg |-||'R |+|,d"}nt|d#g |-||'R |+|, |du rB|/s6|r?tj|.tj|,d"nd}n|durPt|d$|.tj|, t| j }0|du r]t n|}1|1d%v shJ d&|du}2|du r|rvd}|dup~|du}3|dus|dur|du r|dkrd'\}}3d}n	d(\}}3nd)\}}3ttj j}4|1d*kr||'  krd+krn n|s|3s|2sd,}|1d-v r|rd+|* dkrd.}|r|dkr|du rd.}|du r|du r| n|!}|du r|%}||* }5|1d/kr|5|krdnd}6nd}6|dk rR|6| }7|3s"|ntdt||| d | }8|8| d | }9|5|7 d |7 }:||& |: };t|;tj|,j|9d+}|dk}<|<rytj|g|-||'R tj|,d"}=tj|g|.R tj|,d"}>|durt|nd.}?|durt|nd.}@|dur|du sJ d0t|}|dup|dup|dup|du}A|dur|Artd1|2r|Artd2|r|jj d dkrd.}|<rtd3|0||'|*||?|@|2|durt |nd|du |du |du |du |du |	du|du|du|du|||6||<||1|$d4vf}B|Bt!j"vr)d5d |||||fD \}C}D}E}F}G|	dur>t#|	d6dd7nd}Hd8d | |||<sK|n|=fD \}I}J}K}L|<r^t#|>d6d9}Mn|durjt#|d6d9}Mnd}Md}N|dur| du r|t$d:t%||| |%|||6\}O}Pt&||O|Pd;}Qt'|Q}Nd}R|durd<d |D }R|1d*kr|	du sJ d=|<rJ d>t(|0||'|*||3|||d|d.d?d?|||dud@}Sn@|1d-v rt)||'|*||3|<||||6| o|3 o|du o|du o|< |||du|$d4v|dup|dudA}Snt$dB|1 dCt*j+|S|I|J|K|L|M|
|4|C|D|E|F|H|||G|N|RdDdEt!j"|B< d}T|durCt%||| |%|||6\}O}Pt&||O|Pd;}Tt!j"|B | |||<sO|n|=|<rU|>n||
|4|||||	||||T| |<rt,|=|>-d
d||durz|-d
dnd|| ||fS )Fa  Forward pass for FlashAttention.

    Args:
        ...
        score_mod: A callable that takes the attention scores and applies a modification.
        mask_mod: A callable that takes token position information and selectively masks
        block_sparse_tensors: A tuple of tensors used for block sparsity.
        return_lse: Whether to return the log softmax of the attention scores. If set to True will always calculate
        out: Optional pre-allocated output tensor. If None, will be allocated internally.
        lse: Optional pre-allocated log-sum-exp tensor. If None, will be allocated when needed.
        aux_tensors: Some score_mods will want to read from global aux_tensors. This is how we thread them through to the inner kernel.
    c                 S      g | ]}t |qS r   r"   .0r'   r   r   r   
<listcomp>       z#_flash_attn_fwd.<locals>.<listcomp>N   r   r   z-page_table is not supported with cu_seqlens_kzpage_table must be int32r   z3page_table must be contiguous in the last dimensionNN.cu_seqlens_k must have shape (batch_size + 1,).cu_seqlens_q must have shape (batch_size + 1,)z'seqused_q must have shape (batch_size,)z'seqused_k must have shape (batch_size,)"inputs must be float16 or bfloat16inputs must have the same dtypez>cu_seqlens_q, cu_seqlens_k, seqused_q, seqused_k must be int32zCcu_seqlens_q, cu_seqlens_k, seqused_q, seqused_k must be contiguouszlearnable_sink must be bfloat16c                 s       | ]
}|d u p
|j V  qd S Nr&   rV   r   r   r   	<genexpr>   s
    
z"_flash_attn_fwd.<locals>.<genexpr>inputs must be on CUDA device)num_head must be divisible by num_head_kv   *head_dim must be less than or equal to 256   head_dim must be divisible by  head_dim_v must be divisible by       ?        r$   r%   rP   rQ   	   
      :Unsupported compute capability. Supported: 9.x, 10.x, 11.xTFFT)FFrq   r4      rr   rs   Frr   z-softcap and score_mod cannot be used togetherzgmask_mod with aux_tensors is not yet supported for varlen sequences. This will be fixed in a future PR.z\Block sparsity is not yet supported for varlen sequences. This will be fixed in a future PR.z_Block sparsity is not yet supported with SplitKV. TODO: partition sparse block lists per split.)Nr4   c                 S   &   g | ]}|d urt |dddnd qS Nr-   r   assumed_alignleading_dimr   rV   r   r   r   rX   z      r-   r{   c                 S   rT   r   r   rV   r   r   r   rX         r|   zHBlock sparsity requires fixed-length sequences (seqlen_q must be known).)expected_count_shapeexpected_index_shapec                 S      g | ]	}t |d ddqS NT)r|   fully_dynamicr   rW   bufr   r   r   rX         z paged KV not supported on SM 9.0zSplitKV not supported on SM 9.0T)	is_causalis_localrJ   tile_mtile_n
num_stagesrH   	Q_in_regsintra_wg_overlapmma_pv_is_rsrM   rL   has_aux_tensors)qhead_per_kvheadr   r   is_split_kvrJ   rF   rG   q_stageis_persistentrL   rM   r   paged_kv_non_tmais_varlen_qz Unsupported compute capability: z. Supported: 9.x, 10.x, 11.x--enable-tvm-ffioptions).r#   r$   r   int32r   float16bfloat16allelement_sizemathsqrtr%   requires_grademptyr,   float32torch2cute_dtype_mapr   r   CUstreamcurrent_streamcuda_streammaxr.   r3   get_device_propertiesmulti_processor_countr   hash_callablecreate_softcap_scoremodNotImplementedErrormask_block_cntlen_flash_attn_fwdcompile_cacher   
ValueErrorr   r   r   r	   r
   cutecompile_flash_attn_fwd_combine	transpose)Ur6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   num_headhead_dim
batch_sizeseqlen_qtotal_qmax_num_pages_per_seq	num_pages	page_sizeseqlen_knum_head_kv
head_dim_vr'   	alignmentr   out_torch_dtyper%   q_batch_seqlen_shape	lse_shaper   r$   compute_capabilityuse_block_sparsitylocalr   seqlen_q_packgqar   m_block_size_effectiveseqlen_k_loadedr1   num_m_blocksr/   r   out_partiallse_partialscore_mod_hashmask_mod_hash	is_varlencompile_keycu_seqlens_q_tensorcu_seqlens_k_tensorseqused_q_tensorseqused_k_tensorlearnable_sink_tensorpage_table_tensorq_tensork_tensorv_tensoro_tensor
lse_tensorsparse_tensorsr   r   compile_time_normalizedcute_aux_tensorsfa_fwdnormalized_block_sparse_tensorsr   r   r   r   V   s  .






.












.




&$




	









r   rn   @   rh   r[   doutnum_stages_Qnum_stages_dO
SdP_swapAB
dKV_swapAB	dQ_swapABAtomLayoutMSdPAtomLayoutNdKVAtomLayoutMdQ	V_in_regsdeterministicdqdkdvscore_mod_bwdc'           t         s  t  }'|'dv sJ d|'dkr9|sdnd}d d}d}d}(d}d	}| }d
}d}d
}d
})|	d u r4|
d u s8J dnd}d d	}d	}d
}d
}d
})dd | |||||||||f
D \
} }}}}}}}}}| jdd  \}*}+|d u r}| jd d \},}-|,|- }.n|jd d
 },| jd }.|d ur|n|.}-|d u r|jd d \},}/|,|/ }0n|jd d
 },|jd }0|d ur|n|0}/|jd }1|jd }2|rd}
|	d up|
d u}3|3r|	d u r|
dkrd\}}3d }
nd\}}3|&d u}4|'dkr|4rd}d	}d}5|5| |-| d
 | | }6|/  d
     }7|d u r(|j|,|/|1|+fksJ |j|,|/|1|2fks'J n#|j|0|1|+fks3J |j|0|1|2fks>J |j|,d
 fksKJ d|d ur|j|,d
 fks]J d|j|.|*|2fkshJ |j|.|*|2fkssJ |j|*|.fksJ dn%|j|,|-|*|2fksJ |j|,|-|*|2fksJ |j|,|*|-fksJ d| jtjtjfv sJ d| j|j  kr|j  kr|j  kr|jksJ d J d||fD ]}8|8d ur|8jtjksJ dq|jtjksJ dtdd | |||||||fD sJ d|*|1 dksJ d|+dks%J d d!| 	  }9|+|9 dks9J d"|9 |2|9 dksGJ d#|9 |d u rSd$t
|+ }|*|1 }:|d u r`|:d
k}d	}|'d%vrp|d	u spJ d&|"d ur|#d us~J d'|d(ksJ d)|d u r|d u sJ d*| j};| j}<|d u rt| }n	t|d+| j|<|; | d u rt|} n	t| d,|j|<|; |!d u rt|}!n	t|!d-|j|<|; |+d. d
 d. d. }=|d u r
tj|,|*|6|= tj|;d/}>tj|,|*|6tj|;d/}?tj|,|*|6tj|;d/}@n/|.|jd |  d
 | | }Atj|*|A|= tj|;d/}>tj|*|Atj|;d/}?tj|*|Atj|;d/}@|:d
k}B|Br|2d. d
 d. d. }C|d u r~|7  }D|)dkrc|D|) dkrc|7  }7tj|,|1|7|= tj|;d/}Etj|,|1|7|C tj|;d/}Fn;|0|jd    d
     }G|G  }D|)dkr|D|) dkr|G  }Gtj|1|G|= tj|;d/}Etj|1|G|C tj|;d/}Ft| j }Httj j}I|rtj|,|*|6| d
tjd0d/}Jnd }J|r |:d
kr tj|,|1|7  dtjd0d/}Ktj|,|1|7  dtjd0d/}Lnd }Kd }L|'|H|2|||d u |d u f}M|Mtjvr`d1d ||fD \}N}Od2d |>|?|@fD \}P}Q}Rt|d3d4}Sd5d ||fD \}T}U|'d6 }Vt|H|2|V||d7}Wtj|W|N|O|Q|S|R|P|T|U|Id8d9tj|M< tj|M |||?||@|>|||I	 d:}|"ryt|"nd	}X|#rt|#nd	}Y|$rt|$nd	}Z|%rt|%nd}[d }\|%d urd;d |%D }\|'dkr|'|H|+|2|:||d(k| ||||||||||||d u |d u |d u |d u |X|Y|Z|[|4f}]n(|'|H|+|2|:||	d u|
d u|d(k| |||)||X|Y|Z|[|4|d u |d u |d u |d u f}]|]tjvrd<d | ||||| |!fD \}^}_}`}O}a}b}cd=d |>|?|@fD \}P}Q}R|Br3d>d |E|FfD \}d}ed?d ||||fD \}T}f}U}gd@d |J|K|LfD \}h}i}jt |H|+|2|:| ||||||||||||dA}k|'dkrt!|H|+|2|:|| |||(|||||||||"|#|$|%d u|5dB}lnt"|+|2||3|:|)||"|#|$|%d u|5dC}ld }m|&d urt#|,|*|-|/| |5\}n}ot$|&|n|odD fdEdFdG}pt%|p}mtj|l|^|_|`|O|R|Q|P|Bs|bn|d|Bs|cn|e||I|T|f|U|gd |	|
|h|i|j|\|md8d9tj|]< d }q|&d urt#|,|*|-|/| |5\}n}ot$|&|n|odD fdHdFdG}qtj|] | ||||@|?|>|Bs| n|E|Bs!|!n|F||I||||d |	|
|J|K|L|%|q |'dkr9dnd}|'d6 }V|'|H|+|||||d u |d u |d u |d u f}r|rtj&vrt|>}Pt|}adId ||fD \}T}Ut'|H|+|V||||}stj|s|P|a||T|U|Id8d9tj&|r< tj&|r |>|||||I |BrI|'|H|+ ||||d u |d u |d u |d u f}r|rtj&vrt|E}dt| }bdJd ||fD \}f}g|'d6 }Vt'|H|+|V |||}stj|s|d|b||f|g|Id8d9tj&|r< tj&|r |E| ||||I |'|H|2 ||||d u |d u |d u |d u f}r|rtj&vr=t|F}et|!}cdKd ||fD \}f}g|'d6 }Vt'|H|2|V |||}stj|s|e|ct()d$|f|g|Id8d9tj&|r< tj&|r |F|!d$|||I || |!fS )LNrp   rt   rq   P   r   r4   r[   TFr   zlocal not supported yet on 9.xc                 S   rT   r   rU   rV   r   r   r   rX   ^  s    z#_flash_attn_bwd.<locals>.<listcomp>rZ   r   r   ru   rv   r^   r_   z'lse must have shape (num_head, total_q)z4lse must have shape (batch_size, num_head, seqlen_q)r`   ra   z(cu_seqlens_q, cu_seqlens_k must be int32zlse must be float32c                 s   rb   rc   rd   rV   r   r   r   re     s    
z"_flash_attn_bwd.<locals>.<genexpr>rf   rg   rh   ri   rj   rk   rl   rm   rx   z8bwd deterministic only supported for sm100/sm110 for nowz4score_mod_bwd is required when score_mod is providedrn   zEsoftcap and score_mod are mutually exclusive (different log2 scaling)z+varlen + score_mod not supported in bwd yetr   r   r       ro   r   c                 S   rT   r   r   rV   r   r   r   rX   9  rY   c                 S   rT   r   r   rV   r   r   r   rX   :  r   r-   r   c                 S   $   g | ]}|d urt |ddnd qS Nr-   r   r   rV   r   r   r   rX   >      rr   rH   r   r   r5   c                 S   r   r   r   r   r   r   r   rX   q  r   c                 S   rT   r   r   rV   r   r   r   rX     r   c                 S   rT   r   r   rV   r   r   r   rX     r   c                 S   rT   r   r   rV   r   r   r   rX     r   c                 S   r   r   r   rV   r   r   r   rX     r   c                 S   s2   g | ]}|d urt j| dd| dnd qS )N   r-   )r}   r   stride_order)r   "convert_from_dlpack_leading_staticdetach	dim_orderrV   r   r   r   rX     s    )r   )r   rL   r   rM   r   subtile_factor)
r   r   r   cluster_sizer   rL   r   rM   r   r   _flash_attn_bwdc                         d d  d dS NzBackward expects Q-direction block-sparse tensors (q_mask_cnt/q_mask_idx, and optionally full_q_cnt/full_q_idx). Regenerate the backward BlockMask with BLOCK_SIZE=(z, z) (sparse_block_size_q=z).r   r   rG   sparse_block_size_qr   r   <lambda>     z!_flash_attn_bwd.<locals>.<lambda>)r   r   contexthintc                      r  r  r   r   r  r   r   r  @  r  c                 S   r   r   r   rV   r   r   r   rX   t  r   c                 S   r   r   r   rV   r   r   r   rX     r   c                 S   r   r   r   rV   r   r   r   rX     r   )*r   r#   r$   r   r   r   r   r   r   r   r   r   r%   
empty_liker,   r   zerosr   r   r   r   r   r  compile_cache_prer   r   r   r   r   r   r   r   r   r   r   r   r   r   compile_cache_postr   cutlassFloat32)tr6   r7   r8   rP   r   rQ   r@   rA   rB   rC   rD   rF   rG   rH   rJ   r   r   r   r   r   r   r   r   r   r9   r:   r;   r<   r=   r>   r   r   r   r   rL   r   rM   rR   rN   r   num_stages_PdSr   r   r   r   r   r   r   total_kr   r   r   r   r   seqlen_q_roundedseqlen_k_roundedr'   r   r   r%   r   head_dim_roundeddq_accumdpsumlse_log2total_q_rounded_paddeddKV_postprocesshead_dim_v_roundedr1   dk_accumdv_accumtotal_k_rounded_paddedr$   r   dQ_semaphoredK_semaphoredV_semaphorecompile_key_prer   	do_tensordq_accum_tensordpsum_tensorlse_log2_tensorr   r   r   arch
fa_bwd_prer   score_mod_bwd_hashr   num_aux_tensorsr   r   r   r   r   	dq_tensor	dk_tensor	dv_tensordk_accum_tensordv_accum_tensorr   r   dQ_semaphore_tensordK_semaphore_tensordV_semaphore_tensorfa_bwd_sm80
fa_bwd_objsparse_tensors_compiler   r   r   r   compile_key_postfa_bwd_postr   r  r   r    s  )







:









	
	


!






	

	


	r  c                #   @   s   e Zd Ze													ddejdejdejd	ee d
ede	ee
 ee
 f deej dede
dee dedee deej deej deej deej f ddZedd ZdS )FlashAttnFuncNFr\   rn   r   r6   r7   r8   r@   rA   window_sizerE   rB   rI   rJ   r   rM   full_block_cntfull_block_idxr   mask_block_idxc                 C   s   d }t dd ||||fD rt||||d}t||||||d |d |||	|
||d\}}| ||||| || _|| _|| _|| _|| _||fS )Nc                 s   s    | ]}|d uV  qd S rc   r   rV   r   r   r   re     s    z(FlashAttnFunc.forward.<locals>.<genexpr>)r9  r:  r   r;  r   r   )
r@   rA   rC   rD   rE   rB   rI   rJ   rM   rN   )	anyr   r   save_for_backwardr@   rA   r8  rB   r   )ctxr6   r7   r8   r@   rA   r8  rE   rB   rI   rJ   r   rM   r9  r:  r   r;  rN   rP   rQ   r   r   r   forward  s<   
zFlashAttnFunc.forwardc                 G   sZ   | j \}}}}}t||||||| j| j| j| jd | jd | jd\}}	}
||	|
gdR S )Nr   r   )rC   rD   r   NNNNNNNNNNNNNNNNNNNN)saved_tensorsr  r@   rA   rB   r8  r   )r>  r   argsr6   r7   r8   rP   rQ   r   r   r   r   r   r   backward%  s    zFlashAttnFunc.backwardNFr\   Nrn   r   NFNNNNN)__name__
__module____qualname__staticmethodr   Tensorr   floatboolr   intr   r?  rC  r   r   r   r   r7    sd    	
3r7  c                +   @   s   e Zd Ze															ddejdejdejd	eej d
eej deej deej dee dee deej dee de	de
ee ee f deej dededee	 de	dee dee f(ddZedd ZdS )FlashAttnVarlenFuncNFr\   rn   r   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   r8  rE   rB   rI   rJ   r   rL   rR   c                 C   s   t |||||||f||	|
|||d |d ||||||d\}}| |||||||||	 || _|| _|| _|| _|| _|| _|	| _||fS )Nr   r   )r=   r>   r?   r@   rA   rC   rD   rE   rB   rI   rJ   rL   rR   )	r   r=  r@   rA   r8  rB   r   r=   r>   )r>  r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   r8  rE   rB   rI   rJ   r   rL   rR   rP   rQ   r   r   r   r?  :  s@   zFlashAttnVarlenFunc.forwardc                 G   s   | j \	}}}}}}}	}
}| jdksJ t||||||| j| j| j| jd | jd ||	|
|| j| j| jd\}}}|||gdR S )Nrn   r   r   )	rC   rD   r9   r:   r;   r<   r=   r>   r   r@  )	rA  rB   r  r@   rA   r8  r=   r>   r   )r>  r   rB  r6   r7   r8   rP   rQ   r9   r:   r;   r<   r   r   r   r   r   r   rC  r  s.   zFlashAttnVarlenFunc.backward)NNNNNNFr\   Nrn   r   NFNN)rE  rF  rG  rH  r   rI  r   rL  rJ  rK  r   r   listr?  rC  r   r   r   r   rM  9  sx    	
7rM  r\   r8  r9  r:  r   r;  c                 C   s(   t | |||||||||	|
|||||S rc   )r7  apply)r6   r7   r8   r@   rA   r8  rE   rB   rI   rJ   r   rM   r9  r:  r   r;  r   r   r   flash_attn_func  s$   rP  c                 C   s0   t | |||||||||	|
|||||||||S rc   )rM  rO  )r6   r7   r8   r9   r:   r=   r>   r;   r<   r?   r@   rA   r8  rE   rB   rI   rJ   r   rL   rR   r   r   r   flash_attn_varlen_func  s,   rQ  r   r   
cu_seqlenssequsednum_splits_dynamic_ptrsemaphore_to_resetc                 C   sP  |   dv s
J d|  dv sJ d| jtjtjtjfv s#J d|jtjks-J d| jr3|js7J d| dd	ksBJ d
|dd	ksMJ d|j| jdd ksYJ |   dk}|j| jd	d ksmJ d|dur|j|jd	d ksJ d|jtjksJ d|df|df|dffD ]*\}	}
|	dur|	jtj	ksJ |
 d|	jsJ |
 d|	
 sJ |
 dq| jd }| jd }|dksJ |dkrdnd}|d dkrdn	|d dkrdnd}ttt|d}|dkrt|d}ttj j}t|j }t| j }|||||||du|du|duf	}|tjvrt| |s.dnd d!}t|d|jd" d#}t||sCd nd"d!}|durVt|d|jd" d#nd}d$d% ||||fD }|\}}}}t||||||d&}|j||||||dd'std(tj||||||||||d)d*tj|< tj| | ||||||||	 dS )+aT  Forward combine kernel for split attention computation.

    Combines partial outputs and log-sum-exp values from multiple splits
    of attention computation into final outputs.

    Args:
        out_partial: Partial outputs tensor (num_splits, batch, seqlen, nheads, headdim) or
                                            (num_splits, total_q, nheads, headdim) if there's cu_seqlens
        lse_partial: Partial LSE tensor (num_splits, batch, seqlen, nheads) or
                                       (num_splits, total_q, nheads) if there's cu_seqlens
        out: Output tensor (batch, seqlen, nheads, headdim) or (total_q, nheads, headdim) if there's cu_seqlens
        lse: Output LSE tensor (batch, seqlen, nheads) or (total_q, nheads) if there's cu_seqlens.
        cu_seqlens: Cumulative sequence lengths for variable length sequences
        seqused: Used sequence lengths for each batch
        num_splits_dynamic_ptr: Dynamic number of splits per batch
        semaphore_to_reset: Semaphore for synchronization
        k_block_size: Block size for head dimension

    Returns:
        None
    r-      'out_partial must have 4 or 5 dimensionsr   r-   'lse_partial must have 3 or 4 dimensionsz'out_partial must be fp16, bf16, or fp32lse_partial must be fp32ztensors must be on CUDA devicer   r   z4out_partial must be contiguous in the last dimensionrZ   z6lse_partial must be contiguous in the seqlen dimensionNr-   zout shape mismatchzlse shape mismatchzlse must be fp32rR  rS  rT  z must be int32z must be on CUDA devicez must be contiguousr   rh   r   r4      rj   r   rW  r   )r}   r[   r{   c                 S   ry   rz   r   rV   r   r   r   rX   P  r~   z+_flash_attn_fwd_combine.<locals>.<listcomp>)r$   dtype_partialr   rF   k_block_sizelog_max_splitsr   zIFlashAttention combine kernel cannot be implemented with given parametersr   r   )dimr$   r   r   r   r   r&   r   r#   r   is_contiguousr   r   ceillog2r   r   r   r   r   r   r   r   ndimr   can_implementRuntimeErrorr   r   )r   r   rP   rQ   rR  rS  rT  rU  r   r'   r(   r   rI   r^  rF   r_  r   r$   r]  r   out_partial_tensorlse_partial_tensor
out_tensorr   optional_tensorscu_seqlens_tensorseqused_tensornum_splits_dynamic_tensorsemaphore_tensor
fa_combiner   r   r   r     s    

$






	r   T	out_dtypec                 C   sj  |   dv s
J d|  dv sJ d| jtjksJ d|jtjks(J d|   dk}|rH| j\}}	}
}|j||	|
fksCJ dd	}|	}n| j\}}}}
}|j||||
fks]J d
|du rd| j}| j}|du r|rxtj|	|
|||d}ntj|||
|||d}|r|rtj|
|	tj|ddd	}ntj||
|tj|dd	d}nd}t| ||||| ||fS )a  Flash Attention combine function for split attention computation.

    Combines partial outputs and log-sum-exp values from multiple splits
    of attention computation into final outputs. This is the main user-facing
    interface for the combine kernel.

    Args:
        out_partial: Partial outputs tensor with shape:
            - (num_splits, batch_size, seqlen, num_heads, head_size) for regular batched input
            - (num_splits, total_q, num_heads, head_size) for variable length input
        lse_partial: Partial LSE tensor with shape:
            - (num_splits, batch_size, seqlen, num_heads) for regular batched input
            - (num_splits, total_q, num_heads) for variable length input
        out: Optional output tensor. If None, will be created automatically.
        out_dtype: Optional output dtype. If None, will use fp16/bf16 based on input.
        cu_seqlens: Cumulative sequence lengths for variable length sequences
        seqused: Used sequence lengths for each batch
        return_lse: Whether to return the combined LSE tensor. Default is True.

    Returns:
        Tuple of (out, lse) where:
        - out: Combined output tensor with shape (batch_size, seqlen, num_heads, head_size)
              or (total_q, num_heads, head_size) for varlen
        - lse: Combined log-sum-exp tensor with shape (batch_size, seqlen, num_heads)
              or (total_q, num_heads) for varlen. None if return_lse=False

    Note:
        This function expects the input tensors to be in the format produced by
        split attention computation, where the first dimension is num_splits.
        The permuting from user format to kernel format is now done inside the kernel.
    rV  rX  rY  rZ  z,out_partial must be fp32 (from accumulation)r[  r-   z%lse_partial shape mismatch for varlenr   zlse_partial shape mismatchNro   r   r[   )	r`  r$   r   r   r#   r%   r   r   r   )r   r   rP   rp  rR  rS  rO   r   rI   r   	num_heads	head_sizer   seqlenr%   rQ   r   r   r   flash_attn_combine  sZ   )rt  )NNNNNNNNFNNNNr4   r4   r5   r   NNNNNFNNN)!NFrn   NNr   r4   rh   Fr[   r[   FFFr[   r[   r[   FNNNNNNFNNNNNNNNrD  )NNNNNNNNFr\   Nrn   r   NFNN)NNNNN)NNNNT)Gr   	functoolsr   typingr   r   r   r   cuda.bindings.driverbindingsdriverr   r  cutlass.cuter   flash_attn_origin.cuter   %flash_attn_origin.cute.cute_dsl_utilsr    flash_attn_origin.cute.flash_fwdr	   &flash_attn_origin.cute.flash_fwd_sm100r
   +flash_attn_origin.cute.flash_bwd_preprocessr    flash_attn_origin.cute.flash_bwdr   %flash_attn_origin.cute.flash_bwd_sm90r   &flash_attn_origin.cute.flash_bwd_sm100r   ,flash_attn_origin.cute.flash_bwd_postprocessr   (flash_attn_origin.cute.flash_fwd_combiner   %flash_attn_origin.cute.block_sparsityr   r   r   r   r   r   r"   r,   r   Float16r   BFloat16r   r  r   r3   rI  rL  rJ  rK  rN  r   r   r  r  r  autogradFunctionr7  rM  rP  rQ  r   r$   rt  r   r   r   r   <module>   s  
	
 !
   E
	
 !"#$%&'(
     UIY	

*	

2	
 )