o
    پid                  T   @   sR  d dl Z d dlmZmZmZ d dlZd dlmZ d dlm	  m
Z d dlZd dlmZ d dlm  m  m  mZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+ edd Z,dd Z-dd Z.ej/ej0ej1ej2ej3ej4iZ5dd Z6																										didej7dej7dej7deej7 deej7 deej7 d eej7 d!ee8 d"ee8 d#eej7 d$ee9 d%e:d&ee9 d'ee8 d(ee8 d)eej7 d*e8d+e8d,e8d-e8d.ee: d/ee8 d0ee d1ee d2ee& d3e:d4eej7 d5eej7 d6ee;ej7  d7eej7ej7f f<d8d9Z<i e<_=			:			;		<		=	=				=	=	=																djdej7dej7dej7d4ej7d>ej7d5ej7d$ee9 d%e:d&e9d'ee8 d(ee8 d*e8d+e8d,e8d.e:d?e8d@e8dAe:dBe:dCe:dDe8dEe8dFe8dGe:deej7 deej7 deej7 d eej7 d!ee8 d"ee8 dHe:dIeej7 dJeej7 dKeej7 d0ee dLee d1ee d6ee;ej7  d2ee& d7eej7ej7ej7f fPdMdNZ>i e>_?i e>_=i e>_@G dOdP dPejAjBZCG dQdR dRejAjBZD			S		:								dkdej7dej7dej7d$ee9 d%e:dTeee8 ee8 f d)eej7 d&e9d-e8d.ee: dHe:d1ee dUeej7 dVeej7 dWeej7 dXeej7 f dYdZZE										S		:					dldej7dej7dej7deej7 deej7 d!ee8 d"ee8 deej7 d eej7 d#eej7 d$ee9 d%e:dTeee8 ee8 f d)eej7 d&e9d-e8d.ee: dHe:d0ee d6ee; f(d[d\ZF					dmd]ej7d^ej7d4ej7d5eej7 d_eej7 d`eej7 daeej7 dbeej7 d7dfdcddZGi eG_=					ednd]ej7d^ej7d4eej7 dfeejH d_eej7 d`eej7 d3e:d7eej7eej7 f fdgdhZIdS )o    N)OptionalTupleCallable)
cache_once   to_cute_tensor)FlashAttentionForwardSm90)FlashAttentionForwardSm100) FlashAttentionBackwardPreprocess)FlashAttentionBackwardSm80)FlashAttentionBackwardSm90)FlashAttentionBackwardSm100)!FlashAttentionBackwardPostprocess)FlashAttentionForwardCombine)BlockSparseTensorsTorchto_cute_block_sparse_tensorsnormalize_block_sparse_tensors get_block_sparse_expected_shapes$get_block_sparse_expected_shapes_bwd"get_block_sparse_broadcast_patternc                   C   s   t j d S )zCached device capability check.r   )torchcudaget_device_capability r   r   d/home/ubuntu/.local/lib/python3.10/site-packages/sglang/jit_kernel/flash_attention/cute/interface.py_get_device_capability8   s   r   c                 C   s"   | d ur|  ddkr|  S | S )Nr   )stride
contiguous)xr   r   r   maybe_contiguous=   s   "r!   c                 C   s   | j |ksJ | d| j  d| | j|ks$J | d| j d| | j|ks6J | d| j d| | js@J | dd S )Nz shape z != expected z dtype z device z must be on CUDA)shapedtypedeviceis_cuda)tnameexpected_shapeexpected_dtypeexpected_devicer   r   r   _validate_tensorA   s   $$$r+   c                 C   s   |dkrdS t ||  ||S )N   r   )min)total_mblocksnum_SMsnum_n_blocks
max_splitsr   r   r   num_splits_heuristicO   s   r2   F     qkvcu_seqlens_qcu_seqlens_k	seqused_q	seqused_kmax_seqlen_qmax_seqlen_k
page_tablesoftmax_scalecausalsoftcapwindow_size_leftwindow_size_rightlearnable_sinkm_block_sizen_block_sizenum_threads
num_splitspack_gqa_compute_capability	score_modmask_modblock_sparse_tensors
return_lseoutlseaux_tensorsreturnc           U      C   s
  dd | ||fD \} }}| j dd \}}|du r(| j dd \}} ||  }!n|j d d }d} | j d }!|	durs|du sBJ d|	jtjksLJ d	|	d
dksWJ d|	j d }"|	j ||"fkseJ |j dd \}#}$|#|$ }%n	d\}#}$|j d }%|j d }&|j d
 }'|du r|	du r|j ||%|&|fksJ |j ||%|&|'fksJ n7|j |#|$|&|fksJ |j |#|$|&|'fksJ n |j |%|&|fksJ |j |%|&|'fksJ |j |d fksJ d|dur|j |d fksJ d|du s|j |fksJ d|du s
|j |fks
J d| jtjtjfv sJ d| j|j  kr*|jks/J d J d||||fD ]}(|(durS|(jtjksGJ d|(ddksSJ dq5|durn|j |fkscJ |jtjksnJ dtdd | |||||||	|f	D sJ d||& dksJ d|dksJ dd|   })||) dksJ d|) |'|) dksJ d|) |
du rd t	
| }
|d!krd}||& }*|du r|*dk}| j}+| j},|du r|| fn|!f}-|du r||| fn||!f}.| jp|jp|j}/|du rtjg |-||'R |+|,d"}nt|d#g |-||'R |+|, |du rB|/s6|r?tj|.tj|,d"nd}n|durPt|d$|.tj|, t| j }0|du r]t n|}1|1d%v shJ d&|du}2|du r|rvd}|dup~|du}3|dus|dur|du r|dkrd'\}}3d}n	d(\}}3nd)\}}3ttj j}4|1d*kr||'  krd+krn n|s|3s|2sd,}|1d-v r|rd+|* dkrd.}|r|dkr|du rd.}|du r|du r| n|!}|du r|%}||* }5|1d/kr|5|krdnd}6nd}6|dk rR|6| }7|3s"|ntdt||| d | }8|8| d | }9|5|7 d |7 }:||& |: };t|;tj|,j|9d+}|dk}<|<rytj|g|-||'R tj|,d"}=tj|g|.R tj|,d"}>|durt|nd.}?|durt|nd.}@|dur|du sJ d0t|}|dup|dup|dup|du}A|dur|Artd1|2r|Artd2|r|jj d dkrd.}|<rtd3d}Bd}C|dur| du rt d4t!||| |%|||6\}D}Et"||D|Ed5}Ct#|C}B|0||'|*||?|@|2|B|durt$|nd|du |du |du |du |du |	du|du|du|du|||6||<||1|$d6vf}F|Ft%j&vr7d7d |||||fD \}G}H}I}J}K|	durht'|	d8dd9nd}Ld:d | |||<su|n|=fD \}M}N}O}P|<rt'|>d8d;}Qn|durt'|d8d;}Qnd}Qd}R|Cdurt(|C}Rd}S|durd<d |D }S|1d*kr|	du sJ d=|<rJ d>t)|0||'|*||3|||d|d.d?d?|||dud@}Tn@|1d-v rt*||'|*||3|<||||6| o |3 o |du o |du o |< |||du|$d6v|dup|dudA}Tnt dB|1 dCt+j,|T|M|N|O|P|Q|
|4|G|H|I|J|L|||K|R|SdDdEt%j&|F< t%j&|F | |||<sC|n|=|<rI|>n||
|4|||||	||||C| |<rst-|=|>.d
d||durn|.d
dnd|| ||fS )Fa  Forward pass for FlashAttention.

    Args:
        ...
        score_mod: A callable that takes the attention scores and applies a modification.
        mask_mod: A callable that takes token position information and selectively masks
        block_sparse_tensors: A tuple of tensors used for block sparsity.
        return_lse: Whether to return the log softmax of the attention scores. If set to True will always calculate
        out: Optional pre-allocated output tensor. If None, will be allocated internally.
        lse: Optional pre-allocated log-sum-exp tensor. If None, will be allocated when needed.
        aux_tensors: Some score_mods will want to read from global aux_tensors. This is how we thread them through to the inner kernel.
    c                 S      g | ]}t |qS r   r!   .0r&   r   r   r   
<listcomp>       z#_flash_attn_fwd.<locals>.<listcomp>N   r   r   z-page_table is not supported with cu_seqlens_kzpage_table must be int32r   z3page_table must be contiguous in the last dimensionNN.cu_seqlens_k must have shape (batch_size + 1,).cu_seqlens_q must have shape (batch_size + 1,)z'seqused_q must have shape (batch_size,)z'seqused_k must have shape (batch_size,)"inputs must be float16 or bfloat16inputs must have the same dtypez>cu_seqlens_q, cu_seqlens_k, seqused_q, seqused_k must be int32zCcu_seqlens_q, cu_seqlens_k, seqused_q, seqused_k must be contiguouszlearnable_sink must be bfloat16c                 s       | ]
}|d u p
|j V  qd S Nr%   rU   r   r   r   	<genexpr>   s
    
z"_flash_attn_fwd.<locals>.<genexpr>inputs must be on CUDA device)num_head must be divisible by num_head_kv   *head_dim must be less than or equal to 256   head_dim must be divisible by  head_dim_v must be divisible by       ?        r#   r$   rO   rP   	   
      :Unsupported compute capability. Supported: 9.x, 10.x, 11.xTFFT)FFrp   r3      rq   rr   Frq   z-softcap and score_mod cannot be used togetherzgmask_mod with aux_tensors is not yet supported for varlen sequences. This will be fixed in a future PR.z\Block sparsity is not yet supported for varlen sequences. This will be fixed in a future PR.z_Block sparsity is not yet supported with SplitKV. TODO: partition sparse block lists per split.zHBlock sparsity requires fixed-length sequences (seqlen_q must be known).)expected_count_shapeexpected_index_shape)Nr3   c                 S   &   g | ]}|d urt |dddnd qS Nr,   r   assumed_alignleading_dimr   rU   r   r   r   rW         r,   r|   c                 S   rS   r   r   rU   r   r   r   rW         r}   c                 S      g | ]	}t |d ddqS NT)r}   fully_dynamicr   rV   bufr   r   r   rW         z paged KV not supported on SM 9.0zSplitKV not supported on SM 9.0T)	is_causalis_localrI   tile_mtile_n
num_stagesrG   	Q_in_regsintra_wg_overlapmma_pv_is_rsrL   rK   has_aux_tensors)qhead_per_kvheadr   r   is_split_kvrI   rE   rF   q_stageis_persistentrK   rL   r   paged_kv_non_tmais_varlen_qz Unsupported compute capability: z. Supported: 9.x, 10.x, 11.x--enable-tvm-ffioptions)/r"   r#   r   int32r   float16bfloat16allelement_sizemathsqrtr$   requires_grademptyr+   float32torch2cute_dtype_mapr   r   CUstreamcurrent_streamcuda_streammaxr-   r2   get_device_propertiesmulti_processor_countutilshash_callablecreate_softcap_scoremodNotImplementedErrormask_block_cnt
ValueErrorr   r   r   len_flash_attn_fwdcompile_cacher   r   r	   r
   cutecompile_flash_attn_fwd_combine	transpose)Ur5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   num_headhead_dim
batch_sizeseqlen_qtotal_qmax_num_pages_per_seq	num_pages	page_sizeseqlen_knum_head_kv
head_dim_vr&   	alignmentr   out_torch_dtyper$   q_batch_seqlen_shape	lse_shaper   r#   compute_capabilityuse_block_sparsitylocalr   seqlen_q_packgqar   m_block_size_effectiveseqlen_k_loadedr0   num_m_blocksr.   r   out_partiallse_partialscore_mod_hashmask_mod_hash	is_varlenblock_sparse_broadcast_patternnormalized_block_sparse_tensorsrx   ry   compile_keycu_seqlens_q_tensorcu_seqlens_k_tensorseqused_q_tensorseqused_k_tensorlearnable_sink_tensorpage_table_tensorq_tensork_tensorv_tensoro_tensor
lse_tensorsparse_tensorscute_aux_tensorsfa_fwdr   r   r   r   Y   s  .






.












.




&$






	







r   rm   @   rg   rZ   doutnum_stages_Qnum_stages_dO
SdP_swapAB
dKV_swapAB	dQ_swapABAtomLayoutMSdPAtomLayoutNdKVAtomLayoutMdQ	V_in_regsdeterministicdqdkdvscore_mod_bwdc'           t         sT  t  }'|'dv sJ d|'dkr9|sdnd}d d}d}d}(d}d	}| }d
}d}d
}d
})|	d u r4|
d u s8J dnd}d d	}d	}d
}d
}d
})dd | |||||||||f
D \
} }}}}}}}}}| jdd  \}*}+|d u r}| jd d \},}-|,|- }.n|jd d
 },| jd }.|d ur|n|.}-|d u r|jd d \},}/|,|/ }0n|jd d
 },|jd }0|d ur|n|0}/|jd }1|jd }2|rd}
|	d up|
d u}3|3r|	d u r|
dkrd\}}3d }
nd\}}3|&d u}4|'dkr|4rd}d	}d}5|5| |-| d
 | | }6|/  d
     }7|d u r(|j|,|/|1|+fksJ |j|,|/|1|2fks'J n#|j|0|1|+fks3J |j|0|1|2fks>J |j|,d
 fksKJ d|d ur|j|,d
 fks]J d|j|.|*|2fkshJ |j|.|*|2fkssJ |j|*|.fksJ dn%|j|,|-|*|2fksJ |j|,|-|*|2fksJ |j|,|*|-fksJ d| jtjtjfv sJ d| j|j  kr|j  kr|j  kr|jksJ d J d||fD ]}8|8d ur|8jtjksJ dq|jtjksJ dtdd | |||||||fD sJ d|*|1 dksJ d|+dks%J d d!| 	  }9|+|9 dks9J d"|9 |2|9 dksGJ d#|9 |d u rSd$t
|+ }|*|1 }:|d u r`|:d
k}d	}|'d%vrp|d	u spJ d&|"d ur|#d us~J d'|d(ksJ d)|d u r|d u sJ d*| j};| j}<|d u rt| }n	t|d+| j|<|; | d u rt|} n	t| d,|j|<|; |!d u rt|}!n	t|!d-|j|<|; |+d. d
 d. d. }=|d u r
tj|,|*|6|= tj|;d/}>tj|,|*|6tj|;d/}?tj|,|*|6tj|;d/}@n/|.|jd |  d
 | | }Atj|*|A|= tj|;d/}>tj|*|Atj|;d/}?tj|*|Atj|;d/}@|:d
k}B|Br|2d. d
 d. d. }C|d u r~|7  }D|)dkrc|D|) dkrc|7  }7tj|,|1|7|= tj|;d/}Etj|,|1|7|C tj|;d/}Fn;|0|jd    d
     }G|G  }D|)dkr|D|) dkr|G  }Gtj|1|G|= tj|;d/}Etj|1|G|C tj|;d/}Ft| j }Httj j}I|rtj|,|*|6| d
tjd0d/}Jnd }J|r |:d
kr tj|,|1|7  dtjd0d/}Ktj|,|1|7  dtjd0d/}Lnd }Kd }L|'|H|2|||d u |d u f}M|Mtjvr`d1d ||fD \}N}Od2d |>|?|@fD \}P}Q}Rt|d3d4}Sd5d ||fD \}T}U|'d6 }Vt|H|2|V||d7}Wtj|W|N|O|Q|S|R|P|T|U|Id8d9tj|M< tj|M |||?||@|>|||I	 d:}|"ryt|"nd	}X|#rt|#nd	}Y|$rt|$nd	}Z|%rt|%nd}[d }\|%d urd;d |%D }\d }]d }^|&d urt|,|*|-|/| |5\}_}`t |&|_|`d< fd=d>d?}^t!|^}]|'dkr|'|H|+|2|:||d(k| ||||||||||||d u |d u |d u |d u |X|Y|Z|[|4|]f}an)|'|H|+|2|:||	d u|
d u|d(k| |||)||X|Y|Z|[|4|]|d u |d u |d u |d u f}a|atj"vrd@d | ||||| |!fD \}b}c}d}O}e}f}gdAd |>|?|@fD \}P}Q}R|Br\dBd |E|FfD \}h}idCd ||||fD \}T}j}U}kdDd |J|K|LfD \}l}m}nt#|H|+|2|:| ||||||||||||dE}o|'dkrt$|H|+|2|:|| |||(|||||||||"|#|$|%d u|5dF}pnt%|+|2||3|:|)||"|#|$|%d u|5dG}pd }q|^d urt&|^}qtj|p|b|c|d|O|R|Q|P|Bs|fn|h|Bs|gn|i||I|T|j|U|kd |	|
|l|m|n|\|qd8d9tj"|a< tj"|a | ||||@|?|>|Bs	| n|E|Bs|!n|F||I||||d |	|
|J|K|L|%|^ |'dkr'dnd}|'d6 }V|'|H|+|||||d u |d u f	}r|rtj'vrot|>}Pt|}edHd ||fD \}T}Ut(|H|+|V||||}stj|s|P|e||T|U|Id8d9tj'|r< tj'|r |>|||||I |Br%|'|H|+ ||||d u |d u f	}r|rtj'vrt|E}ht| }fdId ||fD \}j}k|'d6 }Vt(|H|+|V |||}stj|s|h|f||j|k|Id8d9tj'|r< tj'|r |E| ||||I |'|H|2 ||||d u |d u f	}r|rtj'vrt|F}it|!}gdJd ||fD \}j}k|'d6 }Vt(|H|2|V |||}stj|s|i|gt)*d$|j|k|Id8d9tj'|r< tj'|r |F|!d$|||I || |!fS )KNro   rs   rp   P   r   r3   rZ   TFr   zlocal not supported yet on 9.xc                 S   rS   r   rT   rU   r   r   r   rW   ^  s    z#_flash_attn_bwd.<locals>.<listcomp>rY   r   r   rt   ru   r]   r^   z'lse must have shape (num_head, total_q)z4lse must have shape (batch_size, num_head, seqlen_q)r_   r`   z(cu_seqlens_q, cu_seqlens_k must be int32zlse must be float32c                 s   ra   rb   rc   rU   r   r   r   rd     s    
z"_flash_attn_bwd.<locals>.<genexpr>re   rf   rg   rh   ri   rj   rk   rl   rw   z8bwd deterministic only supported for sm100/sm110 for nowz4score_mod_bwd is required when score_mod is providedrm   zEsoftcap and score_mod are mutually exclusive (different log2 scaling)z+varlen + score_mod not supported in bwd yetr   r   r       rn   r   c                 S   rS   r   r   rU   r   r   r   rW   9  rX   c                 S   rS   r   r   rU   r   r   r   rW   :  r   r,   r   c                 S   $   g | ]}|d urt |ddnd qS Nr,   r   r   rU   r   r   r   rW   >      rq   rG   r   r   r4   c                 S   r   r   r   r   r   r   r   rW   q  r   _flash_attn_bwdc                      s   d d  d dS )NzBackward expects Q-direction block-sparse tensors (q_mask_cnt/q_mask_idx, and optionally full_q_cnt/full_q_idx). Regenerate the backward BlockMask with BLOCK_SIZE=(z, z) (sparse_block_size_q=z).r   r   rF   sparse_block_size_qr   r   <lambda>  s   z!_flash_attn_bwd.<locals>.<lambda>)rx   ry   contexthintc                 S   rS   r   r   rU   r   r   r   rW     r   c                 S   rS   r   r   rU   r   r   r   rW     r   c                 S   rS   r   r   rU   r   r   r   rW     r   c                 S   r   r   r   rU   r   r   r   rW     r   c                 S   s2   g | ]}|d urt j| dd| dnd qS )N   r,   )r~   r   stride_order)r   "convert_from_dlpack_leading_staticdetach	dim_orderrU   r   r   r   rW     s    )r   )r   rK   r   rL   r   subtile_factor)
r   r   r   cluster_sizer   rK   r   rL   r   r  c                 S   r   r   r   rU   r   r   r   rW   h  r   c                 S   r   r   r   rU   r   r   r   rW     r   c                 S   r   r   r   rU   r   r   r   rW     r   )+r   r"   r#   r   r   r   r   r   r   r   r   r   r$   
empty_liker+   r   zerosr   r   r   r   r   r   compile_cache_prer   r   r   r   r   r   r   r   r   r   r   r   r   r   r   compile_cache_postr   cutlassFloat32)tr5   r6   r7   rO   r   rP   r?   r@   rA   rB   rC   rE   rF   rG   rI   r   r   r   r   r   r   r   r   r   r8   r9   r:   r;   r<   r=   r   r   r   r   rK   r   rL   rQ   rM   r   num_stages_PdSr  r   r   r   r   r   r   total_kr   r   r   r   r  seqlen_q_roundedseqlen_k_roundedr&   r   r   r$   r   head_dim_roundeddq_accumdpsumlse_log2total_q_rounded_paddeddKV_postprocesshead_dim_v_roundedr0   dk_accumdv_accumtotal_k_rounded_paddedr#   r   dQ_semaphoredK_semaphoredV_semaphorecompile_key_prer   	do_tensordq_accum_tensordpsum_tensorlse_log2_tensorr   r   r   arch
fa_bwd_prer   score_mod_bwd_hashr   num_aux_tensorsr   r   r   rx   ry   r   r   r   r   	dq_tensor	dk_tensor	dv_tensordk_accum_tensordv_accum_tensorr   r   dQ_semaphore_tensordK_semaphore_tensordV_semaphore_tensorfa_bwd_sm80
fa_bwd_objsparse_tensors_compilecompile_key_postfa_bwd_postr   r   r   r     s|  )







:









	
	



"





	

	


	r   c                #   @   s   e Zd Ze													ddejdejdejd	ee d
ede	ee
 ee
 f deej dede
dee dedee deej deej deej deej f ddZedd ZdS )FlashAttnFuncNFr[   rm   r   r5   r6   r7   r?   r@   window_sizerD   rA   rH   rI   r   rL   full_block_cntfull_block_idxr   mask_block_idxc                 C   s   d }t dd ||||fD rt||||d}t||||||d |d |||	|
||d\}}| ||||| || _|| _|| _|| _|| _||fS )Nc                 s   s    | ]}|d uV  qd S rb   r   rU   r   r   r   rd     s    z(FlashAttnFunc.forward.<locals>.<genexpr>)r6  r7  r   r8  r   r   )
r?   r@   rB   rC   rD   rA   rH   rI   rL   rM   )	anyr   r   save_for_backwardr?   r@   r5  rA   r   )ctxr5   r6   r7   r?   r@   r5  rD   rA   rH   rI   r   rL   r6  r7  r   r8  rM   rO   rP   r   r   r   forward  s<   
zFlashAttnFunc.forwardc                 G   sZ   | j \}}}}}t||||||| j| j| j| jd | jd | jd\}}	}
||	|
gdR S )Nr   r   )rB   rC   r   NNNNNNNNNNNNNNNNNNNN)saved_tensorsr   r?   r@   rA   r5  r   )r;  r   argsr5   r6   r7   rO   rP   r   r   r   r   r   r   backward  s    zFlashAttnFunc.backwardNFr[   Nrm   r   NFNNNNN)__name__
__module____qualname__staticmethodr   Tensorr   floatboolr   intr   r<  r@  r   r   r   r   r4    sd    	
3r4  c                +   @   s   e Zd Ze															ddejdejdejd	eej d
eej deej deej dee dee deej dee de	de
ee ee f deej dededee	 de	dee dee f(ddZedd ZdS )FlashAttnVarlenFuncNFr[   rm   r   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   r5  rD   rA   rH   rI   r   rK   rQ   c                 C   s   t |||||||f||	|
|||d |d ||||||dd\}}| |||||||||	 || _|| _|| _|| _|| _|| _|	| _||fS )Nr   r   T)r<   r=   r>   r?   r@   rB   rC   rD   rA   rH   rI   rK   rQ   rN   )	r   r:  r?   r@   r5  rA   r   r<   r=   )r;  r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   r5  rD   rA   rH   rI   r   rK   rQ   rO   rP   r   r   r   r<  *  sB   zFlashAttnVarlenFunc.forwardc                 G   s   | j \	}}}}}}}	}
}| jdksJ t||||||| j| j| j| jd | jd ||	|
|| j| j| jd\}}}|||gdR S )Nrm   r   r   )	rB   rC   r8   r9   r:   r;   r<   r=   r   r=  )	r>  rA   r   r?   r@   r5  r<   r=   r   )r;  r   r?  r5   r6   r7   rO   rP   r8   r9   r:   r;   r   r   r   r   r   r   r@  c  s.   zFlashAttnVarlenFunc.backward)NNNNNNFr[   Nrm   r   NFNN)rB  rC  rD  rE  r   rF  r   rI  rG  rH  r   r   listr<  r@  r   r   r   r   rJ  )  sx    	
8rJ  r[   r5  r6  r7  r   r8  c                 C   s(   t | |||||||||	|
|||||S rb   )r4  apply)r5   r6   r7   r?   r@   r5  rD   rA   rH   rI   r   rL   r6  r7  r   r8  r   r   r   flash_attn_func  s$   rM  c                 C   s0   t | |||||||||	|
|||||||||S rb   )rJ  rL  )r5   r6   r7   r8   r9   r<   r=   r:   r;   r>   r?   r@   r5  rD   rA   rH   rI   r   rK   rQ   r   r   r   flash_attn_varlen_func  s,   rN  r   r   
cu_seqlenssequsednum_splits_dynamic_ptrsemaphore_to_resetc                 C   sP  |   dv s
J d|  dv sJ d| jtjtjtjfv s#J d|jtjks-J d| jr3|js7J d| dd	ksBJ d
|dd	ksMJ d|j| jdd ksYJ |   dk}|j| jd	d ksmJ d|dur|j|jd	d ksJ d|jtjksJ d|df|df|dffD ]*\}	}
|	dur|	jtj	ksJ |
 d|	jsJ |
 d|	
 sJ |
 dq| jd }| jd }|dksJ |dkrdnd}|d dkrdn	|d dkrdnd}ttt|d}|dkrt|d}ttj j}t|j }t| j }|||||||du|du|duf	}|tjvrt| |s.dnd d!}t|d|jd" d#}t||sCd nd"d!}|durVt|d|jd" d#nd}d$d% ||||fD }|\}}}}t||||||d&}|j||||||dd'std(tj||||||||||d)d*tj|< tj| | ||||||||	 dS )+aT  Forward combine kernel for split attention computation.

    Combines partial outputs and log-sum-exp values from multiple splits
    of attention computation into final outputs.

    Args:
        out_partial: Partial outputs tensor (num_splits, batch, seqlen, nheads, headdim) or
                                            (num_splits, total_q, nheads, headdim) if there's cu_seqlens
        lse_partial: Partial LSE tensor (num_splits, batch, seqlen, nheads) or
                                       (num_splits, total_q, nheads) if there's cu_seqlens
        out: Output tensor (batch, seqlen, nheads, headdim) or (total_q, nheads, headdim) if there's cu_seqlens
        lse: Output LSE tensor (batch, seqlen, nheads) or (total_q, nheads) if there's cu_seqlens.
        cu_seqlens: Cumulative sequence lengths for variable length sequences
        seqused: Used sequence lengths for each batch
        num_splits_dynamic_ptr: Dynamic number of splits per batch
        semaphore_to_reset: Semaphore for synchronization
        k_block_size: Block size for head dimension

    Returns:
        None
    r,      'out_partial must have 4 or 5 dimensionsr   r,   'lse_partial must have 3 or 4 dimensionsz'out_partial must be fp16, bf16, or fp32lse_partial must be fp32ztensors must be on CUDA devicer   r   z4out_partial must be contiguous in the last dimensionrY   z6lse_partial must be contiguous in the seqlen dimensionNr,   zout shape mismatchzlse shape mismatchzlse must be fp32rO  rP  rQ  z must be int32z must be on CUDA devicez must be contiguousr   rg   r   r3      ri   r   rT  r   )r~   rZ   r|   c                 S   rz   r{   r   rU   r   r   r   rW   A  r   z+_flash_attn_fwd_combine.<locals>.<listcomp>)r#   dtype_partialr   rE   k_block_sizelog_max_splitsr   zIFlashAttention combine kernel cannot be implemented with given parametersr   r   )dimr#   r   r   r   r   r%   r   r"   r   is_contiguousr   r   ceillog2r   r   r   r   r   r   r   r   ndimr   can_implementRuntimeErrorr   r   )r   r   rO   rP   rO  rP  rQ  rR  r   r&   r'   r   rH   r[  rE   r\  r   r#   rZ  r   out_partial_tensorlse_partial_tensor
out_tensorr   optional_tensorscu_seqlens_tensorseqused_tensornum_splits_dynamic_tensorsemaphore_tensor
fa_combiner   r   r   r     s    

$






	r   T	out_dtypec                 C   sj  |   dv s
J d|  dv sJ d| jtjksJ d|jtjks(J d|   dk}|rH| j\}}	}
}|j||	|
fksCJ dd	}|	}n| j\}}}}
}|j||||
fks]J d
|du rd| j}| j}|du r|rxtj|	|
|||d}ntj|||
|||d}|r|rtj|
|	tj|ddd	}ntj||
|tj|dd	d}nd}t| ||||| ||fS )a  Flash Attention combine function for split attention computation.

    Combines partial outputs and log-sum-exp values from multiple splits
    of attention computation into final outputs. This is the main user-facing
    interface for the combine kernel.

    Args:
        out_partial: Partial outputs tensor with shape:
            - (num_splits, batch_size, seqlen, num_heads, head_size) for regular batched input
            - (num_splits, total_q, num_heads, head_size) for variable length input
        lse_partial: Partial LSE tensor with shape:
            - (num_splits, batch_size, seqlen, num_heads) for regular batched input
            - (num_splits, total_q, num_heads) for variable length input
        out: Optional output tensor. If None, will be created automatically.
        out_dtype: Optional output dtype. If None, will use fp16/bf16 based on input.
        cu_seqlens: Cumulative sequence lengths for variable length sequences
        seqused: Used sequence lengths for each batch
        return_lse: Whether to return the combined LSE tensor. Default is True.

    Returns:
        Tuple of (out, lse) where:
        - out: Combined output tensor with shape (batch_size, seqlen, num_heads, head_size)
              or (total_q, num_heads, head_size) for varlen
        - lse: Combined log-sum-exp tensor with shape (batch_size, seqlen, num_heads)
              or (total_q, num_heads) for varlen. None if return_lse=False

    Note:
        This function expects the input tensors to be in the format produced by
        split attention computation, where the first dimension is num_splits.
        The permuting from user format to kernel format is now done inside the kernel.
    rS  rU  rV  rW  z,out_partial must be fp32 (from accumulation)rX  r,   z%lse_partial shape mismatch for varlenr   zlse_partial shape mismatchNrn   r   rZ   )	r]  r#   r   r   r"   r$   r   r   r   )r   r   rO   rm  rO  rP  rN   r   rH   r   	num_heads	head_sizer   seqlenr$   rP   r   r   r   flash_attn_combine~  sZ   )rq  )NNNNNNNNFNNNNr3   r3   r4   r   NNNNNFNNN)!NFrm   NNr   r3   rg   FrZ   rZ   FFFrZ   rZ   rZ   FNNNNNNFNNNNNNNNrA  )NNNNNNNNFr[   Nrm   r   NFNN)NNNNN)NNNNT)Jr   typingr   r   r   r   sglang.jit_kernel.utilsr   cuda.bindings.driverbindingsdriverr   r  cutlass.cuter   ,sglang.jit_kernel.flash_attention.cute.utils
jit_kernelflash_attentionr   cute_dsl_utilsr   	flash_fwdr	   flash_fwd_sm100r
   flash_bwd_preprocessr   	flash_bwdr   flash_bwd_sm90r   flash_bwd_sm100r   flash_bwd_postprocessr   flash_fwd_combiner   block_sparsityr   r   r   r   r   r   r   r!   r+   r   Float16r   BFloat16r   r  r   r2   rF  rI  rG  rH  rK  r   r   r   r	  r
  autogradFunctionr4  rJ  rM  rN  r   r#   rq  r   r   r   r   <module>   s   	
	
 !
   B
	
 !"#$%&'(
     EIZ	

*	

2	
 )