o
    پiq                  ?   @   s  d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZm	Z	m
Z
 eeZd dlm  mZ d dlZd dlmZ d dlZd dlmZ d dlmZ d dlmZmZmZmZ d dlm Z  d dl!m"Z" d d	l#m$Z$ edd
dd Z%dd Z&dd Z'dKddZ(ej)ej*ej+ej,ej-ej.iZ/dd Z0																										dLdej1dej1dej1de	ej1 de	ej1 d e	ej1 d!e	ej1 d"e	e2 d#e	e2 d$e	ej1 d%e	e3 d&e4d'e	e3 d(e	e2 d)e	e2 d*e	ej1 d+e2d,e2d-e2d.e2d/e	e4 d0e	e2 d1e	e d2e	e d3e	e d4e4d5e	ej1 d6e	ej1 d7e	e5ej1  d8e
ej1ej1f f<d9d:Z6i e6_7					dMd;ej1d<ej1d5ej1d6e	ej1 d=e	ej1 d>e	ej1 d?e	ej1 d@e	ej1 d8dfdAdBZ8i e8_7dCdD Z9e9								E		F					dNdej1dej1dej1de	ej1 de	ej1 d e	ej1 d!e	ej1 d$e	ej1 d%e	e3 d&e4dGe
e	e2 e	e2 f d*e	ej1 d'e3d.e2d/e	e4 dHe	e4 d1e	e d7e	e5 d8e
ej1ej1f f&dIdJZ:dS )O    N)	lru_cache)CallableOptionalTuple)from_dlpack)utils)BlockSparseTensorsTorch get_block_sparse_expected_shapesnormalize_block_sparse_tensorsto_cute_block_sparse_tensors)FlashAttentionForwardSm90)FlashAttentionForwardCombine)FlashAttentionForwardSm100)maxsizec                   C   s   t j d S )zCached device capability check.r   )torchcudaget_device_capability r   r   M/home/ubuntu/.local/lib/python3.10/site-packages/sgl_kernel/_fa4_interface.py_get_device_capability#   s   r   c                 C   s"   | d ur|  ddkr|  S | S )N   )stride
contiguousxr   r   r   maybe_contiguous)   s   "r   c                 C   s   | j |ksJ | d| j  d| | j|ks$J | d| j d| | j|ks6J | d| j d| | js@J | dd S )Nz shape z != expected z dtype z device z must be on CUDA)shapedtypedeviceis_cuda)tnameexpected_shapeexpected_dtypeexpected_devicer   r   r   _validate_tensor-   s   r&      r   Fc                 C   s<   t |  |dd}|r| S |dkr| jd }|j|dS )zUConvert torch tensor to cute tensor for TVM FFI. leading_dim=-1 defaults to t.ndim-1.T)assumed_alignenable_tvm_ffir   r   leading_dim)r   detachmark_layout_dynamicndim)r!   r(   r+   fully_dynamictensorr   r   r   to_cute_tensor:   s   
r1   c                 C   s   |dkrdS t ||  ||S )N   r   )min)total_mblocksnum_SMsnum_n_blocks
max_splitsr   r   r   num_splits_heuristicK   s   r8        r   qkvcu_seqlens_qcu_seqlens_k	seqused_q	seqused_kmax_seqlen_qmax_seqlen_k
page_tablesoftmax_scalecausalsoftcapwindow_size_leftwindow_size_rightlearnable_sinkm_block_sizen_block_sizenum_threads
num_splitspack_gqa_compute_capability	score_modmask_modblock_sparse_tensors
return_lseoutlseaux_tensorsreturnc           U      C   s  dd | ||fD \} }}| j dd \}}|du r(| j dd \}} ||  }!n|j d d }d} | j d }!|	durs|du sBJ d|	jtjksLJ d	|	d
dksWJ d|	j d }"|	j ||"fkseJ |j dd \}#}$|#|$ }%n	d\}#}$|j d }%|j d }&|j d
 }'|du r|	du r|j ||%|&|fksJ |j ||%|&|'fksJ n7|j |#|$|&|fksJ |j |#|$|&|'fksJ n |j |%|&|fksJ |j |%|&|'fksJ |j |d fksJ d|dur|j |d fksJ d|du s|j |fksJ d|du s
|j |fks
J d| jtjtjfv sJ d| j|j  kr*|jks/J d J d||||fD ]}(|(durS|(jtjksGJ d|(ddksSJ dq5|durn|j |fkscJ |jtjksnJ dtdd | |||||||	|f	D sJ d||& dksJ d|dksJ dd|   })||) dksJ d|) |'|) dksJ d|) |
du rd t	
| }
|d!krd}||& }*|du r|*dk}| j}+| j},|du r|| fn|!f}-|du r||| fn||!f}.| jp|jp|j}/|du rtjg |-||'R |+|,d"}nt|d#g |-||'R |+|, |du rB|/s6|r?tj|.tj|,d"nd}n|durPt|d$|.tj|, t| j }0|du r]t n|}1|1d%v shJ d&|du}2|du r|rvd}|dup~|du}3|dus|dur|du r|dkrd'\}}3d}n	d(\}}3nd)\}}3ttj j}4|1d*kr||'  krd+krn n|s|3s|2sd,}|1d-v r|rd+|* dkrd.}|r|dkr|du rd.}|du r|du r| n|!}|du r|%}||* }5|1d/kr|5|krdnd}6nd}6|dk rR|6| }7|3s"|ntdt||| d | }8|8| d | }9|5|7 d |7 }:||& |: };t|;tj|,j|9d+}|dk}<|<rytj|g|-||'R tj|,d"}=tj|g|.R tj|,d"}>|durt|nd.}?|durt|nd.}@|dur|du sJ d0t|}|dup|dup|dup|du}A|dur|Artd1|2r|Artd2|r|jj d dkrd.}|<rtd3|0||'|*||?|@|2|durt |nd|du |du |du |du |du |	du|du|du|du|||6||<||1|$d4vf}B|Bt!j"vr)d5d |||||fD \}C}D}E}F}G|	dur>t#|	d6dd7nd}Hd8d | |||<sK|n|=fD \}I}J}K}L|<r^t#|>d6d9}Mn|durjt#|d6d9}Mnd}Md}N|dur| du r|t$d:t%||| |%|||6\}O}Pt&||O|Pd;}Qt'|Q}Nd}R|durd<d |D }R|1d*kr|	du sJ d=|<rJ d>t(|0||'|*||3|||d|d.d?d?|||dud@}Sn@|1d-v rt)||'|*||3|<||||6| o|3 o|du o|du o|< |||du|$d4v|dup|dudA}Snt$dB|1 dCt*j+|S|I|J|K|L|M|
|4|C|D|E|F|H|||G|N|RdDdEt!j"|B< d}T|durCt%||| |%|||6\}O}Pt&||O|Pd;}Tt!j"|B | |||<sO|n|=|<rU|>n||
|4|||||	||||T| |<rt,|=|>-d
d||durz|-d
dnd|| ||fS )Fa  Forward pass for FlashAttention.

    Args:
        ...
        score_mod: A callable that takes the attention scores and applies a modification.
        mask_mod: A callable that takes token position information and selectively masks
        block_sparse_tensors: A tuple of tensors used for block sparsity.
        return_lse: Whether to return the log softmax of the attention scores. If set to True will always calculate
        out: Optional pre-allocated output tensor. If None, will be allocated internally.
        lse: Optional pre-allocated log-sum-exp tensor. If None, will be allocated when needed.
        aux_tensors: Some score_mods will want to read from global aux_tensors. This is how we thread them through to the inner kernel.
    c                 S      g | ]}t |qS r   )r   .0r!   r   r   r   
<listcomp>   s    z#_flash_attn_fwd.<locals>.<listcomp>N   r   r   z-page_table is not supported with cu_seqlens_kzpage_table must be int32r   z3page_table must be contiguous in the last dimensionNNz.cu_seqlens_k must have shape (batch_size + 1,)z.cu_seqlens_q must have shape (batch_size + 1,)z'seqused_q must have shape (batch_size,)z'seqused_k must have shape (batch_size,)z"inputs must be float16 or bfloat16zinputs must have the same dtypez>cu_seqlens_q, cu_seqlens_k, seqused_q, seqused_k must be int32zCcu_seqlens_q, cu_seqlens_k, seqused_q, seqused_k must be contiguouszlearnable_sink must be bfloat16c                 s   s    | ]
}|d u p
|j V  qd S N)r    rZ   r   r   r   	<genexpr>   s
    
z"_flash_attn_fwd.<locals>.<genexpr>zinputs must be on CUDA devicez)num_head must be divisible by num_head_kv   z*head_dim must be less than or equal to 256r'   zhead_dim must be divisible by z head_dim_v must be divisible by g      ?        )r   r   rU   rV   )	   
      z:Unsupported compute capability. Supported: 9.x, 10.x, 11.x)TF)FT)FFre   r9      )rf   rg   Frf   z-softcap and score_mod cannot be used togetherzgmask_mod with aux_tensors is not yet supported for varlen sequences. This will be fixed in a future PR.z\Block sparsity is not yet supported for varlen sequences. This will be fixed in a future PR.z_Block sparsity is not yet supported with SplitKV. TODO: partition sparse block lists per split.)Nr9   c                 S   &   g | ]}|d urt |dddnd qS Nr2   r   r(   r+   r1   rZ   r   r   r   r\         r2   rk   c                 S   rY   r   rl   rZ   r   r   r   r\     s    )r(   zHBlock sparsity requires fixed-length sequences (seqlen_q must be known).)expected_count_shapeexpected_index_shapec                 S   s   g | ]	}t |d ddqS )NT)r(   r/   rl   )r[   bufr   r   r   r\     s    z paged KV not supported on SM 9.0zSplitKV not supported on SM 9.0T)	is_causalis_localrO   tile_mtile_n
num_stagesrM   	Q_in_regsintra_wg_overlapmma_pv_is_rsrR   rQ   has_aux_tensors)qhead_per_kvheadrq   rr   is_split_kvrO   rK   rL   q_stageis_persistentrQ   rR   ry   paged_kv_non_tmais_varlen_qz Unsupported compute capability: z. Supported: 9.x, 10.x, 11.x--enable-tvm-ffioptions).r   r   r   int32r   float16bfloat16allelement_sizemathsqrtr   requires_grademptyr&   float32torch2cute_dtype_mapr   r   CUstreamcurrent_streamcuda_streammaxr3   r8   get_device_propertiesmulti_processor_countr   hash_callablecreate_softcap_scoremodNotImplementedErrormask_block_cntlen_flash_attn_fwdcompile_cacher1   
ValueErrorr	   r
   r   r   r   cutecompile_flash_attn_fwd_combine	transpose)Ur;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   num_headhead_dim
batch_sizeseqlen_qtotal_qmax_num_pages_per_seq	num_pages	page_sizeseqlen_knum_head_kv
head_dim_vr!   	alignmentrz   out_torch_dtyper   q_batch_seqlen_shape	lse_shaper   r   compute_capabilityuse_block_sparsitylocalr   seqlen_q_packgqar|   m_block_size_effectiveseqlen_k_loadedr6   num_m_blocksr4   r{   out_partiallse_partialscore_mod_hashmask_mod_hash	is_varlencompile_keycu_seqlens_q_tensorcu_seqlens_k_tensorseqused_q_tensorseqused_k_tensorlearnable_sink_tensorpage_table_tensorq_tensork_tensorv_tensoro_tensor
lse_tensorsparse_tensorsrn   ro   compile_time_normalizedcute_aux_tensorsfa_fwdnormalized_block_sparse_tensorsr   r   r   r   U   sl  .








.
































	
r   r   r   
cu_seqlenssequsednum_splits_dynamic_ptrsemaphore_to_resetc                 C   sP  |   dv s
J d|  dv sJ d| jtjtjtjfv s#J d|jtjks-J d| jr3|js7J d| dd	ksBJ d
|dd	ksMJ d|j| jdd ksYJ |   dk}|j| jd	d ksmJ d|dur|j|jd	d ksJ d|jtjksJ d|df|df|dffD ]*\}	}
|	dur|	jtj	ksJ |
 d|	jsJ |
 d|	
 sJ |
 dq| jd }| jd }|dksJ |dkrdnd}|d dkrdn	|d dkrdnd}ttt|d}|dkrt|d}ttj j}t|j }t| j }|||||||du|du|duf	}|tjvrt| |s.dnd d!}t|d|jd" d#}t||sCd nd"d!}|durVt|d|jd" d#nd}d$d% ||||fD }|\}}}}t||||||d&}|j||||||dd'std(tj||||||||||d)d*tj|< tj| | ||||||||	 dS )+aT  Forward combine kernel for split attention computation.

    Combines partial outputs and log-sum-exp values from multiple splits
    of attention computation into final outputs.

    Args:
        out_partial: Partial outputs tensor (num_splits, batch, seqlen, nheads, headdim) or
                                            (num_splits, total_q, nheads, headdim) if there's cu_seqlens
        lse_partial: Partial LSE tensor (num_splits, batch, seqlen, nheads) or
                                       (num_splits, total_q, nheads) if there's cu_seqlens
        out: Output tensor (batch, seqlen, nheads, headdim) or (total_q, nheads, headdim) if there's cu_seqlens
        lse: Output LSE tensor (batch, seqlen, nheads) or (total_q, nheads) if there's cu_seqlens.
        cu_seqlens: Cumulative sequence lengths for variable length sequences
        seqused: Used sequence lengths for each batch
        num_splits_dynamic_ptr: Dynamic number of splits per batch
        semaphore_to_reset: Semaphore for synchronization
        k_block_size: Block size for head dimension

    Returns:
        None
    )r2      z'out_partial must have 4 or 5 dimensions)   r2   z'lse_partial must have 3 or 4 dimensionsz'out_partial must be fp16, bf16, or fp32zlse_partial must be fp32ztensors must be on CUDA devicer   r   z4out_partial must be contiguous in the last dimensionr]   z6lse_partial must be contiguous in the seqlen dimensionNr2   zout shape mismatchzlse shape mismatchzlse must be fp32r   r   r   z must be int32z must be on CUDA devicez must be contiguousr   rc   @   r9      r'       r   r   r*   r^   rk   c                 S   ri   rj   rl   rZ   r   r   r   r\     rm   z+_flash_attn_fwd_combine.<locals>.<listcomp>)r   dtype_partialr   rK   k_block_sizelog_max_splits)rM   zIFlashAttention combine kernel cannot be implemented with given parametersr   r   )dimr   r   r   r   r   r    r   r   r   is_contiguousr   r   ceillog2r   r   r   r   r   r   r   r1   r.   r   can_implementRuntimeErrorr   r   )r   r   rU   rV   r   r   r   r   r   r!   r"   r   rN   r   rK   r   r   r   r   r   out_partial_tensorlse_partial_tensor
out_tensorr   optional_tensorscu_seqlens_tensorseqused_tensornum_splits_dynamic_tensorsemaphore_tensor
fa_combiner   r   r   r   W  s    

"




	
	r   c                    sT   t dd dv }|rS ddd  dd  fd	d
fdd}|S )a   
    Decorator for flash_attn_varlen_func:
    - On first call, run several warmup passes with different flag combinations:
        * return_softmax_lse in {False, True}
        * global noncausal (window_size=(None,None))
        * causal (window_size=(None,0))
        * local sliding window (window_size=(64,64))
        * optionally pack_gqa=True if qheads > kvheads and allowed
    - No score_mod / softcap (not supported for varlen yet)
    - Executes sequentially to minimize peak GPU mem
    - Does not modify user tensors (clones)
    SGLANG_DISABLE_FA4_WARMUP )1trueyesonFc                    s4   dd  t  fdd| D  fdd| D fS )zEClone tensor arguments to avoid sharing storage; deepcopy for others.c                 S   s"   t | tjr|   S t| S ra   )
isinstancer   Tensorr,   clonecopydeepcopyr   r   r   r   maybe_clone&  s   
z;warmup_flash_attn.<locals>._clone_args.<locals>.maybe_clonec                 3   s    | ]} |V  qd S ra   r   )r[   ar   r   r   rb   +  s    z9warmup_flash_attn.<locals>._clone_args.<locals>.<genexpr>c                    s   i | ]	\}}| |qS r   r   )r[   r<   r=   r   r   r   
<dictcomp>+  s    z:warmup_flash_attn.<locals>._clone_args.<locals>.<dictcomp>)tupleitemsargskwargsr   r   r   _clone_args#  s   z&warmup_flash_attn.<locals>._clone_argsc                 S   sx   t | dkr
| d n|d}t | dkr| d n|d}zt|jd }t|jd }||fW S  ty;   Y dS w )z*Infer q and kv head counts from arguments.r   r;   r   r<   r]   r_   )r   getintr   	Exception)r   r   r;   r<   qhkvhr   r   r   _infer_heads/  s   
z'warmup_flash_attn.<locals>._infer_headsc                    s   | |\}}||\}}|duo!|duo!|| dko!|| dk}d|v o+|d du}g d}ddg}	g }
|D ]}|	D ]}|
 t||d q<q8|rZ|D ]}|
 t|ddd	 qM|re|
 td
dd |
D ]f} ||\}}|dd d|v r|d rd|d< || tjtj * z	|i | W n ty } zt	
d| W Y d}~nd}~ww W d   n1 sw   Y  ~~tj  t  qgdS )z<Run warmup calls sequentially and release memory after each.Nr   r   rD   )r_   )Nr   )r   r   FT)window_sizereturn_softmax_lse)r  r  rO   r_   rQ   rG   rd   zWarmup combo skipped: %s)appenddictpopupdater   r   streamr   r   loggerdebugempty_cachegccollect)r   r   	base_argsbase_kwargsr   r   can_pack_gqahas_page_tablewindow_presets	lse_flagscomboswsreturn_lse_flagcombowawke)r   r  fr   r   _run_warmups;  sN   &


z'warmup_flash_attn.<locals>._run_warmupsc                     s*   st d  | | d| i |S )NzJRunning FA4 warmup (global/causal/local, LSE on/off, optional GQA pack)...T)r	  infor   )r  doner  r   r   wrappert  s   
z"warmup_flash_attn.<locals>.wrapper)osgetenvlower)r  disable_warmupr  r   )r   r  r  r  r  r   warmup_flash_attn  s   9
r$  r_   rd   r  r  c                 C   sN   t | ||||||f|||	|
d |
d |||||||d\}}|r%||fS |S )Nr   r   )rD   rE   rF   rH   rI   rJ   rG   rN   rO   rT   rQ   rW   )r   )r;   r<   r=   r>   r?   r@   rA   rD   rE   rF   r  rJ   rG   rN   rO   r  rQ   rW   rU   rV   r   r   r   flash_attn_varlen_func  s.   r%  )r'   r   F)NNNNNNNNFNNNNr9   r9   r:   r   NNNNNFNNN)NNNNN)NNNNNNFr_   Nrd   r   NFNN);r   r  loggingr   r   	functoolsr   typingr   r   r   	getLogger__name__r	  cuda.bindings.driverbindingsdriverr   cutlasscutlass.cuter   r   cutlass.cute.runtimer   flash_attn_origin.cuter   %flash_attn_origin.cute.block_sparsityr   r	   r
   r    flash_attn_origin.cute.flash_fwdr   (flash_attn_origin.cute.flash_fwd_combiner   &flash_attn_origin.cute.flash_fwd_sm100r   r   r   r&   r1   r   Float16r   BFloat16r   Float32r   r8   r   r   floatboollistr   r   r   r$  r%  r   r   r   r   <module>   s  


	
 !
    	
 2v	
