o
    iC^                  0   @   sn  d dl Z d dlmZmZ d dlZd dlm  mZ d dl	Z	d dl
mZ d dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d
d Zeje	jeje	jej e	j!iZ"																dAdej#dej#dej#deej# deej# deej# deej# deej# dee$ de%dee$ dee& dee& deej# de&de&de&d ee% d!ee& d"eej#ej#f f(d#d$Z'i e'_(			%	&		'	(	(				(	(	(	dBdej#dej#dej#d)ej#d*ej#d+ej#dee$ de%de$de&de&de&d,e&d-e&d.e%d/e%d0e%d1e&d2e&d3e&d4e%d"eej#ej#ej#f f,d5d6Z)i e)_*i e)_(i e)_+G d7d8 d8ej,j-Z.G d9d: d:ej,j-Z/			;		%	dCdej#dej#dej#dee$ de%d<eee& ee& f deej# de$d ee% fd=d>Z0								;		%	dDdej#dej#dej#deej# deej# deej# deej# deej# dee$ de%d<eee& ee& f deej# de$d ee% fd?d@Z1dS )E    N)OptionalTuple)from_dlpack)utils)FlashAttentionForwardSm80FlashAttentionForwardSm90)FlashAttentionForwardSm100) FlashAttentionBackwardPreprocess)FlashAttentionBackwardSm80)!FlashAttentionBackwardPostprocessc                 C   s"   | d ur|  ddkr|  S | S )N   )stride
contiguous)x r   O/home/ubuntu/vllm_env/lib/python3.10/site-packages/flash_attn/cute/interface.pymaybe_contiguous)   s   "r   F     qkvcu_seqlens_qcu_seqlens_k	seqused_q	seqused_k
page_tablesoftmax_scalecausalsoftcapwindow_size_leftwindow_size_rightlearnable_sinkm_block_sizen_block_sizenum_threadspack_gqa_compute_capabilityreturnc           9      C   s@  dd | ||fD \} }}| j dd  \}}|d u r(| j d d \}}|| }n|j d d }d }| j d }|d urs|d u sBJ d|jtjksLJ d|d	dksWJ d
|j d }|j ||fkseJ |j d d \}}|| }n	d\}}|j d }|j d }|j d	 }|d u r|d u r|j ||||fksJ |j ||||fksJ n7|j ||||fksJ |j ||||fksJ n |j |||fksJ |j |||fksJ |j |d fksJ d|d ur|j |d fksJ d|d u s|j |fksJ d|d u s
|j |fks
J d| jtjtjfv sJ d| j|j  kr*|jks/J d J d||||fD ]}|d urS|jtjksGJ d|ddksSJ dq5|d urn|j |fkscJ |jtjksnJ dtdd | ||||||||f	D sJ d|| dksJ d|dksJ dd|   }|| dksJ d| || dksJ d| |d u rdt	
| }|
d krd }
|| } |d u r| dk}| j}!| j}"|d u r||fn|f}#tjg |#||R |!|"d!}$|d u r|||fn||f}%| jp|jp|j}&|&r"tj|%tj|"d!nd }'t| j }(d"d | |||$fD \})}*}+},|'d urLt|' d#d$j|'jd d%nd }-d&d |||||fD \}.}/}0}1}2|d urpt| d#d$jdd%nd }3|	rwd}|d up|d u}4|d us|d ur|d u r|dkrd'\}	}4nd(\}	}4|d u rtj d n|}5|5d)v sJ d*ttj j}6|5d+kr|	s|4sd,}|5d-kr|rd.|  dks|d us|d urd/}|(||| |	|
d u|'d u |d u |d u |d u |d u |d u|d u|d u|d u|||||5f}7|7tjvr|5d+kr=|d u s#J d0|d u s,J d1t|(||| |	|4|||d|d/d2}8n1|5d-krf|d3v sKJ d4t||| |	|4||	 ob|4 ob|d u ob|d u d5}8ntd6|5 d7t|8|)|*|+|,|-||6|.|/|0|1|3|
|||2tj|7< tj|7 |)|*|+|,|-||6|.|/|0|1|3|
|||2 |$|'fS )8Nc                 S      g | ]}t |qS r   r   .0tr   r   r   
<listcomp>L       z#_flash_attn_fwd.<locals>.<listcomp>   r   r   z-page_table is not supported with cu_seqlens_kzpage_table must be int32r   z3page_table must be contiguous in the last dimensionNNz.cu_seqlens_k must have shape (batch_size + 1,)z.cu_seqlens_q must have shape (batch_size + 1,)z'seqused_q must have shape (batch_size,)z'seqused_k must have shape (batch_size,)"inputs must be float16 or bfloat16inputs must have the same dtypez>cu_seqlens_q, cu_seqlens_k, seqused_q, seqused_k must be int32zCcu_seqlens_q, cu_seqlens_k, seqused_q, seqused_k must be contiguouszlearnable_sink must be bfloat16c                 s   s    | ]
}|d u p
|j V  qd S Nis_cudar,   r   r   r   	<genexpr>z   s    z"_flash_attn_fwd.<locals>.<genexpr>inputs must be on CUDA device)num_head must be divisible by num_head_kv   *head_dim must be less than or equal to 256   head_dim must be divisible by  head_dim_v must be divisible by       ?        dtypedevicec                 S   *   g | ]}t | d dj|jd dqS r?   assumed_alignr   leading_dimr   detachmark_layout_dynamicndimr,   r   r   r   r/             rI   rK   c                 S   s0   g | ]}|d urt | ddjddnd qS )NrR   rI   r   rK   r   rN   rO   r,   r   r   r   r/      s    ")TF)FT)	   
   z4Unsupported compute capability. Supported: 9.x, 10.xrT      rU   r   Fz paged KV not supported on SM 9.0z"Sm90 doesn't support additive sink)	is_causalis_localr'   r$   r%   
num_stagesr&   	Q_in_regs)Nr   z7Only page_size=128 is supported for paged KV on SM 10.0)qhead_per_kvheadrW   rX   r'   is_persistentz Unsupported compute capability: z. Supported: 9.x, 10.x) shaperE   torchint32r   float16bfloat16allelement_sizemathsqrtrF   emptyrequires_gradfloat32torch2cute_dtype_mapr   rN   rO   rP   cudaget_device_capabilityCUstreamcurrent_streamcuda_stream_flash_attn_fwdcompile_cacher   r   
ValueErrorcutecompile)9r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   num_headhead_dim
batch_sizeseqlen_qtotal_qmax_num_pages_per_seq	num_pages	page_sizeseqlen_knum_head_kv
head_dim_vr.   	alignmentr[   out_torch_dtyperF   q_batch_seqlen_shapeout	lse_shaperg   lserE   q_tensork_tensorv_tensoro_tensor
lse_tensorcu_seqlens_q_tensorcu_seqlens_k_tensorseqused_q_tensorseqused_k_tensoradditive_sink_tensorpage_table_tensorlocalcompute_capabilityrm   compile_keyfa_fwdr   r   r   ro   4   s  






 .

.


 

,&


(	

 

ro   rC   @   r=   r2   r   doutr   num_stages_Qnum_stages_dO
SdP_swapAB
dKV_swapAB	dQ_swapABAtomLayoutMSdPAtomLayoutNdKVAtomLayoutMdQ	V_in_regsc           B      C   s  dd | |||||fD \} }}}}}| j \}}}}|j \}}}}|j \}}}}|j ||||fks3J |j ||||fks>J |j ||||fksIJ |j ||||fksTJ |j |||fks`J d| jtjtjfv smJ d| j|j  kr|j  kr|j  kr|jksJ d J d|jtjksJ dtdd | |||||fD sJ d	|| d
ksJ d|dksJ dd|   }|| d
ksJ d| || d
ksJ d| |d u rdt	| }|| }| j
}||	 d |	 |	 } |d d d d }!t| }"t|}#t|}$tj||| |! tj|d}%tj||| tj|d}&tj||| tj|d}'|dkrh||
 d |
 |
 }(|d d d d })tj|||(|! tj|d}*tj|||(|) tj|d}+t| j },dd | |||||"|#|$fD \}-}.}/}0}1}2}3}4t| ddjdd}5dd |%|&|'fD \}6}7}8|dkrdd |*|+fD \}9}:ttj j};|,||	|f}<|<tjvrt|,||	|d}=t|=|0|1|7|5|8|6|;tj|<< tj|< |0|1|7|5|8|6|; |,|||||dk|	|
||||||||||f}>|>tjvr8t|,||||	|
|||||||||||d}?t|?|-|.|/|1|8|7|6|dkr(|3n|9|dkr0|4n|:||;tj|>< tj|> |-|.|/|1|8|7|6|dkrJ|3n|9|dkrR|4n|:||; |,||	|||f}@|@tjvrzt|,||	|||}At|A|6|2||;tj|@< tj|@ |6|2||; |dkr|,||
|||f}@|@tjvrt|,||
|||}At|A|9|3||;tj|@< tj|@ |9|3||; |,||
|||f}@|@tjvrt|,||
|||}At|A|:|4t d|;tj|@< tj|@ |:|4t d|; |"|#|$fS )Nc                 S   r*   r   r+   r,   r   r   r   r/     r0   z#_flash_attn_bwd.<locals>.<listcomp>z4lse must have shape (batch_size, num_head, seqlen_q)r5   r6   zlse must be float32c                 s   s    | ]}|j V  qd S r7   r8   r,   r   r   r   r:     s    z"_flash_attn_bwd.<locals>.<genexpr>r;   r   r<   r=   r>   r?   r@   rA   rB   r       rD   c                 S   rG   rH   rM   r,   r   r   r   r/   *  rQ   rR   rI   r2   rK   c                 S   $   g | ]}t | d djddqS r?   rI   r2   rK   rS   r,   r   r   r   r/   /      c                 S   r   r   rS   r,   r   r   r   r/   4  r   )r&   rC   )r   )!r]   rE   r^   r`   ra   rh   rb   rc   rd   re   rF   
empty_likerf   zerosri   r   rN   rO   rj   rl   rm   rn   _flash_attn_bwdcompile_cache_prer	   rr   rs   rp   r
   compile_cache_postr   cutlassFloat32)Br   r   r   r   r   r   r   r   r    r$   r%   r&   r   r   r   r   r   r   r   r   r   rv   rw   rt   ru   _r|   r}   r~   r   r[   rF   seqlen_q_roundedhead_dim_roundeddqdkdvdq_accumdpsumlse_log2seqlen_k_roundedhead_dim_v_roundeddk_accumdv_accumrE   r   r   r   r   	do_tensor	dq_tensor	dk_tensor	dv_tensorr   dq_accum_tensordpsum_tensorlse_log2_tensordk_accum_tensordv_accum_tensorrm   compile_key_pre
fa_bwd_prer   fa_bwd_sm80compile_key_postfa_bwd_postr   r   r   r      s
  &B&








	






r   c                   @   s|   e Zd Ze						ddejdejdejdee d	ed
e	ee
 ee
 f deej dedee fddZedd ZdS )FlashAttnFuncNFr3   rC   r   r   r   r   r   window_sizer#   r    r'   c
                 C   sZ   t ||||||d |d |||	d
\}
}| ||||
| || _|| _|| _|| _|
|fS )Nr   r   )r   r   r!   r"   r#   r    r'   ro   save_for_backwardr   r   r   r    )ctxr   r   r   r   r   r   r#   r    r'   r   r   r   r   r   forward  s$   
zFlashAttnFunc.forwardc              
   G   sD   | j \}}}}}t||||||| j| j| j	\}}	}
||	|
gdR S )N)NNNNN)saved_tensorsr   r   r   r    )r   r   argsr   r   r   r   r   r   r   r   r   r   r   backward  s   
zFlashAttnFunc.backwardNFr3   NrC   N__name__
__module____qualname__staticmethodr^   Tensorr   floatboolr   intr   r   r   r   r   r   r     s:    	
r   c                   @   s   e Zd Ze									ddejdejdejdeej d	eej d
eej deej deej dee dede	ee
 ee
 f deej dedee fddZedd ZdS )FlashAttnVarlenFuncNFr3   rC   r   r   r   r   r   r   r   r   r   r   r   r#   r    r'   c                 C   sl   t |||||||||	|
|d |d |||d\}}| |||||||||	 |	| _|
| _|| _|| _||fS )Nr   r   )r   r   r   r!   r"   r#   r    r'   r   )r   r   r   r   r   r   r   r   r   r   r   r   r#   r    r'   r   r   r   r   r   r     s.   
zFlashAttnVarlenFunc.forwardc              	   G   s    | j \	}}}}}}}	}
}td)NzWBackward pass for FlashAttention with variable length sequences is not implemented yet.)r   NotImplementedError)r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r     s   zFlashAttnVarlenFunc.backward)	NNNNFr3   NrC   Nr   r   r   r   r   r     sT    	
)r   r3   r   c	           	      C   s   t | ||||||||	S r7   )r   apply)	r   r   r   r   r   r   r#   r    r'   r   r   r   flash_attn_func  s   r   c                 C   s$   t | |||||||||	|
|||S r7   )r   r   )r   r   r   r   r   r   r   r   r   r   r   r#   r    r'   r   r   r   flash_attn_varlen_func#  s    r   )NNNNNNFNNNNr   r   r   NN)NFrC   r   r   r=   r2   r2   FFFr2   r2   r2   Fr   )NNNNNNFr3   NrC   N)2rd   typingr   r   r^   cuda.bindings.driverbindingsdriverrj   r   cutlass.cuterr   cutlass.cute.runtimer   flash_attn.cuter   flash_attn.cute.flash_fwdr   r   flash_attn.cute.flash_fwd_sm100r   $flash_attn.cute.flash_bwd_preprocessr	   flash_attn.cute.flash_bwdr
   %flash_attn.cute.flash_bwd_postprocessr   r   r`   Float16ra   BFloat16rh   r   ri   r   r   r   r   ro   rp   r   r   r   autogradFunctionr   r   r   r   r   r   r   r   <module>   s  	

 6
	

 438	
	
