o
     i                     @   s  d dl Z d dlZd dlmZ d dlmZmZmZmZm	Z	m
Z
mZmZmZmZmZ d dlZddlmZ ddlmZ dd	lmZmZmZmZmZmZmZmZ dd
lmZmZm Z m!Z! de
ej" de#fddZ$dede%fddZ&dede%fddZ'dede%fddZ(eG dd de Z)ese rddl*m+Z+m,Z, ndZ+dZ,de%fddZ-dej.de%fddZ/eG dd  d eZ0d!ej"d"e
ej" d#ej"d$ej"fd%d&Z1d!ej"d"e
ej" d#eej" d$eej" fd'd(Z2d#e	ej" d$e	ej" d!ej"d"ej"d)ej"d*ej"dee	ej" e	ej" f fd+d,Z3		d8d!ej"d"e
ej" d#eej" d$eej" d)e
ej" d*e
ej" deee#e4f ee4e4e4f f fd-d.Z5d/d0 d1D Z6e6d Z7e6d Z8e6d2 Z9e6d3 Z:e6d4 Z;e6d5 Z<e6d6 Z=e6d7 Z>dS )9    N)	dataclass)TYPE_CHECKINGAnyDictIterableListOptionalSequenceTupleTypeUnioncast   )_is_triton_available   )register_operator   )*BlockDiagonalCausalWithOffsetGappyKeysMask+BlockDiagonalCausalWithOffsetPaddedKeysMaskBlockDiagonalGappyKeysMaskBlockDiagonalPaddedKeysMask/PagedBlockDiagonalCausalWithOffsetGappyKeysMask0PagedBlockDiagonalCausalWithOffsetPaddedKeysMaskPagedBlockDiagonalGappyKeysMask PagedBlockDiagonalPaddedKeysMask)AttentionFwOpBaseContextInputscheck_lastdim_alignment_stride1xstride_namesc                 G   s@   | d u rdd |D S | j t|ksJ dd t||  D S )Nc                 S   s   i | ]}d | dqS )stride_N ).0namer"   r"   S/home/ubuntu/.local/lib/python3.10/site-packages/xformers/ops/fmha/triton_splitk.py
<dictcomp>*   s    z_strides.<locals>.<dictcomp>c                 S   s   i | ]
\}}d | |qS )r!   r"   )r#   r$   sr"   r"   r%   r&   ,   s    )ndimlenzipstride)r   r    r"   r"   r%   _strides(   s   r,   	attn_biasreturnc                 C   s   t | ttttfS N)
isinstancer   r   r   r   r-   r"   r"   r%   _is_supported_causal_bias/   s   r2   c                 C      t | ttfS r/   )r0   r   r   r1   r"   r"   r%   _is_supported_gappy_bias;      r4   c                 C   r3   r/   )r0   r   r   r1   r"   r"   r%   _is_supported_paged_biasE   r5   r6   c                       sP   e Zd ZU dZdZeej ed< dZ	eej ed< e
def fddZ  ZS )	InputsFp8z
    Each of k/v_fp8_scales is an int32 tensor of shape (1, B * Mkv, Hq),
    or (1, page_size * max_pages_per_lane, Hq) in the paged case.
    Each int32 element contains two packed fp16 number
    - scales and shifts for row-wise FP8 quantization.
    Nk_fp8_scale_shiftv_fp8_scale_shiftr.   c                    sF   t t| j| jdur| j  nd | jdur | j   S d S )zP
        Number of bytes in the input, not counting the attention bias.
        Nr   )superr7   nbytesr8   untyped_storager9   )self	__class__r"   r%   r;   [   s   



zInputsFp8.nbytes)__name__
__module____qualname____doc__r8   r   torchTensor__annotations__r9   propertyintr;   __classcell__r"   r"   r>   r%   r7   O   s   
 r7   )_fwd_kernel_splitK_splitK_reducec                   C   s   t jjd uS r/   )rD   versioncudar"   r"   r"   r%   _is_cudav   s   rN   devicec                 C   s   t  o
tj| dkS )N   r   )rN   rD   rM   get_device_capability)rO   r"   r"   r%   _is_cuda_at_least_sm80z   s   rS   c                       s  e Zd ZU dZdZdhZdZejej	hZ
dZedejeeeeeeeef
Zee ed< dZdZdZdZdZd	ZdZ e!e" ed
< dZ#dZ$e%ed< dZ&dZ'g dZ(dZ)e"ed< dZ*e"ed< dZ+e"ed< dZ,e"ed< e-de"de"de"de"de.e/ f
 fddZ0e-de1de.e/ f fddZ2e-d e"d!e"d"e"d#e"de"de"fd$d%Z3e-d&d' Z4e-d(e1de5e!ej e!ej f fd)d*Z6e-d(e1d+e%de5eje!e7 f fd,d-Z8e-e9j:dddddd.d/e"d0e!e" d1e!e" d2e!e" d3e!e" d4e!e% de;e< fd5d6Z=  Z>S )7FwOpa>  Flash-Attention with Split-K. Supports fused int4 and fp8 K/V quantization.
    Quantized path will be taken if input K/V have type int32.

    Int4 quantization can be row-wise or group-wise (when cls.NUM_GROUPS > 1) along
    the last dimension of K and V. Currently 1, 2, 4, or 8 groups per row are supported.
    Quantization coefficients (scale and shift) are represented as two
    float16 constants per group, packed into int32. Quantization coefficients of
    all groups are placed at the beginning of the row. So, if unquantized K/V have head
    dimension D, the quantized versions have head dimension D // 8 + NUM_GROUPS
    and dtype int32.
    Pseudocode for dequantizing one row can look like:
    group_size = D // 8
    for i in range(NUM_GROUPS):
        group_start = NUM_GROUPS + i * group_size
        group_quant = K[..., group_start: group_start + group_size]
        scale, shift = unpack_int32_into_float16x2(group_quant[0])
        group_dequant = group_quant[..., 1:] * scale + shift
    ...

    For fp8 only row-wise quantization is supported. To use it, provide input of type
    xformers.ops.fmha.triton_splitk.InputsFp8 (instead of the usual xformers.ops.fmha.Inputs) to
    xformers.ops.fmha.triton_splitk.FwOp.apply or xformers.ops.fmha._memory_efficient_attention_forward.

    This op uses Paged Attention when bias is one of the Paged* classes.
    In this case bias has additional fields:
    - block_tables of shape [batch_size, max_num_pages]
    - K/V of shape [1, max_num_pages * page_size, num_heads, head_dim]
      or [1, max_num_pages * page_size, num_groups, num_heads, head_dim]

    The shape which the kernel takes the queries and the output
    is quite different from the user interface. There are three
    types of input (a) no bias / tensor bias, (b) variable q_len
    (which is only for non causal) and (c) other bias objects.
    From the interface to the kernel the following changes happen.

    (0) In all cases, a group dimension may need to be added.

    (1) For (c), a batch dimension is created, reshaping from (1, B*Mq, G, Hq, K)
        to (B, Mq, G, Hq, K)

    (2) For (a) and (c), in the case of multiquery (i.e. the head dimension
        of keys and values is expanded), the head-swapping trick
        reshaping from (B, Mq, G, Hq, K) to (B, M=Hq*Mq, G, H=1, K)

    (3) For (b), in the case of multiquery, the head-swapping trick
        trick, reshaping from (1, Mq, G, Hq, K) to (1, Mq*Hq, G, H=1, K)
        Note here that Mq is a single long dimension which spans all the queries
        in the batch, unlike in case (C). Also that Hq has to run faster than
        Mq in order that the queries in a batch element remain evenly spaced.

    In all cases, the shape as seen by the kernel is called (Bqq, Mqq, G, H, K).
    The kernel operates on B batch elements and M queries per batch element.
    TrM   rP      NSUPPORTED_ATTN_BIAS_TYPESFtriton_splitKFSPLIT_K    SPLIT_K_EARLY_EXITr   )r   r      rQ      BLOCK_M@   BLOCK_N
NUM_STAGESr   	NUM_WARPSMqMkvKKvr.   c                    sB   t  ||||}|dvr|d| d |dkr|d |S )N>      rU   r\   rY   r^      z
Embed dim z not supportedr   zQuery length is 0)r:   shape_not_supported_reasonsappend)clsrb   rc   rd   re   reasonsr>   r"   r%   rh      s   
z FwOp.shape_not_supported_reasonsdc           	         s  t t| |}tjjtjjfdk r|d t|d|j	d |j
jtjkr6t|d|j
d t|d|jd | jd u r@|d |jjdkrSt rSt|jsS|d	 |j	jd
 }t|jttf}t|j}t|j}|so|r|jj}||jd kr|d|jd  d|  |j}|r||jkr|d |dkr|r|d |r|jj}|j
jd
 | r|d|j
jd
 d|d | j r|d || j! r|d|d| j!d t|jtj"r|jj#dvr|d|jj d | j$d ur| j$d
kr|d |S )N)r   	   z+triton_splitk requires python 3.9 or above!queryrQ   keyvalueztriton is not availablerM   zJrequires NVidia GPU with sm80 minimum compute capacity, e.g., A100/H100/L4r   zExpected total z queries not z5Variable query len is not supported for causal masks.r\   zEQuery length should not be larger than 16 for causal attention biasesz_For paged attention, key.shape[1] should be divisible by the page size, but got d.key.shape[1]=z, page_size=.z/Paged attention doesn't support autotuning yet.zXFor paged attention, page size should be divisible by the block size, but got page_size=z, cls.BLOCK_N=)r[      zYAdditive attention bias has to have shape (B, G, H, Mq, Mkv) or (B, H, Mq, Mkv), but got z:Additive attention bias is not supported with split-k > 1.)%r:   rT   not_supported_reasonssysversion_infomajorminorri   r   rn   ro   dtyperD   int32rp   OPERATORrO   typerN   rS   shaper0   r-   r   r   r6   r2   	q_seqinfoseqstart_py
max_seqlen
min_seqlen	page_sizeAUTOTUNEr_   rE   r(   rX   )	rj   rl   rk   q_lenis_block_diagonalis_paged	is_causalseqinfor   r>   r"   r%   rt      s   









zFwOp.not_supported_reasonsBGHMkc                 C   sF  t || d}tjjrdt || d d| }d}t d|| |  d}	|dkr?||d  |k r?|d }|dkr?||d  |k s/||	krK|d }||	ksC|| d | }
|
| | }||
k ra|d7 }d}n3|dkrr|| | dkrrdS t |d| }|dkr|dkrdnd}|| }	d}||	kr|d }||	kst||}t |d}|S )z"Heuristic for the number of splitsr   i   r^   r   rU   rg   )maxrD   rL   hipmin)rj   r   r   r   r   rb   bhsplit_kmax_chunk_sizesplit_k_stop_val
split_size
chunk_sizesplit_k_upper_boundr"   r"   r%   get_split_k@  s8   

zFwOp.get_split_kc                 C   s*   ddl m}m} | jr|| j S || jS )Nr   )_fwd_kernel_splitK_autotune_get_splitk_kernel)_triton.splitk_kernelsr   r   r   
NUM_GROUPS)rj   r   r   r"   r"   r%   
get_kernele  s   

zFwOp.get_kernelinpc                 C   s|   t |dsdS tt|}|j}|j}|d usJ |d usJ |jdkr-|d|dfS |jdkr6||fS td|j)Nr8   NNr   r   r[   zVFP8 scales have to be provided in BMH or BMGH format, but got k_fp8_scale_shift.shape=)	hasattrr   r7   r8   r9   r(   	unsqueeze
ValueErrorr}   )rj   r   inp_r8   r9   r"   r"   r%   get_fp8_scale_shiftq  s    



zFwOp.get_fp8_scale_shiftneeds_gradientc           :         sv  |  |\}}| }t|jtjs'd}tttt	t
ttttttf  |j}n|j}d}d}d}	d}
d}| \}}}d}d}d}t|t}t|}t|}|dur|s[|s[|s[J |jjj|jjksfJ |jj}|ddkssJ |r|jj}	|	ddksJ |jd dksJ t| |jdd \}}t|jjdk}|ot|}|o| }|jd }|r|jj}
d}|
ddksJ n	|  d||}|s|rdn d||f}| |}| |}|dur|dur| |dd }| |dd }|jd }|}n|j\ }}}|dur|j!dkr|"d}d}|jd dkr|ddkr|ddkr|du rd	}|r^|}|jd dksMJ |#dddd
d$ddd|}n|#dddd
d$|jd dd|}|ddddddddf }|ddddddddf }|dur|dur|ddddddddf }|ddddddddf }|j%tj&kr|dur|jd d }d}n"d}|jd | j' d }n|jd }d}| j'dksJ d| j'|j\}}}|j\} }!}||ksJ d| d| |r*|dusJ |dus#J |jj| n|!|r3|jj(nd}"d}#d}$|rM|jj)}#|#jd }$|#jd |" }n|durYt*||jj}| j+durc| j+n|du rq| , ||nd|!| j- d | j- | j- }%dk}&| |||f}'|&r|tj.krtj.ntj/}(| j0rtj1| |%|g|(|jd})n'tj2| |%|g|(|jd})ntj2| |!|g||jd#ddddd
d})d\}*}+|tj/tj.fv },|&s|r | j0rtj3| |!gt4d |&s|,r tj.ntj/|jd}+ntj2| |!g|&s|,rtj.ntj/|jd}+ fdd}-| d  }.|du}/| 5 }0| j6}1| j7}2| j8rGi }3n|| j9}4| j:}5tj;j<r dkr^d}4d}5d}2n^ dkrodkrod
}4d}5d}2nM dkrdk r~d
}4d}5nd}4d}5d}2n7d}4d}5d}2n0|dko|dkotj=> dk}6|6r|dkrd}4dkrd}1dkrd}1dkrd}1|1|2|4|5d}3|0|- dZ||||j?|)|+|#||	|
||||dt@|ddddd t@|d!d"d#d$d%t@|d&d'd(d)d*t@|)d+d,d-d.d/d0t@|+d1d2d3d4d5t@|#d6d7t@|d8d9d:d;d<t@|d=d>d?d@t@|dAdBdCdDi dE|$dF dGdHdIdJ|dK|.dL|dM|/dN|dO| j'dP|dQ|dR|&dS| j0dT|dU|"dV|&p`|dW|du|3 |&s0|)dddddddf }7|r|r|7 d|||#dd
dddA }7n|7 | |||}7|7#dddd
dA }7|r|+dusJ |+dddddddf }*|r|r|* d||#dddd
}*n#|* | ||}*|dur|s|*#dd
dd$d| | }*nd}*|jj!dkrdksJ |*dur|*dddf }*|7dddddf }7|*du r(|7dfS |7tB|7|*dXfS tj2|'|j|dY}7|+dus@J d}8|r|,rKtj.ntj/}9|du sV|rftj2| ||f|j|9dY}8|8}*ntj2d| | f|j|9dY}8|8 | |#d
ddd}*|)ddddddddd|!f })|r|r|) | |||#dddd
dd})|+ | ||#dddd
d}+n"|) | |||#dddd
dd})|+ | ||#dddd
d}+tC|7|*|)|+ |jj!dkrdksJ |7dddddf }7|8dur|8dddf }8|dkr|7D  |dur*|s*|7 d | ||}7|8du r3|7dfS |7tB|7|8dXfS )[a%  
        Note that inp can be of type InputsFp8, in which case K/V are assumed to be row-wise FP8-quantized.
        This is different from int4 quantization, where coefficients are kept together with the quantized
        values at the beginning of each row, and inp has type Inputs.
        NFr   r   rq   r[   r   Tr   rQ   zcls.NUM_GROUPS=zKeys have head dim z but queries have head dim )ry   rO   rs   r   infc                    s&   dd l }|| d    fS )Nr   r]   )tritoncdiv)METAr   r   r   r   Mr   r"   r%   gridM  s   zFwOp.apply.<locals>.gridrY   rg   r\   r^   )rQ   rm   )r]   r_   	num_warps
num_stages)Qrd   Vsm_scale
Out_splitK
LSE_splitkblock_tablesSeq_lenSeq_starts_kSeq_starts_qSeq_starts_q_multiplieradditive_biasK_fp8_scale_shiftV_fp8_scale_shiftqzqmqgqhqkkzknkgkhkkvzvnvgvhvkosk_zosk_gosk_hosk_sosk_mosk_klsek_zlsek_glsek_hlsek_slsek_mblocktableszblocktableslbias_bbias_gbias_hbias_qmbias_kmk_fp8_scale_shift_zk_fp8_scale_shift_nk_fp8_scale_shift_gk_fp8_scale_shift_hv_fp8_scale_shift_zv_fp8_scale_shift_nv_fp8_scale_shift_gv_fp8_scale_shift_hkv_cache_blocks_per_rowZr   r   N_CTX_QN_CTX_KBLOCK_N_PER_SPLITBLOCK_DMODELUSE_SEQ_LENPACKED_PER_VALN_GROUPS	IS_CAUSALNUM_QUERIES_CAUSAL	IS_SPLITKrZ   USE_PAGED_ATTENTION	PAGE_SIZE	WRITE_LSEHAS_ADDITIVE_BIAS)outlse)rO   ry   r"   )Er   get_output_dtyper0   r-   rD   rE   r   r   r   r   r   r   r   r   r   r   r   get_qkv_in_bmghkr4   r6   	k_seqinfoseqlenrO   rn   r+   seqstartr}   r)   boolr~   r   r2   viewr(   r   permutereshapery   rz   r   r   r   r   rX   r   MAX_BLOCK_Mfloat64float32rZ   zerosemptyfullfloatr   r]   r_   r   ra   r`   rL   r   rM   rR   scale_floatr,   
contiguousr   merge_attentionszero_):rj   r   r   r8   r9   output_dtypeattn_bias_tensorr-   seq_lenseq_starts_kseq_starts_qseq_starts_q_multiplierqkvr   r   
variable_qr   is_gappyr   HqKq
multiple_qKkvkv_shaperb   mqa_swap_seqlen_headLkr   _r   BqqMqqr   r   r   M_ceilr   output_shapeo_splitk_dtypeo_splitkr   
lse_splitkoutput_f64_lser   r   use_seq_lenkernelr]   r_   
extra_argsr   r   should_modify_warp_and_blockr   
output_lse	lse_dtyper"   r   r%   apply  s  








"&    




	










&'()*+,-./0123456789;& 

&




z
FwOp.apply)block_mblock_nr   r   split_k_early_exitsplitkr-  r.  r   r   r/  c                C   st   d| |d}|d ur||d< |d ur||d< |d ur ||d< |d ur(||d< |d ur0||d< t d| | f|S )	Ntriton_splitK)NAMErX   r]   r_   ra   r`   rZ   FwOp_S)r|   )rj   r0  r-  r.  r   r   r/  kwargsr"   r"   r%   get_operator  s$   zFwOp.get_operator)?r@   rA   rB   rC   r{   SUPPORTED_DEVICESCUDA_MINIMUM_COMPUTE_CAPABILITYrD   halfbfloat16SUPPORTED_DTYPESSUPPORTED_MAX_Kr|   rE   r   r   r   r   r   r   r   r   rV   r   r   rF   SUPPORTS_DROPOUTSUPPORTS_CUSTOM_SCALESUPPORTS_BMGHKSUPPORTS_OUTPUT_DTYPESUPPORTS_PARTIALr2  rX   r   rH   r   rZ   r   r   r   NUM_GROUPS_VALUESr]   r_   r`   ra   classmethodr   strrh   r   rt   r   r   r
   r   r   r,  	functools	lru_cacher   r   r5  rI   r"   r"   r>   r%   rT      s   
 6F$$
   	rT   attn_outlse_out
attn_split	lse_splitc                 C   s  dd l }ddlm} | j\}}}}	}
|j\}}}}}}|j\}}}}}||  kr-|krWn n(||  kr9|krWn n|	|  krE|krWn n||  krQ|krWn n|
|ksgJ d| jd|jd|j||kswJ d|jd|j|d ur|j\}}}}||kr||kr|	|kr||ksJ d| jd|j|| |	 d	k stjjrd
nd}||}||| |	 df}|| ||| |f||dt|ddddddt|dddddt| dddddt|dddd | jd! ||	|d u|d" d S )#Nr   r   )rK   $Incompatible shapes: attn_out.shape=z, attn_split.shape=z, lse_split.shape=z&Incompatible shapes: attn_split.shape=, lse_out.shape=rY   r[   r   )r   splitK_pow2r   r   r   r   r   r   r   r   r   r   r   ozomogohoklse_zlse_glse_hlse_mrq   )
BLOCK_SIZEr   r   r   r   )	r   r   rK   r}   rD   rL   r   next_power_of_2r,   )rF  rG  rH  rI  r   rK   r   r   r   r   r  B1G1H1r   M1Kq1B2G2H2split_k1M2B3G3H3M3r   rL  r   r"   r"   r%   r	  ;  s\   

" 
	
r	  c           	      C   st   ddl m} ddlm} t| |||\}}||t|d}|| g ||R | |d|| jd |d ud d S )	Nr   unroll_varargsr   )_splitK_reduce_varargsN)OutLSErq   )rV  r   )xformers.triton.vararg_kernelrg  r   rh  _prepare_reduce_kernel_paramsr)   r}   )	rF  rG  rH  rI  rg  rh  kernel_argsr   reduce_kernelr"   r"   r%   merge_attentions_varargso  s&   rq  	grad_attngrad_lsec                 C   s   ddl m} ddlm} dd | D }dd |D }	t||| |||\}
}||t| d}|| g | |||	R ||||d	|
d
|jd i ||	fS )Nr   rf  r   )_splitK_reduce_varargs_backwardc                 S      g | ]}t |qS r"   rD   
empty_liker#   r   r"   r"   r%   
<listcomp>      z5merge_attentions_varargs_backward.<locals>.<listcomp>c                 S   ru  r"   rv  rx  r"   r"   r%   ry    rz  ri  )rk  rl  DOutDLSErV  rq   )rm  rg  r   rt  rn  r)   r}   )rH  rI  rF  rG  rr  rs  rg  rt  dattn_splitkdlse_splitkro  r   reduce_kernel_backwardr"   r"   r%   !merge_attentions_varargs_backward  s:   	
r  c                 C   s`  | j \}}}}	}
|d j \}}}}}|d j \}}}}||  kr%|krOn n(||  kr1|krOn n|	|  kr=|krOn n||  krI|krOn n|
|kscJ d| j d|d j d|d j |d ur|j \}}}}||kr~||kr~|	|kr~||ksJ d| j d|j i }i }tt|D ]A}|t|| dt| dt| dt| d	t| d
t|  |t|| dt| dt| dt| dt|  q|| |	 dk stjjrdnd}||| |	 df}||	|d||}|t| ddddd |t|dddd |d ur,|t|dddd d! |t|d"d#d$d% ||fS )&Nr   rJ  z, attn_split[0].shape=z, lse_split[0].shape=rK  r   r   r   r   r   r   r   r   r   rY   r[   r   r   )r   r   r   rM  rN  rO  rP  rQ  rR  rS  rT  rU  dozdomdogdohdokdlse_zdlse_gdlse_hdlse_m)	r}   ranger)   updater,   rC  rD   rL   r   )rF  rG  rH  rI  rr  rs  r   r   r   r   r  rX  rY  rZ  r[  r\  r]  r^  r_  ra  rb  rc  rd  re  attn_split_strideslse_split_stridesir   r   ro  r"   r"   r%   rn    sj   	
""









 

rn  c                 C   s   i | ]}|t |qS r"   )rT   r5  )r#   r  r"   r"   r%   r&     s    r&   )r   r   r[   rQ   r\   rY   0   r^   H   P   `   p   rg   r[   rQ   r\   rY   r^   rg   r   )?rD  ru   dataclassesr   typingr   r   r   r   r   r   r	   r
   r   r   r   rD    r   commonr   r-   r   r   r   r   r   r   r   r   r   r   r   r   rE   rC  r,   r   r2   r4   r6   r7   r   rJ   rK   rN   rO   rS   rT   r	  rq  r  rH   rn  FwOp_MapFwOp_S1FwOp_S2FwOp_S4FwOp_S8FwOp_S16FwOp_S32FwOp_S64	FwOp_S128r"   r"   r"   r%   <module>   s   4(



     >
4

+
C