o
     i8                     @   s*  U d dl Z d dlmZmZmZ d dlZd dlZd dlmZ	 d dl
mZ d dlmZmZ dZedZdd ed	d
D Zd7dejdedejfddZG dd dZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd  d eZd!eiZeeee f e d"< ej!j"reed#< ej!j#re$eeed$ e j%j&e j%j'fd%kreed&< eed'< zd dl(Z(G d(d) d)eZ)e)ed*e(j* < W n	 e+y   Y nw d+d ed	d
D d,d d-D  Z,d.d/ Z-ej./d0d1d e- D d2d3 Z0d8d4d5Z1e2d6kre1  dS dS )9    N)AnyDictType)create_attn_bias)NotSupportedInputErrorbenchmark_main_helper2      ?cudac                 C   sD   g | ]}d D ]}t tddd|  dd| d|dtjjjdqqS ))      r
   r         BMqMkvHqHkvKattn_bias_type)dictmaxxopsfmha	attn_bias+BlockDiagonalCausalWithOffsetPaddedKeysMask.0ihkv r    _/home/ubuntu/.local/lib/python3.10/site-packages/xformers/benchmarks/benchmark_attn_decoding.py
<listcomp>   s    
r"         r
   k
num_groupsreturnc                 C   s2  | j g | jdd || jd | R  } tj| dddj}tj| dddj}|| d }tj| dddj}|tj}|tj}| || j || j d }|tj	}|d@ }|ddddf |dd	ddf d
>  }tj
|tj	|tj	gdd}	tj
|	jdd|jddgddtj}
|
S )aT  
    Auxiliary int4 row quantization function used for benchmarking and tests.
    Matches the behaviour of torch.ops.llama_cpp.dequantize_int4_cache -
    quantization parameters (scale and offset) of each row along the last
    dimension of the tensor are assumed to be packed into two float16 values
    at the beginning of the row.
    NT)dimkeepdim   r   .r   r
      )r)   )	start_dim)reshapeshapetorchr   valuesmintofloat16expanduint8concatviewflattenint16)r%   r&   max_valsmin_valsscale_kshift_kin_bytesin_int4in_int4_packedscale_shiftk_quantr    r    r!   quantize_kv_int4&   s.   ,
 (

rE   c                   @   sV   e Zd ZU dZeed< dedededededed	ed
dfddZdd Z	dddZ
dS )AttentionDecodingBaseNOPr   r   r   r   r   r   bwr'   c	                 C   s  t j}	t d d| d| d| d| d| d| d|| | | d	 || | |  || | |  d	  | _d
| _||||||f| _||ksLJ || dksTJ t j||||| |gd|	|d| _t j|||d|gd|	|dddd|| d| _	t j|||d|gd|	|dddd|| d| _
||kr| jd d d d d d df | _| j	d d d d d d df | _	| j
d d d d d d df | _
|dkr| jd d d d df | _| j	d d d d df | _	| j
d d d d df | _
t||||| |||	tdd| jd| _t| jtjjjrG| jjddg| jjd	d  R  | _| j	jddg| j	jd	d  R  | _	| j
jddg| j
jd	d  R  | _
t| jdritjj| j| j	| j
| jd}
| j|
}|rkt|d S d S N
   B= Mq= Mkv= Hq= Hkv= K= TotalBytes=r   attn_decodingr   r	   devicedtyperequires_gradr
   r(   FBMHK

batch_size	num_headsnum_heads_groupsq_lenkv_lenrU   rT   rV   fmtopnot_supported_reasonsquerykeyvaluer   r1   r5   manual_seed	sub_labellabelshapesrandnqr6   r%   vr   rT   rG   r   
isinstancer   r   r   r9   r0   hasattrInputsr`   r   selfr   r   r   r   r   r   rH   r   rU   inpr`   r    r    r!   __init__N   sx   
&6   $$$zAttentionDecodingBase.__init__c                 C   s    t jj| j| j| j| jd}|S )Nra   )r   r   ro   rk   r%   rl   r   )rq   rr   r    r    r!   
get_inputs   s   z AttentionDecodingBase.get_inputsc              
   C   s^   zt j| j| j| j| j| jd W d S  ttfy. } zt	d|  W Y d }~d S d }~ww )N)r_   r   zRuntime error: )
r   "memory_efficient_attention_forwardrk   r%   rl   rG   r   RuntimeError
ValueErrorprint)rq   er    r    r!   fw   s   zAttentionDecodingBase.fwr'   N)__name__
__module____qualname__rG   r   __annotations__intboolrs   rt   rz   r    r    r    r!   rF   K   s*   
 

GrF   c                   @      e Zd ZejjjZdS )AttentionDecodingCUTLASSN)r|   r}   r~   r   r   cutlassFwOprG   r    r    r    r!   r          r   c                   @   @   e Zd ZejjjZdedededededede	dd	fd
dZ
d	S )AttentionDecodingCKr   r   r   r   r   r   rH   r'   Nc	                 C   s  t j}	t d d| d| d| d| d| d| d|| | | d	 || | |  || | |  d	  | _d
| _||||||f| _||ksLJ || dksTJ t j||||| |gd|	|d| _t j|||d|gd|	|dddd|| d| _	t j|||d|gd|	|dddd|| d| _
||kr| jd d d d d d df | _| j	d d d d d d df | _	| j
d d d d d d df | _
t||||| |||	tdd| jd| _t| jtjjjr| jjddg| jjd	d  R  | _| j	jddg| j	jd	d  R  | _	| j
jddg| j
jd	d  R  | _
t| jdr>tjj| j| j	| j
| jd}
| j|
}|r@t|d S d S rI   re   rp   r    r    r!   rs      sp   
&6   $$$zAttentionDecodingCK.__init__)r|   r}   r~   r   r   ckr   rG   r   r   rs   r    r    r    r!   r      &    

r   c                   @   r   )AttentionDecodingCKDecoderN)r|   r}   r~   r   r   
ck_decoderr   rG   r    r    r    r!   r      r   r   c                   @   r   )AttentionDecodingSplitKVN)r|   r}   r~   r   r   triton_splitkr   rG   r    r    r    r!   r      r   r   c                   @   r   )AttentionDecodingCKSplitKVN)r|   r}   r~   r   r   	ck_splitkr   rG   r    r    r    r!   r      r   r   c                   @   r   )AttentionDecodingSplitInt4KVr   r   r   r   r   r   rH   r'   Nc	                 C   s  t j}	t d d| d| d| d| d| d| d|| | | d	 || | |  || | |  d	  | _d
| _||||||f| _||ksLJ || dksTJ t j||||| |gd|	|d| _t j|||d|gd|	|d| _t j|||d|gd|	|d| _	d}
t
| j|
d t jddd|| d| _t
| j	|
d t jddd|| d| _	||kr| jd d d d d d df | _| jd d d d d d df | _| j	d d d d d d df | _	|dkr| jd d d d df | _| jd d d d df | _| j	d d d d df | _	t||||| |||	tdd| jd| _t| jtjjjrf| jjddg| jjd	d  R  | _| jjddg| jjd	d  R  | _| j	jddg| j	jd	d  R  | _	t| jdrtjj| j| j| j	| jd}| j|}|rt|d S d S )NrJ   rK   rL   rM   rN   rO   rP   rQ   r   rR   r   r	   rS   r
   )r&   r(   FrW   rX   r`   ra   )r1   r5   rf   rg   rh   ri   rj   rk   r%   rl   rE   
contiguousr9   int32r6   r   rT   rG   r   rm   r   r   r   r0   rn   ro   r`   r   )rq   r   r   r   r   r   r   rH   r   rU   r&   rr   r`   r    r    r!   rs      s   
&6   
$$$z%AttentionDecodingSplitInt4KV.__init__)r|   r}   r~   r   r   r   r   rG   r   r   rs   r    r    r    r!   r      r   r   c                   @      e Zd ZdddZdS )AttentionDecodingPyTorchRepeatr'   Nc                 C   s   | j \}}}}}}d|d  }| j||d|gdddd}| j||d|gdddd}	| j||d|gdddd}
||	dd | d}||
 S )Nr
   r   r(   r   r      r-   )ri   rk   r/   permuter%   rl   	transposesoftmax)rq   r   r   r   r   r   r   scalerk   r%   rl   attnr    r    r!   rz   T  s      z!AttentionDecodingPyTorchRepeat.fwr{   r|   r}   r~   rz   r    r    r    r!   r   S  r   r   pytorch
BENCHMARKSr   )r   
ck-decoder	ck_splitK)r   	   triton_splitKtriton_int4KVc                   @   r   )AttentionDecodingFlashAttentionr'   Nc           
      C   s   | j | j| j}}}|jdkrG|j\}}}}}|j\}}	}}}||||| |g}|d d d d d d df }|d d d d d d df }t|||S )N   r   )rk   r%   rl   ndimr0   r/   
flash_attnflash_attn_func)
rq   rk   r%   rl   r   r   H1H2r   r   r    r    r!   rz   w  s   
z"AttentionDecodingFlashAttention.fwr{   r   r    r    r    r!   r   v  r   r   zflash-attention@c                 C   sD   g | ]}t d dD ]}ttd dd|  d d| d|dddq	qS )r
   r   r   r   r   Nr   )ranger   r   r   r    r    r!   r"     s    
c                 C   s"   g | ]}t |d ddd dddqS )r
   i  r#   r   Nr   )r   )r   r   r    r    r!   r"     s    )r   r,   r#   r       @   r   c                  C   s   t t } | d | S )Nr   )listr   keysremove)decoder_namesr    r    r!   get_benchmark_names  s   
r   z
name, casec                 C   s   g | ]}t D ]}||fqqS r    )
TEST_CASES)r   namecaser    r    r!   r"     s    c              
   C   s8  t |d |d |d |d |d |d d|d }| d	kr(|d d
kr(td | }| }t|  }| dv s:J |j|d\}}| \}}	}
|j	\}}}}}d}|	j	d dkrk|	
ddkrk|

ddkrkd}|r|||d||dd }n|||| d| }|dd }tjj||ddd d S )Nr   r   r   r   r   r   Fr   r   i @  z&ck-decoder does not support Mkv >= 16K)r   r   r   r   r   r   r
   r   Tr(   r   g{Gz?)atolrtol)r   pytestskiprz   rt   r   rG   applyget_qkv_in_bmghkr0   strider/   r   r   r1   testingassert_close)r   r   baselinebaseline_outinputsdecoderdecoder_outputctxrk   r%   rl   r   MGHKqmqa_swap_seqlen_headr    r    r!   test_flash_attention_decoder  s8   

*r   c                   C   s   t ddtttd dS )z#
    run performance benchmark
    rR   T)rz   cases	functionsmin_run_timeN)r   CASESr   r   r    r    r    r!   main  s   
r   __main__)r
   r{   )3systypingr   r   r   r   r1   xformers.opsopsr   xformers.attn_bias_utilsr   xformers.benchmarks.utilsr   r   r   rT   r   r   Tensorr   rE   rF   r   r   r   r   r   r   r   r   strr   versionr	   hipupdateversion_infomajorminorr   r   __version__ImportErrorr   r   markparametrizer   r   r|   r    r    r    r!   <module>   s|   


%YGX	



%

