o
    i3                     @   s  d dl Z d dlZd dlZd dlZd dlmZ d dlm  mZ d dl	m
Z
mZ d dlmZmZmZ d dlmZmZ d dlmZ d dlmZmZ zd dlmZ W n eyZ   dZY nw zd dlmZ W n eyn   dZY nw zd dlZW n ey   dZY nw dd	 Zd?ddZ d@ddZ!dAddZ"dd Z#dd Z$e%d  dZ&dZ'ej(Z)g dZ*d
dgZ+g dZ,dZ-dZ.ddgedurdgng  Z/i Z0i Z1i Z2i Z3i Z4i Z5e+D ]xZ6e,D ]rZ7e*D ]l\Z8Z9ej:;  e6e7e8e9fZ<e-e7 Z=d d! e>d"D \Z?Z@ZAejBe?e@eAgd#d$ZCeCDejEZCe$e!eCe.e6e&d
d%ZFeFe0e<df< e!eCe.e6d&ZGedure?Hd'd#I Dej(ZJe@Hd'd#I Dej(ZKeAHd'd#I Ld d'd"d#Dej(ZMd'eNe7 ZOe$eeJeKeMe6eOd(d
d)d*	ZFe$eeJeKeMe6eOe&d
d)d*	ZFeFe0e<d)f< eeJeKeMLd d'd"d#e6eOP Hd'd#ZQejRjSeQeGd+d+d, e?De)e@De)eADe)Z?Z@ZAe?jTd- d. ZUejVd/gejWdd0ZXejVd/gejWdd0ZYejVd/gejWdd0ZZe$ee?e@eAeUe6d1eXeYeZe&d
d2ZFeFe0e<df< edureCDe)Z[e\d' e$e e[e9e9e6d&e&d
d3ZFeFe0e<df< e]d4e6 d5e7 d6e8 d7e9 d8	 e/D ]0Z^e#e"e8e9e7e=e6dd9e0e<e^f e3e<e^f< e]e^ d:e3e<e^f d;d<e0e<e^f d=  d> qqqqdS )B    N)	rearrangerepeat)benchmark_allbenchmark_forwardbenchmark_backward)benchmark_fwd_bwdbenchmark_combined)flash_attn_qkvpacked_func)flash_attn_func_flash_attn_forward)	attentionc                 C   s   | t jkr	tjjS | t jkrtjjS | t jkrtjjS | t j	kr$tjj
S | t jkr-tjjS | t jkr6tjjS | t jkr?tjjS td)NzUnsupported tensor data type.)torchfloat16cudnn	data_typeHALFbfloat16BFLOAT16float32FLOATint32INT32int64INT64float8_e4m3fnFP8_E4M3float8_e5m2FP8_E5M2
ValueError)
torch_type r    Z/home/ubuntu/vllm_env/lib/python3.10/site-packages/hopper/benchmark_flash_attention_fp8.pyconvert_to_cudnn_type"   s   






r"   Fc                    sZ  | j \}}}}}td usJ dtj||||| j| jdt||||g|| | ||| dg}tj|||dtj| jd}	tjddddtj| jd}
tjddddtj| jd tj	t
| jtjjtjjdtj| ||||g|| | d ||| d dgdd}jdt|j t| t
| jd	}tj| ||||g|| | d ||| d dg|| d}jd
t|j t| t
| jd	}tj| ||||g|| | d ||| d dg|| d d}jdt|j t| t
| jd	}fdd}tjddddtjdd}| }| }| }| }| }| }j|||||||||ddt| |dd\}}}}|d|j |  |d|
j |
  |d j        tjjtjjg     ||||||||||||||||||||||
| itj dtj d fdd}|S )NzCUDNN is not availabledtypedevice   )io_data_typeintermediate_data_typecompute_data_type   r   )storage_offsetQ)namedimstrider   K   Vc                      s    j g dg dtjjdS )N)r&   r&   r&   r&   )r.   r/   r   )tensorr   r   r   r    )graphr    r!   get_default_scale_tensorj   s
   z2cudnn_spda_setup.<locals>.get_default_scale_tensorcudaT      ?sdpa)qkv	descale_q	descale_k	descale_v	descale_sscale_sscale_ois_inference
attn_scaleuse_causal_maskr-   F)r%   r$   c                     s      fS )N)execute)argskwargs
amax_o_gpur4   o_gpuvariant_pack	workspacer    r!   run   s   zcudnn_spda_setup.<locals>.run)!shaper   r   zerosr$   r%   
as_stridedemptyr   pygraphr"   r   r   r3   listr/   onessdpa_fp8mathsqrt
set_outputset_dim
set_stridevalidatebuild_operation_graphcreate_execution_plans	heur_modeAFALLBACKcheck_supportbuild_plansget_workspace_sizeuint8)qkvseqlen_qseqlen_kcausalb_nheadsheaddimo_gpu_transposed	stats_gpu
amax_s_gpunew_qr9   new_kr:   new_vr;   r5   default_scale_gpur<   r=   r>   r?   r@   rA   oamax_samax_orM   r    rH   r!   cudnn_spda_setup4   s   







rw           Tc                 C   s   | j \}}}}}| jdd\}}	}
t|d}t|	d}	dt| }tj|| ||| j| jd}ttj	|||	d|dd	|d
}|rYt
tj||fd|jdd}||j|jd }tj|dd}t||}td||
}|j| jdS )z
    Arguments:
        qkv: (batch_size, seqlen, 3, nheads, head_dim)
        dropout_p: float
    Output:
        output: (batch_size, seqlen, nheads, head_dim)
    r1   r.   zb t h d -> (b h) t dzb s h d -> (b h) d sr7   r#   r   )betaalphaz(b h) t s -> b h t s)hg     )r%   r&   )r$   zbhts,bshd->bthd)rN   unbindr   rV   rW   r   rQ   r$   r%   baddbmmtriufulltosoftmaxFdropouteinsum)re   	dropout_prh   
batch_sizeseqlenrj   rk   dr9   r:   r;   softmax_scalescorescausal_maskr   attention_dropoutputr    r    r!   attention_pytorch   s    

r   fwdc                 C   sT   |dv sJ d|  |d  | | |rdnd }|dkr|S |dkr&d| S d| S )	N)r   bwdfwd_bwd   r1   r&   r   r   g      @g      @r    )batchr   rl   rk   rh   modefr    r    r!   flops   s   $$r   c                 C   s   t |s| | d S dS )Nl    J)rx   )rV   isnan)floptimer    r    r!   
efficiency   s   r   c                 O   s*   t d t| g|R i |}|d jS )Nr&   )r   sleepr   mean)funcrF   rG   time_fr    r    r!   time_fwd   s   

r      r6   ))    i   )   i   )      )r   i   )r1   i    )r&   i @  )@         r   PytorchFlash3cuDNNc                 C   s&   g | ]}t jtttttt jd dqS )F)r%   r$   requires_grad)r   randnr   r   rk   rl   r%   r   ).0rj   r    r    r!   
<listcomp>   s   & r   r*   r1   ry   )rh   repeatsverbose)rh   r&      Triton)r   r   descg      ?)atolrtolr}   g      r7   r#   )r}   r}   )rh   window_sizer<   r=   r>   r   r   )r   r   z### causal=z
, headdim=z, batch_size=z	, seqlen=z ###)r   z fwd: z.2fz TFLOPs/s, g     @@z ms, )F)rx   T)r   )_picklerV   r   r   torch.nnnntorch.nn.functional
functionalr   einopsr   r   flash_attn.utils.benchmarkr   r   r   r   r   
flash_attnr	   flash_attn_interfacer
   r   triton_fused_attentionr   attention_tritonImportErrorxformers.opsopsxopsr   r"   rw   r   r   r   r   manual_seedr   r%   r   r$   bs_seqlen_valscausal_valsheaddim_valsr.   r   methodsr   time_btime_f_bspeed_fspeed_b	speed_f_brh   rl   r   r   r6   empty_cacheconfigrk   ranger9   r:   r;   stackre   r   r   r   res_baseline	transpose
contiguousq_transposedk_transposedpermutev_transposedrW   scalehalfrestestingassert_closerN   r   r3   r   r<   r=   r>   qkv_fp8r   printmethodr    r    r    r!   <module>   s   

y





$

"


"
*