o
    iQ                     @   s	  d dl mZ d dlmZ d dlZd dlZd dlmZ d dlZd dl	m
Z
 d dlm
  mZ d dlZzd dlZW n eyB   dZY nw eddefgZd dlmZmZ d dlmZmZmZmZmZmZ d d	lmZmZ d d
l mZ! d dl mZ" d dl#m$Z$ zd dl%m&Z' W n ey   dZ'Y nw dZ'e(dddkZ)ddddddZ*dXddZ+dd Z,dYddZ-dYd d!Z.e/d  d"Z0d#Z1dZ2ej3Z4e4ej5krej3ne4Z6d$Z7dZ8dZ9dZ:d#Z;dZ<dZ=d%Z>d&Z?d'Z@d(ZAd)gZBi ZCi ZDd*D ]ZAe@eA ZEeEZFeAZGeAd+koeGd,kZHeBD ]\Z>Z?d ZIdZJdZKe?ZLdZMejNe>eLeEeAe7e6dd-ZOejNe>e?eFeAe7e6dd-ZPejNe>e?eFeGe7e6dd-ZQd.d/ eOePeQfD \ZOZPZQeQR Sdd0T Sdd0U ZVe<sYeQneVZWeHriejNe>eLeEeGe7e6d1ndZXejNe>eLeEeGe7e6dd-ZYejNe>eLeEeGe7e6dd-ZZejNe>eLeEd2e7ej[d1Z\e9rd3d/ eOePeQfD \Z]Z^Z_ej`e>d2 e7ejad1eL Zbej`e>d2 e7ejad1e? Zce:dure?e: d ksJ d4d/ ePeQfD \ZdZeeej`e>e? e: e7ejad1d5e?e: d6ZfndZfd7D ]Z2egd8eAd9e2d:e?d; e+e>eEeLe?eHseAneAeG eGe2eJd<ZheduraeAd(krae4ej5kraeAeGkrae-eOSd2d%ePSd2d%eQSd2d%e2eJd  d=Zie.eOSd2d%ePSd2d%eQSd2d%eZSd2d%eYSd2d%e\Sd2d%e2eJd  d=Zje4ej5kreAeGkre9se*eeOePeQe1e2eJe;e0e8d>d?Zkne*ee]e^e_ebeceLe?e1e2eJe;e0e8d>d?ZkekjleCe2eAe>e?fd@f< emd2 e9seeeOePeQe1e2eJe;e=e0dd>dA\ZnZoneee]e^e_ebeceLe?e1e2eJe;e=e0dd>dA\ZnZoeojleDe2eAe>e?fd@f< eAd(kre4ej5kreAeGkre'durdBd/ eOePeQfD \ZpZqZremd2 e*e'epeqere2d2eseA e0e8dCd	ZtetjleCe2eAe>e?fdCf< edurbeAd(krbe4ej5krbeAeGkrbemd2 e*eie0e8dDdZueujleCe2eAe>e?fdEf< emd2 e*eje0e8dDdZvevjleDe2eAe>e?fdEf< emd2 e9se*e!eOe:du rtePnede:du r|eWneeeXe2eJe;eIeKe0e8dFdGZwne*e"e]e^e_ebeceLe?e2eJe;eIeKe0e8dFdHZwewjleCe2eAe>e?fdIf< e4ej5kreAeGkre)semd2 e9see!eOePeQe2eJe;e=e0ddFdA\ZnZxnee"e]e^e_ebeceLe?e2eJe;e=e0ddFdA\ZnZxexjleDe2eAe>e?fdIf< e4ej5kr(eAeGkr(egdJekjldK dLdMehekjl dN dOdP egdQeojldK dLdMdReh eojl dN dOdP eAd(kre4ej5kreAeGkre'durRegdSetjldK dLdMehetjl dN dOdP eduregdTeujldK dLdMeheujl dN dOdP egdUevjldK dLdMdReh evjl dN dOdP egdVewjldK dLdMehewjl dN dOdP e4ej5kreAeGkre)segdWexjldK dLdMdReh exjl dN dOdP qqqdS )Z    )
namedtuple)partialN)
NamedTupletimingmean)	rearrangerepeat)benchmark_forwardbenchmark_backwardbenchmark_combinedbenchmark_allbenchmark_fwd_bwdpytorch_profiler)flash_attn_funcflash_attn_varlen_func)r   )r   )do_bench)	attention FLASH_ATTENTION_DISABLE_BACKWARDFALSETRUE   T )repeatsverbosedescc                   s"   t t fddd|dd S )Nc                      s    i S N r   argsfunckwargsr   K/home/ubuntu/vllm_env/lib/python3.10/site-packages/hopper/benchmark_attn.py<lambda>;   s    ztime_fwd.<locals>.<lambda>   )warmuprepgMbP?)Timingr   )r   r   r   r   r   r    r   r   r!   time_fwd)   s   "r'   Fr)   c                 C   s   |rt d|| | d }n>|dkr|}n7tj|dd}	t|	| | |d  td}
t|	| | |d  t|d }||
 d    }| | d | | ||  S )Nr      r(   cuda)device   )	maxtorcharangemaximumtensorminimumfloatr   item)batchnheadsseqlen_qseqlen_kheaddim	headdim_vcausalwindow_size
avg_seqlenrow_idxcol_left	col_rightr   r   r!   flops>   s   "&rB   c                 C   sb   | t jkr	tjjS | t jkrtjjS | t jkrtjjS | t j	kr$tjj
S | t jkr-tjjS td)NzUnsupported tensor data type.)r/   float16cudnn	data_typeHALFbfloat16BFLOAT16float32FLOATint32INT32int64INT64
ValueError)
torch_typer   r   r!   convert_to_cudnn_typeL   s   




rQ   r)   c              
      s  | j \}}}}|j \}	}
}}	|j ||
||fksJ td us!J d| ||}}}t|tj|||dtj| jd}tjt| j	tj
jtj
jd  | }  | } | } jd| ||ddt| |pq|dk|dkry|sy|nd d	\}}|d
j   |d
tj
j        tjjtjjg       | ||||||||itj  dtjd fdd}|S )NCUDNN is not availabler-   )dtyper,   io_data_typeintermediate_data_typecompute_data_typesdpaF      ?r   )nameqkvis_inference
attn_scaleuse_causal_masksliding_window_lengthTr+   r,   rS   c                     s      S r   executer   r    grapho_gpuvariant_pack	workspacer   r!   run   s   zcudnn_spda_setup.<locals>.run) shaperD   r/   
empty_likeemptyrI   r,   pygraphrQ   rS   rE   rJ   tensor_likedetachrX   mathsqrt
set_outputset_dim
set_stridestrideset_data_typevalidatebuild_operation_graphcreate_execution_plans	heur_modeAFALLBACKcheck_supportbuild_plansget_workspace_sizeuint8)r[   r\   r]   r<   window_size_leftbr7   r8   r:   _nheads_kr9   q_gpuk_gpuv_gpu	stats_gpuostatsrk   r   rf   r!   cudnn_spda_setup[   sR   


r   c                    sT  | j \}}	}
}|j \}}}}|j ||||fksJ |j ||	|
|fks$J |j ||	|
|fks/J |j ||	|
dfks:J td usBJ d| ||||f\}}}}}t|t| t|tjt| jtjjtjjd	|
 } 	|
 }	|
 }	|
 }	|
 }	|
 }jd| |||||dt| |p|dk|dkr|s|nd d
\}}}|dj   |d j    |dj       tjjtjjg     | ||||||||||||| |i	tj d	tjd
 fdd}|S )Nr-   rR   rT   sdpa_backwardrY   r   )
rZ   r[   r\   r]   r   dOr   r_   r`   ra   Tr+   rb   c                     s      fS r   rc   re   dk_gpudq_gpudv_gpurg   ri   rj   r   r!   rk      s   
z!cudnn_spda_bwd_setup.<locals>.run)rl   rD   r/   rm   ro   rQ   rS   rE   rJ   rp   rq   r   rr   rs   rt   ru   rv   rw   ry   rz   r{   r|   r}   r~   r   r   rn   r   r   )r[   r\   r]   r   glser<   r   r   r7   r8   r:   r   r   r9   r   r   r   rh   g_gpur   dqdkdvrk   r   r   r!   cudnn_spda_bwd_setup   sn   



r   
   g        r+   r*       i      )r*   r   )   @   i   )r,   rS   requires_gradc                 C   s   g | ]}|  t qS r   )rq   torS   requires_grad_.0xr   r   r!   
<listcomp>      r   rb   r-   c                 C   s   g | ]}t | d  qS )zb s h d -> (b s) h d)r   rq   r   r   r   r   r!   r   %  r   c                 C   s   g | ]	}t |d tdqS )zb (n p) h d -> (b n) p h d)p)r   	page_sizer   r   r   r!   r   0  s    z(b s) -> b s)s)FTz
### headdim = z, causal = z, seqlen = z ###)r<   r=   )r<   r   Fav2)r<   r=   softcapr   r   r   Flash2)r<   r=   r   deterministicr   r   r   c                 C   s$   g | ]}|  d d  qS )r-   r*   )rq   	transpose
contiguousr   r   r   r   r!   r   R  s   $ TritonCuDNNcuDNNFav3)	qvr<   r=   r   
num_splitspack_gqar   r   r   )r<   r=   r   r   r   r   r   r   Flash3z
Fav2 fwd: g     @@z.3fzms, g-q=z.1fz TFLOPSz
Fav2 bwd: g      @zTriton fwd: zCuDNN fwd: zCuDNN bwd: z
Fav3 fwd: z
Fav3 bwd: )Fr(   )Fr)   )ycollectionsr   	functoolsr   rr   ostypingr   r/   torch.nnnntorch.nn.functional
functionalFtimerD   ImportErrorr4   r&   einopsr   r   flash_attn.utils.benchmarkr	   r
   r   r   r   r   flash_attn.flash_attn_interfacer   r   flash_attn_interfaceflash_attn_func_v3flash_attn_varlen_func_v3triton.testingr   triton_fused_attentionr   triton_attentiongetenvDISABLE_BACKWARDr'   rB   rQ   r   r   manual_seedr   	dropout_pr<   rG   rS   float8_e4m3fn	dtype_genr,   r   varlenr   r   
V_colmajorr   
batch_sizeseqlendimr:   bs_seqlen_valstime_ftime_br7   	nheads_kvr;   has_qvr   r=   r   r8   	leftpad_krandnr[   r\   r]   rq   r   r   r   
v_colmajorv_fa3r   r   r   rI   r   q_unpadk_unpadv_unpadr0   rK   cu_seqlens_qcu_seqlens_kk_pagedv_paged
page_tableprintnFLOPS
cudnn_spdacudnn_spda_bwdm0r   sleepr   m0bqtktvtrs   m3m2m2bm1m1br   r   r   r!   <module>   s.    


7
E
  

&
 .L &

 

$
 


@(

*. 
*
*.*.