o
    IiT                     @   s	  d dl mZ d dlmZ d dlZd dlmZ d dlZd dlm	Z	 d dl
m	  mZ d dlZzd dlZW n ey>   dZY nw eddefgZd dlmZmZ d dlmZmZmZmZmZmZ d d	lmZmZ d d
lmZ  d dlmZ! d dl"m#Z# zd dl$m%Z& W n ey   dZ&Y nw dZ&ddddddZ'dUddZ(dd Z)dVddZ*dVddZ+e,d  dZ-d Z.dZ/ej0Z1e1ej2krej0ne1Z3d!Z4dZ5dZ6dZ7d Z8dZ9dZ:d"Z;d#Z<d$Z=d%Z>d&gZ?i Z@i ZAd'D ]Z>e=e> ZBeBZCe?D ]\Z;Z<d(ZDdZEd ZFdZGe<ZHdZIejJe;eHeBe>e4e3dd)ZKejJe;e<eCe>e4e3dd)ZLejJe;e<eCe>e4e3dd)ZMd*d+ eKeLeMfD \ZKZLZMeMN Odd,P Odd,Q ZRe9sDeMneRZSejJe;eHeBe>e4e3dd)ZTejJe;eHeBe>e4e3dd)ZUejJe;eHeBd(e4ejVd-ZWejJe;e<e<e4e3d-ZXejJe;e=d" e<e4e3d-Odd.ZYe6rd/d+ eKeLeMfD \ZZZ[Z\ej]e;d( e4ej^d-eH Z_ej]e;d( e4ej^d-e< Z`e7dure<e7 d ksJ d0d+ eLeMfD \ZaZbeej]e;e< e7 e4ej^d-d1e<e7 d2ZcndZcd3D ]Z/edd4e>d5e/d6e<d7 e(e;eBeHe<e>e/eEd8ZeedurIe>d%krIe1ej2krIe*eKOd(d"eLOd(d"eMOd(d"e/eEd  d9Zfe+eKOd(d"eLOd(d"eMOd(d"eUOd(d"eTOd(d"eWOd(d"e/eEd  d9Zge1ej2kre6sbe'eeKeLeMe.e/eEe8e-e5d:d;Zhne'eeZe[e\e_e`eHe<e.e/eEe8e-e5d:d;Zhehjie@e/e>e;e<fd<f< ejd( e6seeeKeLeMe.e/eEe8e:e-dd:d=\ZkZlneeeZe[e\e_e`eHe<e.e/eEe8e:e-dd:d=\ZkZleljieAe/e>e;e<fd<f< e>d%kre1ej2kre&durd>d+ eKeLeMfD \ZmZnZoejd( e'e&emeneoe/d(epe> e-e5d?d	Zqeqjie@e/e>e;e<fd?f< edur;e>d%kr;e1ej2kr;ejd( e'efe-e5d@dZrerjie@e/e>e;e<fdAf< ejd( e'ege-e5d@dZsesjieAe/e>e;e<fdAf< ejd( e6sce'e eKe7du rMeLneae7du rUeSnebe/eEeFe8eDeGe-e5dBdCZtne'e!eZe[e\e_e`ddeHe<e/eEe8eDeGe-e5dBdDZtetjie@e/e>e;e<fdEf< e1ej2krejd( e6see eKeLeMe/eEeFe8e:e-ddBdF\ZkZunee!eZe[e\e_e`ddeHe<e/eEe8e:e-ddBd=\ZkZueujieAe/e>e;e<fdEf< e1ej2kreddGehjidH dIdJeeehji dK dLdM eddNeljidH dIdJdOee elji dK dLdM e>d%krOe1ej2krOe&dureddPeqjidH dIdJeeeqji dK dLdM edurOeddQerjidH dIdJeeerji dK dLdM eddResjidH dIdJdOee esji dK dLdM eddSetjidH dIdJeeetji dK dLdM e1ej2kreddTeujidH dIdJdOee euji dK dLdM qqqdS )W    )
namedtuple)partialN)
NamedTupletimingmean)	rearrangerepeat)benchmark_forwardbenchmark_backwardbenchmark_combinedbenchmark_allbenchmark_fwd_bwdpytorch_profiler)flash_attn_funcflash_attn_varlen_func)r   )r   )do_bench)	attention   T )repeatsverbosedescc                   s"   t t fddd|dd S )Nc                      s    i S N r   argsfunckwargsr   I/home/ubuntu/.local/lib/python3.10/site-packages/hopper/benchmark_attn.py<lambda>8   s    ztime_fwd.<locals>.<lambda>   )warmuprepgMbP?)Timingr   )r   r   r   r   r   r   r   r   r   time_fwd&   s   "r$   Fr&   c                 C   s   |rt d|| | d }n>|dkr|}n7tj|dd}t|| | |d  td}	t|| | |d  t|d }
|
|	 d    }| | d | | | d S )Nr      r%   cuda)device   )	maxtorcharangemaximumtensorminimumfloatr   item)batchnheadsseqlen_qseqlen_kheaddimcausalwindow_size
avg_seqlenrow_idxcol_left	col_rightr   r   r   flops;   s   "&r>   c                 C   sb   | t jkr	tjjS | t jkrtjjS | t jkrtjjS | t j	kr$tjj
S | t jkr-tjjS td)NzUnsupported tensor data type.)r,   float16cudnn	data_typeHALFbfloat16BFLOAT16float32FLOATint32INT32int64INT64
ValueError)
torch_typer   r   r   convert_to_cudnn_typeI   s   




rM   r&   c              
      s  | j \}}}}|j \}	}
}}	|j ||
||fksJ td us!J d| ||}}}t|tj|||dtj| jd}tjt| j	tj
jtj
jd  | }  | } | } jd| ||ddt| |pq|dk|dkry|sy|nd d	\}}|d
j   |d
tj
j        tjjtjjg       | ||||||||itj  dtjd fdd}|S )NCUDNN is not availabler*   )dtyper)   io_data_typeintermediate_data_typecompute_data_typesdpaF      ?r   )nameqkvis_inference
attn_scaleuse_causal_masksliding_window_lengthTr(   r)   rO   c                     s      S r   executer   r   grapho_gpuvariant_pack	workspacer   r   run   s   zcudnn_spda_setup.<locals>.run) shaper@   r,   
empty_likeemptyrE   r)   pygraphrM   rO   rA   rF   tensor_likedetachrT   mathsqrt
set_outputset_dim
set_stridestrideset_data_typevalidatebuild_operation_graphcreate_execution_plans	heur_modeAFALLBACKcheck_supportbuild_plansget_workspace_sizeuint8)rW   rX   rY   r8   window_size_leftbr4   r5   r7   _nheads_kr6   q_gpuk_gpuv_gpu	stats_gpuostatsrg   r   rb   r   cudnn_spda_setupX   sR   


r   c                    sT  | j \}}	}
}|j \}}}}|j ||||fksJ |j ||	|
|fks$J |j ||	|
|fks/J |j ||	|
dfks:J td usBJ d| ||||f\}}}}}t|t| t|tjt| jtjjtjjd	|
 } 	|
 }	|
 }	|
 }	|
 }	|
 }jd| |||||dt| |p|dk|dkr|s|nd d
\}}}|dj   |d j    |dj       tjjtjjg     | ||||||||||||| |i	tj d	tjd
 fdd}|S )Nr*   rN   rP   sdpa_backwardrU   r   )
rV   rW   rX   rY   r   dOr   r[   r\   r]   Tr(   r^   c                     s      fS r   r_   ra   dk_gpudq_gpudv_gpurc   re   rf   r   r   rg      s   
z!cudnn_spda_bwd_setup.<locals>.run)rh   r@   r,   ri   rk   rM   rO   rA   rF   rl   rm   r   rn   ro   rp   rq   rr   rs   ru   rv   rw   rx   ry   rz   r{   r|   rj   r}   r~   )rW   rX   rY   r   glser8   r   r   r4   r5   r7   r   r   r6   r   r   r   rd   g_gpur   dqdkdvrg   r   r   r   cudnn_spda_bwd_setup   sn   



r   
   g        r(   r'       i      )r'   r   )   r*   )r)   rO   requires_gradc                 C   s   g | ]}|  t qS r   )rm   torO   requires_grad_.0xr   r   r   
<listcomp>!      r   r^   c                 C   s   g | ]}t | d  qS )zb s h d -> (b s) h d)r   rm   r   r   r   r   r   r   /  r   c                 C   s   g | ]	}t |d tdqS )zb (n p) h d -> (b n) p h d)p)r   	page_sizer   r   r   r   r   :  s    z(b s) -> b s)s)FTz
### headdim = z, causal = z, seqlen = z ###)r8   r9   )r8   r   Fav2)r8   r9   softcapr   r   r   Flash2)r8   r9   r   deterministicr   r   r   c                 C   s$   g | ]}|  d d  qS )r*   r'   )rm   	transpose
contiguousr   r   r   r   r   r   \  s   $ TritonCuDNNcuDNNFav3)	r8   r9   sink_token_lengthr   
num_splitspack_gqar   r   r   )r8   r9   r   r   r   r   r   r   Flash3)r8   r9   r   r   r   r   r   r   z
Fav2 fwd: g     @@z.3fzms, g-q=z.1fz TFLOPSz
Fav2 bwd: g      @zTriton fwd: zCuDNN fwd: zCuDNN bwd: z
Fav3 fwd: z
Fav3 bwd: )Fr%   )Fr&   )vcollectionsr   	functoolsr   rn   typingr   r,   torch.nnnntorch.nn.functional
functionalFtimer@   ImportErrorr1   r#   einopsr   r   flash_attn.utils.benchmarkr	   r
   r   r   r   r   flash_attn.flash_attn_interfacer   r   flash_attn_interfaceflash_attn_func_v3flash_attn_varlen_func_v3triton.testingr   triton_fused_attentionr   triton_attentionr$   r>   rM   r   r   manual_seedr   	dropout_pr8   rC   rO   float8_e4m3fn	dtype_genr)   r   varlenr   r   
V_colmajorr   
batch_sizeseqlendimr7   bs_seqlen_valstime_ftime_br4   	nheads_kvr   r9   r   r   r5   	leftpad_krandnrW   rX   rY   rm   r   r   r   
v_colmajorv_fa3r   r   rE   r   ar   q_unpadk_unpadv_unpadr-   rG   cu_seqlens_qcu_seqlens_kk_pagedv_paged
page_tableprintnFLOPS
cudnn_spdacudnn_spda_bwdm0r   sleepr   m0bqtktvtro   m3m2m2bm1m1br   r   r   r   <module>   s*    


7
E
  


.L &



$



@,

*.
*
*.*.