o
     i                     @   sN   d dl Z d dlmZ d dlmZ dd Zdd Zdd	 Zed
kr%e  dS dS )    N)fmha)do_bench_cudagraphc           	      C   s   t | } t |}|d dd}t j|dd\}}t || }|jdd}||  jdd}|| }|t | }|ddddd}||fS )zV
    attn_split: list of [B, M, (G,) H, Kq]
    lse_split: list of [B, (G,) H, M]
    ).N      r   )dim      )	torchstackmoveaxismaxexpsumlogsqueezepermute)	
attn_split	lse_splitlse_max_sumexp_normalizeddenominator	numeratorattn_outlse_out r   b/home/ubuntu/.local/lib/python3.10/site-packages/xformers/benchmarks/benchmark_merge_attentions.py_merge_attentions_varargs_ref   s   

r   c                    sZ  t j }t j|  fddt| D } fddt| D }	t||	\}
t jdd tfdd}||	 D ]}|	  |
d qKt||	\}jdd tfdd}td	| d
 dddddd|d dd|d dd|| dd W d   dS 1 sw   Y  dS )z
    Benchmark backward pass for merge_attentions. Assumes "varargs" path,
    i.e. LSE and attention of chunks are provided as two lists of tensors, and not as two stacked tensors.
    c                    s(   g | ]}t j gd ddqS cudaT)dtypedevicerequires_gradr	   randn.0r   )BD_HGMN_H_Lr    r   r   
<listcomp>+   s    z7benchmark_merge_attentions_backward.<locals>.<listcomp>c                    s&   g | ]}t j gd ddqS r   r#   r%   )r'   r)   r*   r+   r    r   r   r,   1   s    Tretain_graphc                          j ddS NTr-   backwardr   )attn_out_refout_gradr   r   <lambda><       z5benchmark_merge_attentions_backward.<locals>.<lambda>c                      r/   r0   r1   r   )r   r4   r   r   r5   F   r6   zsplit_k=z, B=z, M=z, G=z, N_H_L=z, D_H=z, dtype=z. Baseline: g     @@z.2fzus, Triton: zus, z.1fzx fasterN)r	   r   Streamstreamranger   
randn_liker2   r   detach_requires_grad_r   merge_attentionsprint)split_kr'   r*   r)   r+   r(   r    bench_streamr   r   lse_out_reft_ms_refxr   t_msr   )	r'   r(   r)   r*   r+   r   r3   r    r4   r   #benchmark_merge_attentions_backward"   s@   

,"rE   c                  C   sH   d} d}d}t j}dD ]}dD ]}dD ]}t|||| ||| qqqd S )Nr         )r   r   rF      )r       rG   )r   rI   i   )r	   float32rE   )r)   r+   r(   r    r?   r'   r*   r   r   r   mainP   s   rK   __main__)	r	   xformers.opsr   xformers.utilsr   r   rE   rK   __name__r   r   r   r   <module>   s   .
