o
     iv                     @   sV  d dl Z d dlZd dlmZ d dlmZmZ d dlmZ dZ	dgdgd d	gfdgd	gdgd fd	gdgd dgfdgd
gd d	gfdgd	gd
gd fd	gd
gd dgfdgg ddgfdgdgg dfdgg ddgfdgdgd dgfdgdgdgd fdgdgd dgfdZ
dd Zeee
 ejgdZdd Zdd Zeeee	d dS )    N)	benchmark)	DTYPE2STRbenchmark_main_helper)tiled_matmul   i @  i      i    i
     )i      r	   i 0  i   )llama1_65b_mha_fwdllama1_65b_mha_bwd_inputllama1_65b_mha_bwd_weightllama1_65b_ffn_fwdllama1_65b_ffn_bwd_inputllama1_65b_ffn_bwd_weightllama2_150b_mha_fwdllama2_150b_mha_bwd_inputllama2_150b_mha_bwd_weightllama2_150b_ffn_fwdllama2_150b_ffn_bwd_inputllama2_150b_ffn_bwd_weightc                  k   s6    |   }|  }tj| D ]
}tt||V  qd S N)keysvalues	itertoolsproductdictzip)kwargsr   valsinstance r    ^/home/ubuntu/.local/lib/python3.10/site-packages/xformers/benchmarks/benchmark_tiled_matmul.pyproduct_dict"   s   r"   )
shape_namedtypec              
      sl   g }t t D ]+|g  t td D ]|d t fddt t d D  qq|S )Nr   c                    s(   g | ]}t   | |  qS r    )torchmatmul.0kabmnr    r!   
<listcomp>;   s   ( z#matmul_per_tile.<locals>.<listcomp>)rangelenappendsum)r,   r-   cr    r+   r!   matmul_per_tile5   s   
&r6   c                 #   sh   t |  \} t|tt }}}tj||fd|d}tj||fd|d} fdd|j|ddD }fdd|j ddD }	t||}
|
 d|  d	d
dd |D  dd
dd D  dd
dd  D  	}t|| t	||	 t
||	 tjd||tjddd|dV  tjd||	t	ddd|dV  tjd||	t
ddd|dV  d S )Ncuda)devicer$   c                    $   g | ]}d d |j  ddD qS )c                 S      g | ]}|  qS r    cloner)   yr    r    r!   r0   G       5benchmark_tiled_matmul.<locals>.<listcomp>.<listcomp>   dimsplitr)   x)ksr    r!   r0   G      $ z*benchmark_tiled_matmul.<locals>.<listcomp>r   rB   c                    r9   )c                 S   r:   r    r;   r=   r    r    r!   r0   H   r?   r@   rA   rB   rD   rF   )nsr    r!   r0   H   rI    z M=+c                 s       | ]}| V  qd S r   r    )r)   r.   r    r    r!   	<genexpr>M       z)benchmark_tiled_matmul.<locals>.<genexpr>z N=c                 s   rM   r   r    )r)   r/   r    r    r!   rN   N   rO   z K=c                 s   rM   r   r    r(   r    r    r!   rN   O   rO   zfn(a, b))r,   r-   fnr   pytorch_fused)stmtglobalslabeldescription	sub_labelpytorch_tiledxformers_tiled)SHAPESr4   r&   randnrE   r   getjoinmmr6   r   r   Timer)r#   r$   msr.   r/   r*   r,   r-   a_tilesb_tiles	dtype_strrV   r    )rH   rJ   r!   benchmark_tiled_matmul@   sb   

rc   )min_run_time)r   r&   torch.utilsr   xformers.benchmarks.utilsr   r   xformers.ops.tiled_matmulr   rd   rY   r"   listr   bfloat16CASESr6   rc   r    r    r    r!   <module>   s<   :