o
     i                     @   s  d dl Z d dlmZ d dlmZ d dlmZ d dlZd dlm	Z	 d dl
m  mZ d dlmZ dZedZg d	ZejZd
d ZeeeejejdgddgdZejdejdddiZdefddZdefddZejjrse d dS eeeed eeeed dS )    N)nullcontext)partial)Any)	benchmark)benchmark_main_helperg      ?cuda))i$     
  )iX  r   r	   )x  r   r	   )r
   r   i   )   i   i   )r   i    i V  c                  k   s6    |   }|  }tj| D ]
}tt||V  qd S )N)keysvalues	itertoolsproductdictzip)kwargsr   valsinstance r   X/home/ubuntu/.local/lib/python3.10/site-packages/xformers/benchmarks/benchmark_swiglu.pyproduct_dict)   s   r   autocast_halfTF)shapedtypebiaszb16   zf16   zf16.acr   c              	   c   s(   |dkrt jt jd}}}n||d}}}t j| d d t|d}tj| d | d |dt|}t||}|r@dnd	}	| d
| d  d| d  d| d  d|	 	}
|	 }|r`dnd}t
j| d||ttjtdddtj|
dV  t
j| d||ttjtjdddd|
dV  d S )Nr   TF   devicer      in_featureshidden_featuresr   r   nobi B=r   , I=, H= z3with torch.autocast("cuda", dtype=torch.half):
     zfn(x, *args))op)xargsfn	swiglu_fwstmtglobalslabeldescription	sub_labeleager)torchfloatrandnr   xswSwiGLUto	DTYPE2STRget_ordered_paramsr   Timerr   swigluOPNAMESwiGLUEagerOp)r   r   r   	inp_dtypemodel_dtypeautocastr*   module	dtype_strbstrr3   paramsPREFIXr   r   r   benchmark_swiglu?   sD   ,rK   c              	   c   s   |dkrt jt j}}tt jjddt jd}n||}}t}t j| d d t|d}|	  t
j| d | d |dt|}t||}|rKd	nd
}	| d| d  d| d  d| d  d|	 	}
| }|  t
j|g|R dti}W d    n1 sw   Y  t |}tjd||ddtj|
dV  ~|  t
j|g|R dt
ji}W d    n1 sw   Y  tjd||ddd|
dV  d S )Nr   r   T)enabledr   r   r   r   r    r   r#   r$   r   r%   r&   r'   r)   z%out.backward(grad, retain_graph=True))outgrad	swiglu_bwr.   r4   )r5   r6   r   amprE   float16r   r7   r   requires_grad_r8   r9   r:   r;   r<   r=   r?   r@   
zeros_liker   r>   rA   rB   )r   r   r   rC   rD   cmr*   rF   rG   rH   r3   rI   rM   rN   r   r   r   benchmark_swiglu_bwk   sT   
,

rU   z)This benchmark could not be done on ROCM!)min_run_time)!r   
contextlibr   	functoolsr   typingr   r5   torch.utilsr   xformers.ops.swiglu_opops	swiglu_opr8   xformers.benchmarks.utilsr   rV   r   SHAPESSwiGLUPackedFusedOpr@   r   listbfloat16halfCASESr;   boolrK   rU   versionhipprintr   r   r   r   <module>   s<   
	,4