o
    c۷i                  "   @   sR  d dl mZmZmZ d dlmZ d dlZd dlm  m	Z
 d dlmZ d dlmZmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ dd e
j dd ee
j!dddZ"dd dd dd dd dd dZ#ed Z$ed Z%ed Z&ee'dZ(dd Z)de*fddZ+edd  ee(d  D d!gd"e+id#			$	$					%	%	dd&ed'ed(ed)ee d*ee d+e,eB d,e,eB d-ee d.ee d/ee d0ee d1e-d!e-d2ee d3dfd4d5Z.ed6d  ee(d  D d7d!gd"e+id#						%	dd&ed'ed8ee d9ed)ee d*ee d7e$d-ee d/ee d!e-d2ee d3dfd:d;Z/ed<d  ee(d  D d7d!gd"e+id#				=	dd&ed'ed>ed?ed9ed7e$d-ee d/ee d!e-d2ee d3dfd@dAZ0			$						%	=dd&ed'ed(ee d*ee d+e,eB dBeej1 d-ee d.ee d/ee d0ee d!e-dCe-d3efdDdEZej2j3dFdGddH		$						%	=dd&ed'ed(ed*ee d+e,dIee d-ee d.ee d/ee d0ee d!e-dCe-d3dfdJdKZ4			$				dd&ed'ed(ee d*ee d+e,eB d-ee d.ee d/ee dBeej1 d3efdLdMZ5		$	$						%	=dd&ed'ed)ed(ee d+e,eB d,e,eB dBeej1 d-ee d.ee d/ee d0ee d!e-dCe-d3efdNdOZ6ej2j3dPdGddH	$	$							%	%	=dd&ed'ed)ed(ed+e,d,e,dIee dQee d-ee d.ee d/ee d0ee d1e-d!e-dCe-d3df dRdSZ7			$	$				dd&ed'ed)ed*ee d(ee d+e,eB d,e,eB d-ee d.ee d/ee dBeej1 d3efdTdUZ8	$	$					%	=dd&ed'ed(ed+e,eB d,e,eB d-ee d.ee d/ee d0ee d!e-dCe-d3dfdVdWZ9ej2j3dXdGddH	$	$							%	=dd&ed'ed(ed+e,d,e,dIee dQee d-ee d.ee d/ee d0ee d!e-dCe-d3dfdYdZZ:										=	%	=dd&ed'ed)ee d*ee d7e&d8ee d9ee dBeej1 d[eej1 d-ee d/ee d\e-d!e-dCe-d3eee ef fd]d^ZeZ;ej2j3d_d`ddadb						%	=dd&ed'ed8ee d9ed)ee d*ee d7e$d-ee d/ee d!e-dCe-d3dfdcddZ<								=dd&ed'ed)ee d*ee d7e&d-ee d/ee dBeej1 d[eej1 d\e-d3eee ef fdedfZ=e=Z>							%			=	=dd&ed'ed>ed7e&d?ee d9ee dBeej1 d[eej1 dgee dhe-d-ee d/ee d!e-dCe-fdidjZeZ?ej2j3dkdlddmdb				=	=dd&ed'ed>ed?ed9ed7e$d-ee d/ee d!e-dCe-d3dfdndoZ@					dd&ed'ed>ed7e&d-ee d/ee dBeej1 d[eej1 d3eeef fdpdqZAeAZBej2j3drdGddsdb		%	$	$dd&ed'ed(ed)ee d!e-d+e,d,e,d3dfdtduZC				%	$	$dd&ed'ed)ee d(ee dBeej1 d!e-d+e,eB d,e,eB d3eee ef fdvdwZedxd  ee(d  dyD d7d!gd"e+id#			z			%	dd&ed'ed8ee d9ed)ee d*ee d7e%d-ee d/ee d!e-d2ee d3dfd{d|ZDde*fd}d~ZEedd  ee(d  dD g dd"eEid#		z	%			=	dd&ed'ed>ed?ed9edgee d7e%dhe-d-ee d/ee d!e-d2ee d3ee fddZFej2j3dd`dddb			z			%	=dd&ed'ed8ee d9ed)ee d*ee d7e%d-ee d/ee d!e-dCe-d3dfddZGej2j3ddldddb		z	%			=	=dd&ed'ed>ed?ed9edgee d7e%dhe-d-ee d/ee d!e-dCe-d3efddZHej2Id		z	%			=	=dd&ed'ed>ed?ed9edgee d7eJdhe-d-ee d/ee d!e-dCe-d3efddZKdd ZLe:jI	$	$							%	=dd&ed'ed(ed+e,d,e,dIee dQee d-ee d.ee d/ee d0ee d!e-dCe-d3dfddZMdddZNdd ZOdd ZPeNe4e.eOd eNe7e.ePd eNe<e/ eNe@e0 eNeGeD eCjI		%	$	$dd&ed'ed(ed)ee d!e-d+e,d,e,d3dfddZQdS )    )OptionalTupleLiteral)partialN)Tensor)
GemmConfigget_all_configs)autotuneAutotuneConfig)get_device_capacity)gemm)gemm_act)	gemm_dact)gemm_symmetricc                 C   s   | S N xr   r   J/home/ubuntu/vllm_env/lib/python3.10/site-packages/quack/gemm_interface.py<lambda>   s    r   c                 C   s   t |  S r   )Frelusquarer   r   r   r   r          tanhapproximate)Nr   relu_sqgelu_tanh_approxc                 C      t | | S r   )r   silugateupr   r   r   r      r   c                 C   s   | t d|   |d  S )NgZd;?   torchsigmoidr!   r   r   r   r       s    c                 C   r   r   )r   r   r!   r   r   r   r   !   r   c                 C   s   t j| dd| S )Nr   r   )r   gelur!   r   r   r   r   "   s    c                 C   r   r   r%   r!   r   r   r   r   #   r   )swiglu
swiglu_oaireglugegluglu)	Nr   r   r   r)   r*   r+   r,   r-   cudac                 C   s4   t | d dkrtddddddS td	d	ddd
dS )Nr   
            r$   T)tile_mtile_n	cluster_m	cluster_npingpong   F)r   r   )devicer   r   r   default_config8   s   r:   
named_argsc                 K   sx   ||B }| dd d u}| dd d u}|s|rdd | D } |r:dd | D } t|d jd dkr:d	d | D } | S )
NA_idxcu_seqlens_mc                 S      g | ]
}|j d  js|qS configkwargsswap_ab.0confr   r   r   
<listcomp>D       z.prune_invalid_gemm_configs.<locals>.<listcomp>c                 S   s    g | ]}|j d  jdkr|qS )r@   r$   )rB   r6   rD   r   r   r   rG   F        Ar   	   c                 S   s    g | ]}|j d  jdkr|qS )r@      )rB   r4   rD   r   r   r   rG   I   rI   )getr   r9   )configsr;   rB   gather_Avarlen_mr   r   r   prune_invalid_gemm_configs?   s   rQ   c                 C      g | ]}t |d qS r?   r
   rE   cr   r   r   rG   N       rG   dynamic_schedulerearly_config_prune)rN   keyprune_configs_by      ?FrJ   BoutCbiasalphabetar=   cu_seqlens_kr<   batch_idx_permuteadd_to_outputr@   returnc                 C   s"  |d u r	t | j}|d u}|d u}|p|}|	d u}|r*|s!J d|jdks*J d|r3|jr3J d| jdkr?|s?| d} |j}|jdkrN|sN|d}|d ur^|jdkr^|s^|d}|jdkrj|sj|d}|d urx|jdkrx|d}|s|jd n|jd d }|r|	d ur|	jd n| jd }||jd f}n|| jd |jd f}|j|ksJ d|j d	| |rtj	dtj
| jd
nd }t|js| n||js|n| |js|n|j|d ur|js|n|jnd ||j|j|j|j|jf
d|j|js|nd |jr|nd |||||	|
|d d S )Nz-gather_A requires either varlen_m or varlen_kr$   zgather_A requires cluster_n=14Variable-length sequences not supported with swap_abr2   r   zout shape mismatch: z vs dtyper9   T)
persistentmax_swizzle_sizerowvec_biascolvec_biasra   rb   r=   rc   r<   rd   re   )r:   r9   r6   rC   ndim	unsqueezemTshaper&   zerosint32gemm_sm90_sm100r3   r4   r5   r7   rl   )rJ   r]   r^   r_   r`   ra   rb   r=   rc   r<   rd   re   rX   r@   rP   varlen_kvarlenrO   
batch_sizetotal_m	out_shapetile_count_semaphorer   r   r   
gemm_tunedM   sl   





 r|   c                 C   rR   rS   rT   rU   r   r   r   rG      rW   
activation
preact_outpostact_outc                 C     |
d u r	t | j}
|d u}|r|
jrJ d| jdkr"|s"| d} |j}|jdkr/|d}|d ur?|jdkr?|s?|d}|d urP|jdkrP|sP|d}n|}|jdkr_|s_|d}n|}|d uro|jdkro|d}|	r{tjdtj| jdnd }t	|
js| n||
js|n| |d ur|
js|n|jnd |d ur|
js|n|jnd |
js|n|j|||
j
|
j|
j|
j|
jd|
j|
js|nd |
jr|nd ||d d S Nrg   r2   r   r$   ri   T)rk   rl   rm   rn   r=   r<   r:   r9   rC   ro   rp   rq   r&   rs   rt   gemm_act_sm90_sm100r3   r4   r5   r6   r7   rl   rJ   r]   r~   r   r_   r`   r}   r=   r<   rX   r@   rP   DPostActr{   r   r   r   gemm_act_tuned   sT   






r   c                 C   rR   rS   rT   rU   r   r   r   rG      rW   TPreActdx_outc
                 C   s>  |	d u r	t | j}	|d u}
|
r|	jrJ d| jdkr"|
s"| d} |j}|jdkr/|d}|jdkr;|
s;|d}|jdkrH|
sH|d}n|}|jdkrW|
sW|d}n|}|retjdtj| jdnd }t	|	jsm| n||	jss|n| |	jsy|n|j|	js|n|j|	js|n|j|||	j
|	j|	j|	j|	jd|	j||d d S )Nrg   r2   r   r$   ri   T)rk   rl   r=   r<   )r:   r9   rC   ro   rp   rq   r&   rs   rt   gemm_dact_sm90_sm100r3   r4   r5   r6   r7   rl   )rJ   r]   r   r   r   r}   r=   r<   rX   r@   rP   r   r   r{   r   r   r   gemm_dact_tuned   sL   





r   	out_dtypetunedc                 C   s  |du rh|du r| j n|}|du}|du}|r-|dur |jd n| jd }||jd f}n2|rB|jd d }|| jd |jd f}n| jdkrQ| jd |jd fn| jd | jd |jd f}tj||| jd}t|tso|nd}t|trx|nd}t| |||||||||	|
|d	 |S )
z4GEMM with optional output tensor and tuning control.Nr   r$   r2   rh   ri   r\   )	r`   ra   alpha_tensorr=   rc   r<   rd   rX   r   )	rj   rr   ro   r&   emptyr9   
isinstancefloatgemm_out)rJ   r]   r^   r`   ra   r   r=   rc   r<   rd   rX   r   rP   rv   ry   rz   Lr   r   r   r   r     s<   8r   zquack::gemm_outr^   )mutates_argsdevice_typesr   c                 C   sH   |rt ntt jdd}|dur|n|}|| ||d||||||	|
d dS )z&GEMM with pre-allocated output tensor.Nr?   )r_   r`   ra   r=   rc   r<   rd   rX   r|   r   fn)rJ   r]   r^   r`   ra   r   r=   rc   r<   rd   rX   r   r   r   r   r   r   P  s   
r   c	              	   C   s  |du r| j n|}|du rG|du rG| jdkrtjntj}	|	| |||d}t|tr-|dkr1||9 }|durE| jdkr<|n|d}||7 }|S |dur|du rk|durX|jd n| jd }
tj	|
|jd f|| j
d	}t|jd d D ]c}|dur| ||| ||d    n| || ||d   }tj||| ||| ||d   d
 t|tr|dkr||| ||d    |9  < |dur||| ||d    || 7  < qt|S |jd d }|du rtj	|| jd |jd f|| j
d	}t|D ]A}|dur| dd||| ||d   f n| dd|| ||d  f }tj|||| ||d  ddf || d
 qt|trH|dkrL||9 }|durU||7 }|S )z<Reference implementation for GEMM with pre-allocated output.N   )r   r^   r\   r2   r$   r   r   ri   r   )rj   ro   r&   bmmmmr   r   rp   rr   r   r9   range)rJ   r]   r^   r`   ra   r=   rc   r<   r   r   ry   iA_slicer   r   r   r   gemm_refy  sP   !( $$
$2
r   c                 C   sl  |du rh|du r| j n|}|du}|du}|r-|	dur |	jd n| jd }||jd f}n2|rB|jd d }|| jd |jd f}n| jdkrQ| jd |jd fn| jd | jd |jd f}tj||| jd}||u oxt|tox|dkox|du }t|ts|nd}t|tr|nd}t|ts|nd}t|tr|nd}t| ||s|nd||||||||	|
|||d	 |S )
z.GEMM with addition and optional output tensor.Nr   r   r$   r2   rh   ri   r\   )r=   rc   r<   rd   re   rX   r   )	rj   rr   ro   r&   r   r9   r   r   gemm_add_out)rJ   r]   r_   r^   ra   rb   r   r=   rc   r<   rd   rX   r   rP   rv   ry   rz   r   re   r   beta_tensorr   r   r   gemm_add  sH   8"
r   zquack::gemm_add_outr   c                 C   sZ   |rt ntt jdd}|dur|n|}|dur|n|}|| |||||||	|
|||d dS )z3GEMM with addition and pre-allocated output tensor.Nr?   ra   rb   r=   rc   r<   rd   re   rX   r   )rJ   r]   r_   r^   ra   rb   r   r   r=   rc   r<   rd   re   rX   r   r   r   r   r   r     s"   
r   c              	   C   s  |du r[|du r[t |trt |trtj|| ||
|||d}n&|dur&|jn|
dur,|
n| j}
|| |  ||  |
}|durE|| |durY| jdkrP|n|d}||7 }|S |dur|du r|	durl|	j	d n| j	d }|
durw|
n| j}
tj
||j	d f|
| jd}t|j	d d D ]U}|	dur| |	|| ||d    n| || ||d   }||| ||d   }||| ||d   }|t|||  ||  }|dur||| 7 }|| q|S |j	d d }|
dur|
n| j}
|du rtj
|| j	d |j	d f|
| jd}t|D ]O}|	dur-| dd|	|| ||d   f n| dd|| ||d  f }||| ||d  ddf }|t|| |||   }|| | q|durm||7 }|S )zIReference implementation for GEMM with addition and pre-allocated output.N)r   ra   rb   r^   r2   r$   r   r   ri   )r   r   r&   addmmrj   tocopy_ro   rp   rr   r   r9   r   r   )rJ   r]   r_   r`   r^   ra   rb   r=   rc   r<   r   resultry   r   r   C_slice	out_slicer   B_slicer   r   r   gemm_add_ref  sX   
%
$
$ 
r   c                 C   sn   t |ts|nd}t |tr|nd}t |ts|nd}t |tr"|nd}t| |||||||||||	|
d dS )a  In-place GEMM with addition: out = alpha * A @ B + beta * out.
    Args:
        A: (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k - input tensor
        B: (K, N) or (L, K, N) or (total_K, N) if varlen_k - input tensor
        out: (M, N) or (L, M, N) or (total_M, N) if varlen_m or (L, M, N) if varlen_k - tensor to accumulate into (modified in-place)
        alpha: Scalar multiplier for A @ B
        beta: Scalar multiplier for out
        cu_seqlens_m: Optional cumulative sequence lengths for variable M
        cu_seqlens_k: Optional cumulative sequence lengths for variable K
        dynamic_scheduler: Whether to use dynamic scheduler
        tuned: Whether to use autotuned configuration
    Nr\   )r<   rd   rX   r   )r   r   gemm_add_inplace_op)rJ   r]   r^   ra   rb   r=   rc   r<   rd   rX   r   r   r   r   r   r   gemm_add_inplace_  s&   
r   zquack::gemm_add_inplacec                 C   s|   |rt ntt jd d}|d ur|n|}|d ur|n|}t|to'|dko'|d u }|| |||s0|nd |||||	|
||d d S )Nr?   r\   r   )r|   r   r   r   r   )rJ   r]   r^   ra   rb   r   r   r=   rc   r<   rd   rX   r   r   re   r   r   r   r     s$   

r   postact_dtypestore_preactc                 C   sJ  |t v }|du r| jn|}|du r| jn|}|	du}|r2|
dur%|
jd n| jd }||jd f}n| jdkrB| jd |jd f}n| jd | jd |jd f}|rag |dd |d d R n|}|du rr|rrtj||| jd}|du rtj||| jd}|rt| |||||||	|
|| ||fS t| |||||||	|
|| ||fS )zGGEMM with activation (or gated activation) and optional output tensors.Nr   r   r2   rh   ri   )	gated_to_pytorch_fn_maprj   rr   ro   r&   r   r9   gemm_gated_outgemm_act_out)rJ   r]   r_   r`   r}   r~   r   r   r   r=   r<   r   rX   r   is_gatedrP   ry   rz   postact_shaper   r   r   r     sX   
&r   zquack::gemm_act_out)r~   r   z(Tensor A, Tensor B, Tensor(a2!)? preact_out, Tensor(a3!) postact_out, Tensor? C=None, Tensor? bias=None, str? activation=None, Tensor? cu_seqlens_m=None, Tensor? A_idx=None, bool dynamic_scheduler=False, bool tuned=True) -> ())r   r   schemac                 C   4   |
rt ntt jdd}|| |||||||||	
 dS )z6GEMM with activation and pre-allocated output tensors.Nr?   )r   r   r   rJ   r]   r~   r   r_   r`   r}   r=   r<   rX   r   r   r   r   r   r        r   c
                 C   s   |t v }
|d u r| jn|}|d u r| jn|}|d u r$t| ||||d}n
t| |||||d}|
rM|dd d df }|ddd df }t | |||}n	t| ||}|	r_|||fS d |fS )N)r`   r=   r<   .r2   r$   )r   rj   r   r   r   act_to_pytorch_fn_map)rJ   r]   r_   r`   r}   r=   r<   r   r   r   r   preactr"   r#   postactr   r   r   gemm_act_ref  s   r   colvec_scalecolvec_reducec                 C   s  |t v }|du r| jn|}|du r|jn|}|
du}|r=|dur%|jd n| jd }|r5||jd d fn||jd f}n7| jdkr[|rP| jd |jd d fn	| jd |jd f}n|rd|jd d n|jd }| jd | jd |f}|rg |dd |d d R n|}|du rtj||| jd}|du rtj||| jd}|rt| |||||||	|
|||}|	s||fS |||fS t| ||||||
|||
 ||fS )zPGEMM with activation (or gated activation) gradient and optional output tensors.Nr   r   r2   rh   ri   )	r   rj   rr   ro   r&   r   r9   gemm_dgated_outgemm_dact_out)rJ   r]   r   r}   r   r   r   r   r   r   r=   r<   rX   r   	is_dgatedrP   ry   rz   nr   colvec_reduce_finalr   r   r   r   6  s^   &
2&
r   zquack::gemm_dact_out)r   r   z(Tensor A, Tensor B, Tensor PreAct, Tensor(a3!) dx_out, Tensor(a4!) postact_out, str? activation=None, Tensor? cu_seqlens_m=None, Tensor? A_idx=None, bool dynamic_scheduler=True, bool tuned=True) -> ()c
              
   C   s2   |	rt ntt jdd}
|
| ||||||||	 dS )z?GEMM with activation gradient and pre-allocated output tensors.Nr?   )r   r   r   )rJ   r]   r   r   r   r}   r=   r<   rX   r   r   r   r   r   r     s   r   c                 C   sd  |t v }|du r| jn|}|du r|jn|}t| |||d|}	|r||ddddf }
|ddddf }|
j|j}}|
d |d t | |
|}tjj||
|g|	dd\}}|
| || tj	||gd	d

|j}||||fS t| |}|du r|	}n|j}|d t| |}tjj|||	ddd }|| ||||fS )zQReference implementation for GEMM with activation (or gated activation) gradient.N)r=   r<   .r2   r$   TF)create_graphr   dimr   )r   rj   r   r   requires_gradrequires_grad_r&   autogradgradstackreshaperr   r   )rJ   r]   r   r}   r=   r<   r   r   r   doutr"   r#   gate_requires_gradup_requires_gradr   dgatedupdxPreAct_requires_gradpostact_for_gradr   r   r   gemm_dact_ref  s2   





r   zquack::gemm_symmetric_outzz(Tensor A, Tensor B, Tensor(a2!) out, Tensor? C=None, bool dynamic_scheduler=False, float alpha=1.0, float beta=1.0) -> ()c                 C   s   | j dkr
| d} |j}|j dkr|d}|dur%|j dkr%|d}|j dkr0|d}n|}|r>tjdtj| jdnd}t| ||durI|nd|durP|nd|dddddd	d
||d dS )z&GEMM with guaranteed symmetric output.r2   r   Nr$   ri   r0   r8   FT   	tile_Mtile_N	cluster_M	cluster_Nr7   rk   rl   ra   rb   )ro   rp   rq   r&   rs   rt   r9   gemm_symmetric_sm90_sm100)rJ   r]   r^   r_   rX   ra   rb   r{   r   r   r   gemm_symmetric_out  s8   






r   c              	   C   s   |du r| j n|}| jdkr| jd |jd f}n| jd | jd |jd f}|du r4tj||| jd}t|tr;|nd}	t|trD|nd}
t| |||||	|
d |S )	zGEMM with symmetric output.Nr2   r   r   rh   ri   r\   )rX   ra   rb   )	rj   ro   rr   r&   r   r9   r   r   r   )rJ   r]   r_   r^   r   rX   ra   rb   rz   	alpha_valbeta_valr   r   r   r     s   
r   c                 C   rR   rS   rT   rU   r   r   r   rG         
gatedr)   c                 C   r   r   r   r   r   r   r   gemm_gated_tuned  sT   






r   c                 K   sD   ||B }| dd d us| ddrdd | D } t| |fi |S )Nr   r   Fc                 S   r>   r?   rA   rD   r   r   r   rG   X  rH   z5prune_invalid_gemm_dgated_configs.<locals>.<listcomp>)rM   rQ   )rN   r;   rB   r   r   r   !prune_invalid_gemm_dgated_configsT  s   r   c                 C   rR   rS   rT   rU   r   r   r   rG   ]  r   dgated)r}   r   rX   c                 C   s,  |d u r	t | j}|d u}|r|jrJ d| jdko| }| jdkr*|s*| d} |j}|jdkr7|d}|jdkrC|sC|d}|jdkrP|sP|d}n|}|jdkr_|s_|d}n|}|d urq|jdkrq|sq|d}|d ur||jr|J d|r|j}|jd | d | }|r|	d ur|	jd n| jd }||f}n| jd | jd |f}tj	|tj
| jd}nd }|
rtjdtj| jdnd }t|js| n||js|n| |js|n|j|js|n|j|js|n|j|||j|j|j|j|jd|j||||	d	 |r|jd
d}|r|d}|S d }|S )Nrg   r2   r   r$   z'colvec_scale not supported with swap_abrh   ri   T)rk   rl   r   r   r=   r<   r   r   )r:   r9   rC   ro   rp   rq   r4   rr   r&   r   float32rs   rt   r   r3   r5   r6   r7   rl   sumsqueeze)rJ   r]   r   r   r   r   r}   r   r=   r<   rX   r@   rP   	og_ndim_2r   r   r4   shape_nry   colvec_shapecolvec_reduce_partialr{   r   r   r   r   gemm_dgated_tuned\  sz   







r   zquack::gemm_gated_outz(Tensor A, Tensor B, Tensor(a2!)? preact_out, Tensor(a3!) postact_out, Tensor? C=None, Tensor? bias=None, str activation='swiglu', Tensor? cu_seqlens_m=None, Tensor? A_idx=None, bool dynamic_scheduler=False, bool tuned=True) -> ()c                 C   r   )z<GEMM with gated activation and pre-allocated output tensors.Nr?   )r   r   r   r   r   r   r   r     r   r   zquack::gemm_dgated_outa  (Tensor A, Tensor B, Tensor PreAct, Tensor(a!) dx_out, Tensor(b!) postact_out, Tensor? colvec_scale=None, str activation='swiglu', bool colvec_reduce=False, Tensor? cu_seqlens_m=None, Tensor? A_idx=None, bool dynamic_scheduler=True, bool tuned=True) -> Tensorc                 C   sR   |rt ntt jdd}|| |||||||||	|
}|du r'tjd| jtjdS |S )zEGEMM with gated activation gradient and pre-allocated output tensors.Nr?   r   )r9   rj   )r   r   r   r&   r   r9   r   )rJ   r]   r   r   r   r   r}   r   r=   r<   rX   r   r   r   r   r   r   r     s"   r   c                 C   s   t t| |||||||||	|
d |stjdtj| jdS |d ur2|	d ur)|	jd n| jd }|f}n| jdkr>| jd f}n
| jd | jd f}tj|tj| jdS )N)r   r}   r   r=   r<   rX   r   ri   r2   rh   )_precompile_default_configr   r&   r   r   r9   rr   ro   )rJ   r]   r   r   r   r   r}   r   r=   r<   rX   r   ry   rz   r   r   r   gemm_dgated_out_fake  s.   
r   c                 O   st   ddl m} |r|d n|d}|r |du s t|jd tjr"dS z| j|ddi| W dS  ty9   Y dS w )ah  Compile the default config in COMPILE_ONLY mode.

    Checks COMPILE_ONLY flag and SymInt guard, then calls the unwrapped function with
    config=None (which selects the default config), triggering compilation (exports .so)
    without benchmarking or kernel launch.
    Tests use tuned=False which also selects the default config, so this is sufficient.
    r   COMPILE_ONLYrJ   Nr@   )	quack.cache_utilsr   rM   r   rr   r&   SymIntr   	Exception)autotuned_fnargsrB   r   rJ   r   r   r   r   !  s   r   c                 C   sh   |d ur|n|}|d ur|n|}t |to|dko|d u }tt| |||s&|nd |||||	|
||d d S )Nr\   r   )r   r   r   r|   )rJ   r]   r^   ra   rb   r   r   r=   rc   r<   rd   rX   r   r   r   re   r   r   r   gemm_add_inplace_fake4  s$   

r   c                    s.   ddl }|| j| j fdd}dS )ao  Register a fake that precompiles the default config in COMPILE_ONLY mode.

    For custom_ops that forward args to their autotuned fn. Binds all args by name,
    strips 'tuned', applies optional rewrite(kw), then calls _precompile_default_config.
    PyTorch normalizes all custom_op args to positional, so we use inspect.signature
    to recover keyword names.
    r   Nc                     sR   j | i |}|  t|j}|dd  d ur| t fi | d S )Nr   )bindapply_defaultsdict	argumentspopr   )r   rB   boundkwr   rewritesigr   r   _faked  s   
z(_register_precompile_fake.<locals>._fake)inspect	signature_init_fnregister_fake)	custom_opr   r   r  r  r   r   r   _register_precompile_fakeX  s   r  c                 C   s,   |  dd}|dur|| d< | dd dS )z9Merge alpha_tensor into alpha for gemm_tuned; add C=None.r   Nra   r_   )r   
setdefault)rB   atr   r   r   _rewrite_merge_alphao  s   r  c                 C   s@   |  dd}|dur|| d< |  dd}|dur|| d< dS dS )z>Merge alpha_tensor/beta_tensor into alpha/beta for gemm_tuned.r   Nra   r   rb   )r   )rB   r
  btr   r   r   _rewrite_merge_alpha_betaw  s   r  )r   c                 C   s   ddl m} |rt| jd tjrd S zQt| jdkr| dn| |jdkr+|j	dn|j	|jdkr7|dn||d urH|jdkrF|dn|nd |rUtj
dtj| jdnd dddddd	d
||d W d S  tyn   Y d S w )Nr   r   r2   r$   ri   r0   r8   FTr   r   )r   r   r   rr   r&   r   r   ro   rp   rq   rs   rt   r9   r   )rJ   r]   r^   r_   rX   ra   rb   r   r   r   r   gemm_symmetric_out_fake  s.   
"r  )NNr\   r\   NNNNFFN)NNNNNFN)NNNTN)
NNr\   NNNNNFT)	Nr\   NNNNNFT)NNr\   NNNN)
Nr\   r\   NNNNNFT)r\   r\   NNNNNNFFT)NNr\   r\   NNNN)r\   r\   NNNNFT)
r\   r\   NNNNNNFT)NNNNNNNNNTFT)NNNNNFT)NNNNNNNT)NNNNNNFNNTT)NNNTT)NNNNN)NFr\   r\   )NNNFr\   r\   )NNr)   NNFN)Nr)   FNNTN)NNr)   NNFT)Nr)   FNNTTr   )Rtypingr   r   r   	functoolsr   r&   torch.nn.functionalnn
functionalr   r   quack.gemm_configr   r   quack.autotunerr	   r
   quack.cute_dsl_utilsr   
quack.gemmr   ru   quack.gemm_actr   r   quack.gemm_dactr   r   quack.gemm_symmetricr   r   r   r(   r   r   ActActivationGatedActivation
Activationr9   default_device_capacityr:   r   rQ   r   boolr|   r   r   rj   libraryr  r   r   r   r   r   r   r   
gemm_gatedr   r   gemm_gated_refgemm_dgatedr   r   gemm_dgated_refr   r   r   r   r   r   r  strr   r   r   r  r  r  r  r   r   r   r   <module>   s  	
	
K	
>	
<	

3	
%	

?	

;	
+	

H	

/	
(	

A	
	

	

G	

	
(
-	
	
>
T	
	

"	
)	

#



