o
    پi                  "   @   s
  d dl mZmZmZ d dlmZ d dlZd dlm  m	Z
 d dlmZ d dlmZmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ dd e
j dd ee
j!dddZ"dd dd dd dd dd dZ#ee$dZ%dd Z&de'fddZ(edd ee%d  D d gd!e(id"			#	#					$	$	d{d%ed&ed'ed(ee d)ee d*e)eB d+e)eB d,ee d-ee d.ee d/ee d0e*d e*d1ee d2dfd3d4Z+ed5d ee%d  D d6d gd!e(id"						$	d|d%ed&ed7ee d8ed(ee d)ee d6ed d,ee d.ee d e*d1ee d2dfd9d:Z,ed;d ee%d  D d6d gd!e(id"				<	d}d%ed&ed=ed>ed8ed6ed d,ee d.ee d e*d1ee d2dfd?d@Z-			#						$	<d~d%ed&ed'ee d)ee d*e)eB dAeej. d,ee d-ee d.ee d/ee d e*dBe*d2efdCdDZej/j0dEdFddG		#						$	<dd%ed&ed'ed)ee d*e)dHee d,ee d-ee d.ee d/ee d e*dBe*d2dfdIdJZ1			#				dd%ed&ed'ee d)ee d*e)eB d,ee d-ee d.ee dAeej. d2efdKdLZ2		#	#						$	<dd%ed&ed(ed'ee d*e)eB d+e)eB dAeej. d,ee d-ee d.ee d/ee d e*dBe*d2efdMdNZ3ej/j0dOdFddG	#	#							$	$	<dd%ed&ed(ed'ed*e)d+e)dHee dPee d,ee d-ee d.ee d/ee d0e*d e*dBe*d2df dQdRZ4			#	#				dd%ed&ed(ed)ee d'ee d*e)eB d+e)eB d,ee d-ee d.ee dAeej. d2efdSdTZ5	#	#					$	<dd%ed&ed'ed*e)eB d+e)eB d,ee d-ee d.ee d/ee d e*dBe*d2dfdUdVZ6ej/j0dWdFddG	#	#							$	<dd%ed&ed'ed*e)d+e)dHee dPee d,ee d-ee d.ee d/ee d e*dBe*d2dfdXdYZ7										<	$	<dd%ed&ed(ee d)ee d6ed d7ee d8ee dAeej. dZeej. d,ee d.ee d[e*d e*dBe*d2eee ef fd\d]Zej/j0d^d_dd`da						$	<dd%ed&ed7ee d8ed(ee d)ee d6ed d,ee d.ee d e*dBe*d2dfdbdcZ8								<dd%ed&ed(ee d)ee d6ed d,ee d.ee dAeej. dZeej. d[e*d2eee ef fdddeZ9								<	<dd%ed&ed=ed6ed d>ee d8ee dAeej. dZeej. d,ee d.ee d e*dBe*d2eeef fdfdgZej/j0dhdiddjda				<	<dd%ed&ed=ed>ed8ed6ed d,ee d.ee d e*dBe*d2dfdkdlZ:					dd%ed&ed=ed6ed d,ee d.ee dAeej. dZeej. d2eeef fdmdnZ;			o					<dd%ed&ed(ee d)ee d6edp d,ee d.ee dAeej. dZeej. d[e*d2eee ef fdqdrZ<				dd%ed&ed=ed6edp d,ee d.ee dAeej. dZeej. d2eeef fdsdtZ=ej/j0dudFddvda		$	#	#dd%ed&ed'ed(ee d e*d*e)d+e)d2dfdwdxZ>				$	#	#dd%ed&ed(ee d'ee dAeej. d e*d*e)eB d+e)eB d2eee ef fdydzZdS )    )OptionalTupleLiteral)partialN)Tensor)
GemmConfigget_all_configs)autotuneAutotuneConfig)get_device_capacity)gemm)gemm_act)	gemm_dact)gemm_symmetricc                 C   s   | S N xr   r   H/home/ubuntu/.local/lib/python3.10/site-packages/quack/gemm_interface.py<lambda>   s    r   c                 C   s   t |  S r   )Frelusquarer   r   r   r   r          tanhapproximate)Nr   relu_sqgelu_tanh_approxc                 C      t | | S r   )r   silugateupr   r   r   r      r   c                 C   s   | t d|   |d  S )NgZd;?   torchsigmoidr!   r   r   r   r       s    c                 C   r   r   )r   r   r!   r   r   r   r   !   r   c                 C   s   t j| dd| S )Nr   r   )r   gelur!   r   r   r   r   "   s    c                 C   r   r   r%   r!   r   r   r   r   #   r   )swiglu
swiglu_oaireglugegluglucudac                 C   s4   t | d dkrtddddddS td	d	ddd
dS )Nr   
            r$   T)tile_mtile_n	cluster_m	cluster_npingpong   F)r   r   )devicer   r   r   default_config*   s   r:   
named_argsc                 K   sj   ||B }| dd d u}| dd d u}|s|rdd | D } |r3t|d jd dkr3dd | D } | S )	NA_idxcu_seqlens_mc                 S   s   g | ]
}|j d  js|qS config)kwargsswap_ab.0confr   r   r   
<listcomp>6   s    z.prune_invalid_gemm_configs.<locals>.<listcomp>Ar   	   c                 S   s0   g | ]}|j d  jdkr|j d  jdkr|qS )r?   r$      )r@   r6   r4   rB   r   r   r   rE   :   s
     )getr   r9   )configsr;   r@   gather_Avarlen_mr   r   r   prune_invalid_gemm_configs1   s   rM   c                 C      g | ]}t |d qS r>   r
   rC   cr   r   r   rE   C       rE   dynamic_schedulerearly_config_prune)rJ   keyprune_configs_by      ?FrF   BoutCbiasalphabetar=   cu_seqlens_kr<   batch_idx_permuteadd_to_outputr?   returnc                 C   s"  |d u r	t | j}|d u}|d u}|p|}|	d u}|r*|s!J d|jdks*J d|r3|jr3J d| jdkr?|s?| d} |j}|jdkrN|sN|d}|d ur^|jdkr^|s^|d}|jdkrj|sj|d}|d urx|jdkrx|d}|s|jd n|jd d }|r|	d ur|	jd n| jd }||jd f}n|| jd |jd f}|j|ksJ d|j d	| |rtj	dtj
| jd
nd }t|js| n||js|n| |js|n|j|d ur|js|n|jnd ||j|j|j|j|jf
d|j|js|nd |jr|nd |||||	|
|d d S )Nz-gather_A requires either varlen_m or varlen_kr$   zgather_A requires cluster_n=14Variable-length sequences not supported with swap_abr2   r   zout shape mismatch: z vs dtyper9   T)
persistentmax_swizzle_sizerowvec_biascolvec_biasr]   r^   r=   r_   r<   r`   ra   )r:   r9   r6   rA   ndim	unsqueezemTshaper&   zerosint32gemm_sm90_sm100r3   r4   r5   r7   rh   )rF   rY   rZ   r[   r\   r]   r^   r=   r_   r<   r`   ra   rT   r?   rL   varlen_kvarlenrK   
batch_sizetotal_m	out_shapetile_count_semaphorer   r   r   
gemm_tunedB   sl   





 rx   c                 C   rN   rO   rP   rQ   r   r   r   rE      rS   
activation
preact_outpostact_outc                 C   s  |
d u r	t | j}
|d u}|r|
jrJ d| jdkr"|s"| d} |j}|jdkr/|d}|d ur?|jdkr?|s?|d}|d urP|jdkrP|sP|d}n|}|jdkr_|s_|d}n|}|d uro|jdkro|d}|	r{tjdtj| jdnd }t	|
js| n||
js|n| |d ur|
js|n|jnd |d ur|
js|n|jnd |
js|n|j|||
j
|
j|
j|
j|
jd|
j|
js|nd |
jr|nd ||d d S )Nrc   r2   r   r$   re   T)rg   rh   ri   rj   r=   r<   )r:   r9   rA   rk   rl   rm   r&   ro   rp   gemm_act_sm90_sm100r3   r4   r5   r6   r7   rh   )rF   rY   rz   r{   r[   r\   ry   r=   r<   rT   r?   rL   DPostActrw   r   r   r   gemm_act_tuned   sT   






r   c                 C   rN   rO   rP   rQ   r   r   r   rE      rS   TPreActdx_outc
                 C   s>  |	d u r	t | j}	|d u}
|
r|	jrJ d| jdkr"|
s"| d} |j}|jdkr/|d}|jdkr;|
s;|d}|jdkrH|
sH|d}n|}|jdkrW|
sW|d}n|}|retjdtj| jdnd }t	|	jsm| n||	jss|n| |	jsy|n|j|	js|n|j|	js|n|j|||	j
|	j|	j|	j|	jd|	j||d d S )Nrc   r2   r   r$   re   T)rg   rh   r=   r<   )r:   r9   rA   rk   rl   rm   r&   ro   rp   gemm_dact_sm90_sm100r3   r4   r5   r6   r7   rh   )rF   rY   r   r   r{   ry   r=   r<   rT   r?   rL   r}   r~   rw   r   r   r   gemm_dact_tuned   sL   





r   	out_dtypetunedc                 C   s  |du rh|du r| j n|}|du}|du}|r-|dur |jd n| jd }||jd f}n2|rB|jd d }|| jd |jd f}n| jdkrQ| jd |jd fn| jd | jd |jd f}tj||| jd}t|tso|nd}t|trx|nd}t| |||||||||	|
|d	 |S )
z4GEMM with optional output tensor and tuning control.Nr   r$   r2   rd   re   rX   )	r\   r]   alpha_tensorr=   r_   r<   r`   rT   r   )	rf   rn   rk   r&   emptyr9   
isinstancefloatgemm_out)rF   rY   rZ   r\   r]   r   r=   r_   r<   r`   rT   r   rL   rr   ru   rv   Lr   r   r   r   r     s<   8r   zquack::gemm_outrZ   )mutates_argsdevice_typesr   c                 C   sH   |rt ntt jdd}|dur|n|}|| ||d||||||	|
d dS )z&GEMM with pre-allocated output tensor.Nr>   )r[   r\   r]   r=   r_   r<   r`   rT   rx   r   fn)rF   rY   rZ   r\   r]   r   r=   r_   r<   r`   rT   r   r   r   r   r   r   E  s   
r   c	              	   C   s  |du r| j n|}|du rG|du rG| jdkrtjntj}	|	| |||d}t|tr-|dkr1||9 }|durE| jdkr<|n|d}||7 }|S |dur|du rk|durX|jd n| jd }
tj	|
|jd f|| j
d	}t|jd d D ]c}|dur| ||| ||d    n| || ||d   }tj||| ||| ||d   d
 t|tr|dkr||| ||d    |9  < |dur||| ||d    || 7  < qt|S |jd d }|du rtj	|| jd |jd f|| j
d	}t|D ]A}|dur| dd||| ||d   f n| dd|| ||d  f }tj|||| ||d  ddf || d
 qt|trH|dkrL||9 }|durU||7 }|S )z<Reference implementation for GEMM with pre-allocated output.N   )r   rZ   rX   r2   r$   r   r   re   r   )rf   rk   r&   bmmmmr   r   rl   rn   r   r9   range)rF   rY   rZ   r\   r]   r=   r_   r<   r   r   ru   iA_slicer   r   r   r   gemm_refn  sP   !( $$
$2
r   c                 C   sl  |du rh|du r| j n|}|du}|du}|r-|	dur |	jd n| jd }||jd f}n2|rB|jd d }|| jd |jd f}n| jdkrQ| jd |jd fn| jd | jd |jd f}tj||| jd}||u oxt|tox|dkox|du }t|ts|nd}t|tr|nd}t|ts|nd}t|tr|nd}t| ||s|nd||||||||	|
|||d	 |S )
z.GEMM with addition and optional output tensor.Nr   r   r$   r2   rd   re   rX   )r=   r_   r<   r`   ra   rT   r   )	rf   rn   rk   r&   r   r9   r   r   gemm_add_out)rF   rY   r[   rZ   r]   r^   r   r=   r_   r<   r`   rT   r   rL   rr   ru   rv   r   ra   r   beta_tensorr   r   r   gemm_add  sH   8"
r   zquack::gemm_add_outr   c                 C   sZ   |rt ntt jdd}|dur|n|}|dur|n|}|| |||||||	|
|||d dS )z3GEMM with addition and pre-allocated output tensor.Nr>   r]   r^   r=   r_   r<   r`   ra   rT   r   )rF   rY   r[   rZ   r]   r^   r   r   r=   r_   r<   r`   ra   rT   r   r   r   r   r   r     s"   
r   c              	   C   s  |du r[|du r[t |trt |trtj|| ||
|||d}n&|dur&|jn|
dur,|
n| j}
|| |  ||  |
}|durE|| |durY| jdkrP|n|d}||7 }|S |dur|du r|	durl|	j	d n| j	d }|
durw|
n| j}
tj
||j	d f|
| jd}t|j	d d D ]U}|	dur| |	|| ||d    n| || ||d   }||| ||d   }||| ||d   }|t|||  ||  }|dur||| 7 }|| q|S |j	d d }|
dur|
n| j}
|du rtj
|| j	d |j	d f|
| jd}t|D ]O}|	dur-| dd|	|| ||d   f n| dd|| ||d  f }||| ||d  ddf }|t|| |||   }|| | q|durm||7 }|S )zIReference implementation for GEMM with addition and pre-allocated output.N)r   r]   r^   rZ   r2   r$   r   r   re   )r   r   r&   addmmrf   tocopy_rk   rl   rn   r   r9   r   r   )rF   rY   r[   r\   rZ   r]   r^   r=   r_   r<   r   resultru   r   r   C_slice	out_slicer   B_slicer   r   r   gemm_add_ref  sX   
%
$
$ 
r   c                 C   sn   t |ts|nd}t |tr|nd}t |ts|nd}t |tr"|nd}t| |||||||||||	|
d dS )a  In-place GEMM with addition: out = alpha * A @ B + beta * out.
    Args:
        A: (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k - input tensor
        B: (K, N) or (L, K, N) or (total_K, N) if varlen_k - input tensor
        out: (M, N) or (L, M, N) or (total_M, N) if varlen_m or (L, M, N) if varlen_k - tensor to accumulate into (modified in-place)
        alpha: Scalar multiplier for A @ B
        beta: Scalar multiplier for out
        cu_seqlens_m: Optional cumulative sequence lengths for variable M
        cu_seqlens_k: Optional cumulative sequence lengths for variable K
        dynamic_scheduler: Whether to use dynamic scheduler
        tuned: Whether to use autotuned configuration
    NrX   )r<   r`   rT   r   )r   r   gemm_add_inplace_op)rF   rY   rZ   r]   r^   r=   r_   r<   r`   rT   r   r   r   r   r   r   gemm_add_inplaceT  s&   
r   zquack::gemm_add_inplacec                 C   s|   |rt ntt jd d}|d ur|n|}|d ur|n|}t|to'|dko'|d u }|| |||s0|nd |||||	|
||d d S )Nr>   rX   r   )rx   r   r   r   r   )rF   rY   rZ   r]   r^   r   r   r=   r_   r<   r`   rT   r   r   ra   r   r   r   r     s$   

r   postact_dtypestore_preactc                 C   s   |du r| j n|}|du r| j n|}|	du}|r.|
dur!|
jd n| jd }||jd f}n| jdkr>| jd |jd f}n| jd | jd |jd f}|du r[|r[tj||| jd}|du rhtj||| jd}t| |||||||	|
|| ||fS )z1GEMM with activation and optional output tensors.Nr   r   r2   rd   re   )rf   rn   rk   r&   r   r9   gemm_act_out)rF   rY   r[   r\   ry   rz   r{   r   r   r=   r<   r   rT   r   rL   ru   rv   r   r   r   r     s6   
r   zquack::gemm_act_out)rz   r{   z(Tensor A, Tensor B, Tensor(a2!)? preact_out, Tensor(a3!) postact_out, Tensor? C=None, Tensor? bias=None, str? activation=None, Tensor? cu_seqlens_m=None, Tensor? A_idx=None, bool dynamic_scheduler=False, bool tuned=True) -> ())r   r   schemac                 C   s4   |
rt ntt jdd}|| |||||||||	
 dS )z6GEMM with activation and pre-allocated output tensors.Nr>   )r   r   r   )rF   rY   rz   r{   r[   r\   ry   r=   r<   rT   r   r   r   r   r   r     s   r   c
                 C   s   |d u r| j n|}|d u r| j n|}|d u r t| ||||d}
n
t| |||||d}
t| |
|}|	r<|
||fS d |fS )Nr\   r=   r<   )rf   r   r   act_to_pytorch_fn_mapr   )rF   rY   r[   r\   ry   r=   r<   r   r   r   rZ   postactr   r   r   gemm_act_ref  s   r   c                 C   s   |du r| j n|}|du r|j n|}|du}|r.|	dur!|	jd n| jd }||jd f}n| jdkr>| jd |jd f}n| jd | jd |jd f}|du rYtj||| jd}|du rftj||| jd}t| |||||||	|
|
 ||fS )z:GEMM with activation gradient and optional output tensors.Nr   r   r2   rd   re   )rf   rn   rk   r&   r   r9   gemm_dact_out)rF   rY   r   ry   r   r{   r   r   r=   r<   rT   r   rL   ru   rv   r   r   r   r     s"   
r   zquack::gemm_dact_out)r   r{   z(Tensor A, Tensor B, Tensor PreAct, Tensor(a3!) dx_out, Tensor(a4!) postact_out, str? activation=None, Tensor? cu_seqlens_m=None, Tensor? A_idx=None, bool dynamic_scheduler=True, bool tuned=True) -> ()c
              
   C   s2   |	rt ntt jdd}
|
| ||||||||	 dS )z?GEMM with activation gradient and pre-allocated output tensors.Nr>   )r   r   r   )rF   rY   r   r   r{   ry   r=   r<   rT   r   r   r   r   r   r   2  s   r   c                 C   s   |du r| j n|}|du r|j n|}t| |||d|}t| |}	|du r*|}
n|j}|d t| |}tjj|||ddd }
|| |
||	|fS )z;Reference implementation for GEMM with activation gradient.Nr=   r<   TFcreate_graphr   )	rf   r   r   r   requires_gradrequires_grad_r&   autogradgrad)rF   rY   r   ry   r=   r<   r   r   doutr   dxPreAct_requires_gradpostact_for_gradr   r   r   gemm_dact_refI  s   

r   r)   )r-   r)   r*   r+   r,   c
                 C   s   |du r| j n|}|du r| j n|}|du r t| ||||d}
n
t| |||||d}
|
ddddf }|
ddddf }t| ||}|	rJ|
|nd||fS )aK  Reference implementation for GEMM with gated activation forward.

    Args:
        A: (M, K) - input tensor
        B: (K, N) - weight tensor with gate and up projections
        C: (M, N) - optional bias tensor
        activation: Type of gated activation
        out_dtype: Output dtype for preact
        postact_dtype: Output dtype for postact
        store_preact: Whether to return the pre-activation

    Returns:
        (preact, postact) where:
        - preact: (M, N) pre-activation (if store_preact=True, else None)
        - postact: (M, N // 2) post-activation output
    Nr   .r2   r$   )rf   r   r   gated_to_pytorch_fn_mapr   )rF   rY   r[   r\   ry   r=   r<   r   r   r   preactr"   r#   r   r   r   r   gemm_gated_refd  s   r   c                 C   s   |du r| j n|}|du r|j n|}t| |||d|}|ddddf }	|ddddf }
|	j|
j}}|	d |
d t| |	|
}tjj||	|
g|dd\}}|	| |
| tj	||gd	d

|j}||||fS )a  Reference implementation for GEMM with gated activation gradient.

    Args:
        A: (M, K) - dout input tensor
        B: (K, N) - weight tensor
        PreAct: (M, 2*N) - pre-activation tensor with gate and up projections interleaved
        activation: Type of gated activation
        out_dtype: Output dtype for dx
        postact_dtype: Output dtype for postact

    Returns:
        (dx, postact) where:
        - dx: (M, 2*N) gradient w.r.t. PreAct
        - postact: (M, N) post-activation output
    Nr   .r2   r$   TFr   r   )dim)rf   r   r   r   r   r   r&   r   r   stackreshapern   )rF   rY   r   ry   r=   r<   r   r   r   r"   r#   gate_requires_gradup_requires_gradr   dgatedupr   r   r   r   gemm_dgated_ref  s   



r   zquack::gemm_symmetric_outzz(Tensor A, Tensor B, Tensor(a2!) out, Tensor? C=None, bool dynamic_scheduler=False, float alpha=1.0, float beta=1.0) -> ()c                 C   s   | j dkr
| d} |j}|j dkr|d}|dur%|j dkr%|d}|j dkr0|d}n|}|r>tjdtj| jdnd}t| ||durI|nd|durP|nd|dddddd	d
||d dS )z&GEMM with guaranteed symmetric output.r2   r   Nr$   re   r0   r8   FT   )	tile_Mtile_N	cluster_M	cluster_Nr7   rg   rh   r]   r^   )rk   rl   rm   r&   ro   rp   r9   gemm_symmetric_sm90_sm100)rF   rY   rZ   r[   rT   r]   r^   rw   r   r   r   gemm_symmetric_out  s8   






r   c              	   C   s   |du r| j n|}| jdkr| jd |jd f}n| jd | jd |jd f}|du r4tj||| jd}t|tr;|nd}	t|trD|nd}
t| |||||	|
d |S )	zGEMM with symmetric output.Nr2   r   r   rd   re   rX   )rT   r]   r^   )	rf   rk   rn   r&   r   r9   r   r   r   )rF   rY   r[   rZ   r   rT   r]   r^   rv   	alpha_valbeta_valr   r   r   r     s   
r   )NNrX   rX   NNNNFFN)NNNNNFN)NNNTN)
NNrX   NNNNNFT)	NrX   NNNNNFT)NNrX   NNNN)
NrX   rX   NNNNNFT)rX   rX   NNNNNNFFT)NNrX   rX   NNNN)rX   rX   NNNNFT)
rX   rX   NNNNNNFT)NNNNNNNNNTFT)NNNNNFT)NNNNNNNT)	NNNNNNNTT)NNNTT)NNNNN)NNr)   NNNNT)NNNN)NFrX   rX   )NNNFrX   rX   )?typingr   r   r   	functoolsr   r&   torch.nn.functionalnn
functionalr   r   quack.gemm_configr   r   quack.autotunerr	   r
   quack.cute_dsl_utilsr   
quack.gemmr   rq   quack.gemm_actr   r|   quack.gemm_dactr   r   quack.gemm_symmetricr   r   r   r(   r   r   r9   default_device_capacityr:   dictrM   r   boolrx   r   r   rf   library	custom_opr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   <module>   s  	
	
K	
>	
<	

3	
%	

?	

;	
+	

H	

/	
(	

0	
	

	


$	

	
	

.
	
,
-	