o
    c۷iU                     @   s2  d dl Z d dlmZ d dlmZ d dlmZ d dlmZm	Z	m
Z
 d dlmZmZ d dlmZmZ eeeef B ZeejjdejdZedddd	eeB d
efddZedddded
efddZeddddeded
efddZedddded
efddZeejddddeded
eeef fddZedddded
efddZeejddddeded
eeef fddZedddded
efddZeddddeded
eeef fddZeejdddded
efd d!Z eejddddeded
efd"d#Z!ed$ddd%ded&e"d
efd'd(Z#edddded)ed
efd*d+Z$ed$ddd%ded)eded&e"d
eeeef f
d,d-Z%e	.dCdddded)ed/ed
efd0d1Z&e	.dCdddded)eded/ed
eeeef f
d2d3Z'edddded)ed
efd4d5Z(edddded)eded
eeeef fd6d7Z)edddded)ed
efd8d9Z*eejdddded)eded
eeeef fd:d;Z+edddded)ed
efd<d=Z,edddded)eded
eeeef fd>d?Z-de#eeed@Z.deeedAZ/e$e&e*e,e(dBZ0e%e'e+e-e)dBZ1dS )D    N)Tuple)partial)Float32Boolean
const_expr)Tdsl_user_op)llvmnvvm)src_c	calc_funclocipareturnc             
   C   s4   t tjt t | j||dgddddtjjdS )Nr   ztanh.approx.f32 $0, $1;z=f,fF)has_side_effectsis_align_stackasm_dialect)r   r	   
inline_asmr   f32ir_value
AsmDialectAD_ATT)r   r   r    r   F/home/ubuntu/vllm_env/lib/python3.10/site-packages/quack/activation.pytanh   s   r   xc                C   sZ   t t| t rddtd|    S tjd| }t|d t|d f}tj|ddS )N      ?r   r   r      r   
isinstancetupler   cutearchmul_packed_f32x2fma_packed_f32x2)r   r   r   x_halftanh_x_halfr   r   r   sigmoid&   s
   r*   outdoutc                C   s   || | |    S Nr   )r+   r,   r   r   r   r   r   dsigmoid_from_output1   s   r.   c                C   sN   t t| t rtj| tdS tj| d tdtj| d tdfS N        r   r    )r   r"   r#   r$   r%   fmaxr   )r   r   r   r   r   r   relu7   s   ,r2   c                C   s   t t| t rt| dk}|r|ntdtj| tdfS t| d dk}t| d dk}|r5|d ntd|r>|d ntdf}|t| fS )Nr   r0   r    )	r   r"   r#   r   r   r$   r%   r1   r2   )r   r,   r   r   x_posx0_posx1_posdxr   r   r   drelu?   s   "(r7   c                C   s`   t t| t rtj| td|  S tj| d tdtj| d tdf}tj|| S r/   )r   r"   r#   r$   r%   r1   r   r&   )r   r   r   relu_xr   r   r   relu_sqN   s   ,r9   c                C   sj   t t| t rt| }||  }d||  }||fS t| }tj|| }tjdtj||}||fS )a  
    ReLU squared backward pass: computes gradient w.r.t. x and recomputes forward
    Given: relu_sq_out = max(x, 0) * x, and dout = grad w.r.t. relu_sq_out
    Returns: (dx, relu_sq_out) where:
    - dx = dout * 2 * x if x > 0, else 0
    - relu_sq_out = max(x, 0) * x
           @)r:   r:   )r   r"   r#   r2   r$   r%   r&   )r   r,   r   r   r8   relu_sq_outr6   r   r   r   drelu_sqW   s   r<   c          
   	   C   s   t dt j }d| }tt| t r&d| dt| ||| |        S tj	| | }tj
|||f||f}tj	| |}t|d t|d f}tj
|| | }	tj	d|	S )z
    gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
            = 0.5 * x * (1 + tanh(x * (0.797885 + 0.0356774 * x * x)))
       Hm?r         ?r   r    r   mathsqrtpir   r"   r#   r   r$   r%   r&   r'   )
r   r   r   sqrt_2_over_pisqrt_2_over_pi_coeffx_sqx_sq_scaledztanh_zx_tanh_zr   r   r   gelu_tanh_approxp   s    rK   c                C   s  t dt j }d| }d| }tt| t rN| |  }t| |||   }dd|  }	| |	 }
d||  }|||  }|	| d||    }|| }||
fS tj	| | }tj
|||f||f}tj	| |}t|d t|d f}tj
|dd}	tj	| |	}
tj
||d  |d  fd}tj
|||f||f}tj	||}tj	| |}tj
|d|	}tj	||}||
fS )	a  
    GELU tanh approximation backward pass: computes gradient w.r.t. x and recomputes forward
    Given: gelu_out = 0.5 * x * (1 + tanh(x * (c1 + c2 * x^2))), and dout = grad w.r.t. gelu_out
    Returns: (dx, gelu_out)

    Derivative uses the chain rule:
    d/dx[gelu(x)] = 0.5 * (1 + tanh(z)) + 0.5 * x * sech^2(z) * dz/dx
    where z = x * (c1 + c2 * x^2), dz/dx = c1 + 3 * c2 * x^2
    and sech^2(z) = 1 - tanh^2(z)
    r=   r>   g      @r   r    r   r   r?   r?   r@   )r   r,   r   r   rD   rE   sqrt_2_over_pi_coeff_3rF   rI   half_tanh_z_plus_onegelu_outsech2_zdz_dxdgelur6   rG   rH   sech2_dz_dxx_sech2_dz_dxr   r   r   dgelu_tanh_approx   s<    rU   c                C   s*  t t| t r$t| dk}|s"tjjttjj| ddd ddS | S t	tj
}tj| ||f}tjj|d ddtjj|d ddf}tj|d}tjj	|d ddtjj	|d ddf}td}	tj||	|	f}
t| d dk}t| d dk}|s|
d n| d |s|
d fS | d fS )	N      4@Tfastmathr?   r   r    rL   r:   )r   r"   r#   r   r$   rA   logr   explog2er%   r&   add_packed_f32x2)r   r   r   
use_linearlog2_ex_log2ex_expx_exp_p1log_x_exp_p1ln2
softplus_xuse_linear_0use_linear_1r   r   r   softplus   s.   $(

rh   c                C   s2   t | dk}||tjj|  dd  }|s|S |S )NrV   TrW   )r   r$   rA   rZ   )r+   r,   r   r   r^   r6   r   r   r   dsoftplus_from_output   s   ri   F)already_halvedr   r   rj   c                C   sz   t t| t rt | rd|  n| }|t| | S t | r'tjd| n| }t|d t|d f}tj|||S )z
    silu(x) = x * sigmoid(x) = x * (1 + tanh(x / 2)) / 2 = (0.5 * x) * tanh(0.5 * x) + (0.5 * x)
    This compiles down to 3 SASS instructions: FMUL to get 0.5 * x, MUFU.TANH, and FFMA.
    r   r   r   r    r!   )r   rj   r   r   r(   r)   r   r   r   silu   s   rk   yc                C   .   t t| t rt| | S tjt| |S r-   )r   r"   r#   rk   r$   r%   r&   r   rl   r   r   r   r   r   swiglu   s   ro   c                C   sH  t t| t rCt | rt| }| | }nt| }d| d }| | |  }|| }	|||  | |	 }
|
| }|	}|| }|||fS t | rTt| }tj| |}nt| d t| d f}tj|dd}tj| || }tj||}	tj||d  |d  f|}tj|||	}
tj|
|}|	}tj||}|||fS )a  
    SwiGLU backward pass: computes gradients w.r.t. x (gate) and y (up projection)
    Given: swiglu_out = silu(x) * y, and dout = grad w.r.t. swiglu_out
    Returns: (dx, dy, swiglu_out) where dx = dout * y * d_silu(x), dy = dout * silu(x)

    d_silu(x) = sigmoid(x) * (1 + x * (1 - sigmoid(x)))

    This has been optimized to use fewer instructions (i.e. we expand things out
    to use FFMA instead of FADD and FMUL).
    r   r   r    r   )	r   r"   r#   r*   r   r$   r%   r&   r'   )r   rl   r,   rj   r   r   	sigmoid_xsilu_xtanh_xsilu_x_doutd_silu_x_doutr6   dy
swiglu_out sigmoid_x_minus_silu_x_sigmoid_xr   r   r   dswiglu  s<   




rx   Zd;?alphac          	      C   s   t t| t rd|  }|t||  | }|| | S tjd| }tj||f|}t|d t|d f}tj|||}tj|||S )a"  The swiglu variant used in gpt-oss, which has a scaling factor on x and bias of 1 to y.
    https://github.com/openai/gpt-oss/blob/7be9334950053a888e24887a57dac797a17d6e00/gpt_oss/torch/model.py#L249
    x * sigmoid(alpha * x) * (y + 1)
    Compile down to FMUL, FMUL, TANH, FFMA, FFMA
    r   r   r   r    r!   )	r   rl   rz   r   r   r(   rq   alpha_x_halftanh_alpha_x_halfr   r   r   
swiglu_oaiI  s   
r}   c                C   sH  t t| t r=d| |  }ddt|  }| | }|| }	|||||    | }
|
| |
 }|	}|| | }|||fS tjd| d| f| }t|d t|d f}tj|dd}tj| |}tj||}	tj||d  |d  f|}tj||f||}tj||}
tj|
||
}|	}tj|||}|||fS )au  
    Swiglu OAI backward pass: computes gradients w.r.t. x and y
    Given: swiglu_oai_out = x * sigmoid(alpha * x) * (y + 1), and dout = grad w.r.t. swiglu_oai_out
    Returns: (dx, dy, swiglu_oai_out)

    Derivative of x * sigmoid(alpha * x) w.r.t. x:
    d/dx[x * sigmoid(alpha * x)] = sigmoid(alpha * x) + alpha * x * sigmoid(alpha * x) * (1 - sigmoid(alpha * x))
    r   r   r    r   r!   )r   rl   r,   rz   r   r   r{   sigmoid_alpha_xrq   rs   rt   r6   ru   rv   r|   silu_x_minus_productsigmoid_plus_alpha_diffr   r   r   dswiglu_oai`  s4   


r   c                C   s6   t t| t rt| }|| S t| }tj||S )zGLU: Gated Linear Unit
    glu(x, y) = sigmoid(x) * y
    Using tanh to compute sigmoid: sigmoid(x) = 0.5 * (1 + tanh(x/2))
    )r   r"   r#   r*   r$   r%   r&   )r   rl   r   r   rp   r   r   r   glu  s
   r   c                C   s   t t| t r!t| }|| }|| }|| | }|}	||	|fS t| }tj||}tj||}t||}
tj|
|}|}	||	|fS )a/  
    GLU backward pass: computes gradients w.r.t. x (gate) and y (up projection)
    Given: glu_out = sigmoid(x) * y, and dout = grad w.r.t. glu_out
    Returns: (dx, dy, glu_out) where:
    - dx = dout * y * sigmoid(x) * (1 - sigmoid(x))
    - dy = dout * sigmoid(x)
    - glu_out = sigmoid(x) * y
    )r   r"   r#   r*   r$   r%   r&   sub_packed_f32x2)r   rl   r,   r   r   rp   sigmoid_x_doutglu_outr6   ru   y_minus_glu_outr   r   r   dglu  s   


r   c                C   s<   t t| t rtj| td| S t| }tj||S )zPReGLU: ReLU Gated Linear Unit
    reglu(x, y) = relu(x) * y = max(x, 0) * y
    r0   )	r   r"   r#   r$   r%   r1   r   r2   r&   )r   rl   r   r   r8   r   r   r   reglu  s   r   c                C   s   t t| t r.t| dk}tj| td}|r|| ntd}|| }|| }	|||	fS t| d dk}
t| d dk}t| }tj	||}|
rO|d ntd|rX|d ntdf}tj	||}tj	||}	|||	fS )a!  
    ReGLU backward pass: computes gradients w.r.t. x (gate) and y (up projection)
    Given: reglu_out = relu(x) * y, and dout = grad w.r.t. reglu_out
    Returns: (dx, dy, reglu_out) where:
    - dx = dout * y if x > 0, else 0
    - dy = dout * relu(x)
    - reglu_out = relu(x) * y
    r   r0   r    )
r   r"   r#   r   r$   r%   r1   r   r2   r&   )r   rl   r,   r   r   r3   r8   r6   ru   	reglu_outr4   r5   dout_yr   r   r   dreglu  s   
(
r   c                C   rm   )zhGeGLU: GELU Gated Linear Unit
    geglu(x, y) = gelu(x) * y
    Uses the tanh approximation of GELU
    )r   r"   r#   rK   r$   r%   r&   rn   r   r   r   geglu  s   r   c          
      C   s   t t| t r t| |\}}|| }|| }|| }	|||	fS t| |\}}tj||}tj||}tj||}	|||	fS )a  
    GeGLU backward pass: computes gradients w.r.t. x (gate) and y (up projection)
    Given: geglu_out = gelu(x) * y, and dout = grad w.r.t. geglu_out
    Returns: (dx, dy, geglu_out) where:
    - dx = dout * y * d_gelu(x)
    - dy = dout * gelu(x)
    - geglu_out = gelu(x) * y
    )r   r"   r#   rU   r$   r%   r&   )
r   rl   r,   r   r   dgelu_x_doutgelu_xr6   ru   	geglu_outr   r   r   dgeglu  s   

r   )Nrk   r2   r9   rK   )Nr2   r9   rK   )ro   r}   r   r   r   )ry   )2rA   typingr   	functoolsr   cutlass.cuter$   cutlassr   r   r   cutlass.cutlass_dslr   r   cutlass._mlir.dialectsr	   r
   F32_or_F32x2r%   calc_packed_f32x2_opr   floatr   r*   r.   r2   jitr7   r9   r<   rK   rU   rh   ri   boolrk   ro   rx   r}   r   r   r   r   r   r   r   
act_fn_mapdact_fn_mapgate_fn_mapdgate_fn_mapr   r   r   r   <module>   sT   
 


>"" A/ #  "		
