o
    پiwQ                     @   s  d dl Z d dlmZ d dlmZ d dlmZmZmZ d dl	m
Z
mZ d dlmZ d dlmZ eeeef B ZeddddeeB defd	d
ZeddddedefddZeddddededefddZeddddedefddZeejddddededeeef fddZeddddedefddZeejddddededeeef fddZeddddedefddZeddddededeeef fddZeejddddedefddZeejddddededefd d!Zed"ddd#ded$edefd%d&Zedddded'edefd(d)Z ed"ddd#ded'eded$edeeeef f
d*d+Z!e	,d>dddded'ed-edefd.d/Z"e	,d>dddded'eded-edeeeef f
d0d1Z#edddded'edefd2d3Z$edddded'ededeeeef fd4d5Z%edddded'edefd6d7Z&eejdddded'ededeeeef fd8d9Z'edddded'edefd:d;Z(edddded'ededeeeef fd<d=Z)dS )?    N)Tuple)Float32Boolean
const_expr)Tdsl_user_op)llvmlocipareturnc             
   C   s4   t tjt t | j||dgddddtjjdS )Nr	   ztanh.approx.f32 $0, $1;z=f,fF)has_side_effectsis_align_stackasm_dialect)r   r   
inline_asmr   f32ir_value
AsmDialectAD_ATT)r   r
   r    r   D/home/ubuntu/.local/lib/python3.10/site-packages/quack/activation.pytanh   s   r   xc                C   sV   t t| t rddtd|    S td| }t|d t|d f}t|ddS )N      ?r   r   r      r   
isinstancetupler   utilsmul_packed_f32x2fma_packed_f32x2)r   r
   r   x_halftanh_x_halfr   r   r   sigmoid    s
   r%   outdoutc                C   s   || | |    S Nr   )r&   r'   r
   r   r   r   r   dsigmoid_from_output+   s   r)   c                C   sN   t t| t rtj| tdS tj| d tdtj| d tdfS N        r   r   )r   r   r   cutearchfmaxr   )r   r
   r   r   r   r   relu1   s   ,r/   c                C   s   t t| t rt| dk}|r|ntdtj| tdfS t| d dk}t| d dk}|r5|d ntd|r>|d ntdf}|t| fS )Nr   r+   r   )	r   r   r   r   r   r,   r-   r.   r/   )r   r'   r
   r   x_posx0_posx1_posdxr   r   r   drelu9   s   "(r4   c                C   s^   t t| t rtj| td|  S tj| d tdtj| d tdf}t|| S r*   )	r   r   r   r,   r-   r.   r   r    r!   )r   r
   r   relu_xr   r   r   relu_sqH   s   ,r6   c                C   sd   t t| t rt| }||  }d||  }||fS t| }t|| }tdt||}||fS )a  
    ReLU squared backward pass: computes gradient w.r.t. x and recomputes forward
    Given: relu_sq_out = max(x, 0) * x, and dout = grad w.r.t. relu_sq_out
    Returns: (dx, relu_sq_out) where:
    - dx = dout * 2 * x if x > 0, else 0
    - relu_sq_out = max(x, 0) * x
           @)r7   r7   )r   r   r   r/   r    r!   )r   r'   r
   r   r5   relu_sq_outr3   r   r   r   drelu_sqQ   s   r9   c          
   	   C   s   t dt j }d| }tt| t r&d| dt| ||| |        S t| | }t	|||f||f}t| |}t|d t|d f}t	|| | }	td|	S )z
    gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
            = 0.5 * x * (1 + tanh(x * (0.797885 + 0.0356774 * x * x)))
       Hm?r         ?r   r   r   
mathsqrtpir   r   r   r   r    r!   r"   )
r   r
   r   sqrt_2_over_pisqrt_2_over_pi_coeffx_sqx_sq_scaledztanh_zx_tanh_zr   r   r   gelu_tanh_approxj   s    rH   c                C   sj  t dt j }d| }d| }tt| t rN| |  }t| |||   }dd|  }	| |	 }
d||  }|||  }|	| d||    }|| }||
fS t| | }t	|||f||f}t| |}t|d t|d f}t	|dd}	t| |	}
t	||d  |d  fd}t	|||f||f}t||}t| |}t	|d|	}t||}||
fS )	a  
    GELU tanh approximation backward pass: computes gradient w.r.t. x and recomputes forward
    Given: gelu_out = 0.5 * x * (1 + tanh(x * (c1 + c2 * x^2))), and dout = grad w.r.t. gelu_out
    Returns: (dx, gelu_out)

    Derivative uses the chain rule:
    d/dx[gelu(x)] = 0.5 * (1 + tanh(z)) + 0.5 * x * sech^2(z) * dz/dx
    where z = x * (c1 + c2 * x^2), dz/dx = c1 + 3 * c2 * x^2
    and sech^2(z) = 1 - tanh^2(z)
    r:   r;   g      @r   r   r   r   r<   r<   r=   )r   r'   r
   r   rA   rB   sqrt_2_over_pi_coeff_3rC   rF   half_tanh_z_plus_onegelu_outsech2_zdz_dxdgelur3   rD   rE   sech2_dz_dxx_sech2_dz_dxr   r   r   dgelu_tanh_approx   s<   rR   c                C   s$  t t| t r$t| dk}|s"tjjttjj| ddd ddS | S t	tj
}t| ||f}tjj|d ddtjj|d ddf}t|d}tjj	|d ddtjj	|d ddf}td}	t||	|	f}
t| d dk}t| d dk}|s|
d n| d |s|
d fS | d fS )	N      4@Tfastmathr<   r   r   rI   r7   )r   r   r   r   r,   r>   logr   explog2er    r!   add_packed_f32x2)r   r
   r   
use_linearlog2_ex_log2ex_expx_exp_p1log_x_exp_p1ln2
softplus_xuse_linear_0use_linear_1r   r   r   softplus   s.   $(

re   c                C   s2   t | dk}||tjj|  dd  }|s|S |S )NrS   TrT   )r   r,   r>   rW   )r&   r'   r
   r   r[   r3   r   r   r   dsoftplus_from_output   s   rf   F)already_halvedr
   r   rg   c                C   sv   t t| t rt | rd|  n| }|t| | S t | r&td| n| }t|d t|d f}t|||S )z
    silu(x) = x * sigmoid(x) = x * (1 + tanh(x / 2)) / 2 = (0.5 * x) * tanh(0.5 * x) + (0.5 * x)
    This compiles down to 3 SASS instructions: FMUL to get 0.5 * x, MUFU.TANH, and FFMA.
    r   r   r   r   r   )r   rg   r
   r   r#   r$   r   r   r   silu   s   rh   yc                C   ,   t t| t rt| | S tt| |S r(   )r   r   r   rh   r    r!   r   ri   r
   r   r   r   r   swiglu   s   rl   c                C   s8  t t| t rCt | rt| }| | }nt| }d| d }| | |  }|| }	|||  | |	 }
|
| }|	}|| }|||fS t | rSt| }t| |}nt| d t| d f}t|dd}t| || }t||}	t||d  |d  f|}t|||	}
t|
|}|	}t||}|||fS )a  
    SwiGLU backward pass: computes gradients w.r.t. x (gate) and y (up projection)
    Given: swiglu_out = silu(x) * y, and dout = grad w.r.t. swiglu_out
    Returns: (dx, dy, swiglu_out) where dx = dout * y * d_silu(x), dy = dout * silu(x)

    d_silu(x) = sigmoid(x) * (1 + x * (1 - sigmoid(x)))

    This has been optimized to use fewer instructions (i.e. we expand things out
    to use FFMA instead of FADD and FMUL).
    r   r   r   r   )r   r   r   r%   r   r    r!   r"   )r   ri   r'   rg   r
   r   	sigmoid_xsilu_xtanh_xsilu_x_doutd_silu_x_doutr3   dy
swiglu_out sigmoid_x_minus_silu_x_sigmoid_xr   r   r   dswiglu  s8   




ru   Zd;?alphac          	      C   s   t t| t rd|  }|t||  | }|| | S td| }t||f|}t|d t|d f}t|||}t|||S )a"  The swiglu variant used in gpt-oss, which has a scaling factor on x and bias of 1 to y.
    https://github.com/openai/gpt-oss/blob/7be9334950053a888e24887a57dac797a17d6e00/gpt_oss/torch/model.py#L249
    x * sigmoid(alpha * x) * (y + 1)
    Compile down to FMUL, FMUL, TANH, FFMA, FFMA
    r   r   r   r   r   )	r   ri   rw   r
   r   r#   rn   alpha_x_halftanh_alpha_x_halfr   r   r   
swiglu_oaiA  s   
rz   c                C   s6  t t| t r=d| |  }ddt|  }| | }|| }	|||||    | }
|
| |
 }|	}|| | }|||fS td| d| f| }t|d t|d f}t|dd}t| |}t||}	t||d  |d  f|}t||f||}t||}
t|
||
}|	}t|||}|||fS )au  
    Swiglu OAI backward pass: computes gradients w.r.t. x and y
    Given: swiglu_oai_out = x * sigmoid(alpha * x) * (y + 1), and dout = grad w.r.t. swiglu_oai_out
    Returns: (dx, dy, swiglu_oai_out)

    Derivative of x * sigmoid(alpha * x) w.r.t. x:
    d/dx[x * sigmoid(alpha * x)] = sigmoid(alpha * x) + alpha * x * sigmoid(alpha * x) * (1 - sigmoid(alpha * x))
    r   r   r   r   r   )r   ri   r'   rw   r
   r   rx   sigmoid_alpha_xrn   rp   rq   r3   rr   rs   ry   silu_x_minus_productsigmoid_plus_alpha_diffr   r   r   dswiglu_oaiX  s4   


r~   c                C   s4   t t| t rt| }|| S t| }t||S )zGLU: Gated Linear Unit
    glu(x, y) = sigmoid(x) * y
    Using tanh to compute sigmoid: sigmoid(x) = 0.5 * (1 + tanh(x/2))
    )r   r   r   r%   r    r!   )r   ri   r
   r   rm   r   r   r   glu  s
   r   c                C   s   t t| t r!t| }|| }|| }|| | }|}	||	|fS t| }t||}t||}t||}
t|
|}|}	||	|fS )a/  
    GLU backward pass: computes gradients w.r.t. x (gate) and y (up projection)
    Given: glu_out = sigmoid(x) * y, and dout = grad w.r.t. glu_out
    Returns: (dx, dy, glu_out) where:
    - dx = dout * y * sigmoid(x) * (1 - sigmoid(x))
    - dy = dout * sigmoid(x)
    - glu_out = sigmoid(x) * y
    )r   r   r   r%   r    r!   sub_packed_f32x2)r   ri   r'   r
   r   rm   sigmoid_x_doutglu_outr3   rr   y_minus_glu_outr   r   r   dglu  s   

r   c                C   s:   t t| t rtj| td| S t| }t	||S )zPReGLU: ReLU Gated Linear Unit
    reglu(x, y) = relu(x) * y = max(x, 0) * y
    r+   )
r   r   r   r,   r-   r.   r   r/   r    r!   )r   ri   r
   r   r5   r   r   r   reglu  s   r   c                C   s   t t| t r.t| dk}tj| td}|r|| ntd}|| }|| }	|||	fS t| d dk}
t| d dk}t| }t	
||}|
rN|d ntd|rW|d ntdf}t	
||}t	
||}	|||	fS )a!  
    ReGLU backward pass: computes gradients w.r.t. x (gate) and y (up projection)
    Given: reglu_out = relu(x) * y, and dout = grad w.r.t. reglu_out
    Returns: (dx, dy, reglu_out) where:
    - dx = dout * y if x > 0, else 0
    - dy = dout * relu(x)
    - reglu_out = relu(x) * y
    r   r+   r   )r   r   r   r   r,   r-   r.   r   r/   r    r!   )r   ri   r'   r
   r   r0   r5   r3   rr   	reglu_outr1   r2   dout_yr   r   r   dreglu  s   
(
r   c                C   rj   )zhGeGLU: GELU Gated Linear Unit
    geglu(x, y) = gelu(x) * y
    Uses the tanh approximation of GELU
    )r   r   r   rH   r    r!   rk   r   r   r   geglu  s   r   c          
      C   s|   t t| t r t| |\}}|| }|| }|| }	|||	fS t| |\}}t||}t||}t||}	|||	fS )a  
    GeGLU backward pass: computes gradients w.r.t. x (gate) and y (up projection)
    Given: geglu_out = gelu(x) * y, and dout = grad w.r.t. geglu_out
    Returns: (dx, dy, geglu_out) where:
    - dx = dout * y * d_gelu(x)
    - dy = dout * gelu(x)
    - geglu_out = gelu(x) * y
    )r   r   r   rR   r    r!   )
r   ri   r'   r
   r   dgelu_x_doutgelu_xr3   rr   	geglu_outr   r   r   dgeglu  s   

r   )rv   )*r>   typingr   cutlass.cuter,   cutlassr   r   r   cutlass.cutlass_dslr   r   cutlass._mlir.dialectsr   quack.utilsr    F32_or_F32x2floatr   r%   r)   r/   jitr4   r6   r9   rH   rR   re   rf   boolrh   rl   ru   rz   r~   r   r   r   r   r   r   r   r   r   r   <module>   s   
 


>"" ?/ #  