o
    ÒÙ¾i¬  ã                   @   s´   d dl mZ d dlZd dlZd dlmZ 		ddejdejdejdedeej d	eej d
dfdd„Z			ddejdejdejdedeej deej deej d
dfdd„Z	dS )é    )ÚOptionalN)Úrms_norm_kernelÚxÚweightÚoutÚepsÚin_scaleÚ	out_scaleÚreturnc           
      C   s¼   | j \}}t |¡}tdtd|d ƒƒ}	t|f di d|“d|“d| “d|  d¡“d	|“d
d“dd“d|“d|“d| d¡“d|“d|“d|“d|du“d|du“dd“dd“d|	“Ž dS )z[RMS norm.

    Computes `out[i,j] = x[i,j] * weight[j] / sqrt(eps + sum(x[i]^2) / n)`.
    é   é    é   ÚnÚbÚx_ptrÚx_strider   Úx_scale_ptrÚr_ptrNÚr_strideÚw_ptrÚo_ptrÚo_strideÚo_scale_ptrÚEPSÚ
BLOCK_SIZEÚHAS_IN_SCALEÚHAS_OUT_SCALEÚ
HAS_OUTPUTTÚHAS_RESIDUALFÚ	num_warps© )ÚshapeÚtritonÚnext_power_of_2ÚmaxÚminr   Ústride)
r   r   r   r   r   r	   r   r   Ú
block_sizer   r    r    úJ/home/ubuntu/.local/lib/python3.10/site-packages/flashinfer/triton/norm.pyÚrms_norm	   sP   

ÿþý
üûúùø	÷

öõôóòñðïîr)   ÚresidualÚx_outÚ
x_in_scaleÚx_out_scalec                 C   s>  | j \}}| j |j ksJ ‚|  d¡| d¡ksJ ‚t |¡}	tdt |	d¡ƒ}
t|f di d|“d|“d| “d|  d¡“d|“d|“d	| d¡“d
|“d|“d|durX| d¡n$d“d|“d|“d|	“d|du“d|du“d|du“dd“d|
“Ž dS “d|“d|“d|	“d|du“d|du“d|du“dd“d|
“Ž dS )zmIn-place RMS norm with fused residual addition.

    Computes `r = r + x`, followed by `x = rmsnorm(r)`.
    r   r   r   r   r   r   r   r   r   r   r   r   Nr   r   r   r   r   r   r   Tr   r    )r!   r&   r"   r#   r%   Úcdivr   )r   r*   r   r   r+   r,   r-   r   r   r'   r   r    r    r(   Úrms_norm_add_residual1   sx   

ÿþý
üûú
ùø	÷
öõôóòñðïî
öõôóòñðïîr/   )NN)NNN)
Útypingr   Útorchr"   Úflashinfer.triton.kernels.normr   ÚTensorÚfloatr)   r/   r    r    r    r(   Ú<module>   sR    úÿþýüûú
ù-ùÿþýüûúùø