o
    (i                     @   s   d dl mZ d dlZd dlZd dlmZ 		ddejdejdejdedeej d	eej d
dfddZ			ddejdejdejdedeej deej deej d
dfddZ	dS )    )OptionalN)rms_norm_kernelxweightoutepsin_scale	out_scalereturnc           
      C   s   | j \}}t|}tdtd|d }	t|f di d|d|d| d| dd	|d
dddd|d|d|dd|d|d|d|dud|duddddd|	 dS )z[RMS norm.

    Computes `out[i,j] = x[i,j] * weight[j] / sqrt(eps + sum(x[i]^2) / n)`.
              nbx_ptrx_strider   x_scale_ptrr_ptrNr_stridew_ptro_ptro_strideo_scale_ptrEPS
BLOCK_SIZEHAS_IN_SCALEHAS_OUT_SCALE
HAS_OUTPUTTHAS_RESIDUALF	num_warps )shapetritonnext_power_of_2maxminr   stride)
r   r   r   r   r   r	   r   r   
block_sizer   r    r    S/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/flashinfer/triton/norm.pyrms_norm	   sP   


	

r)   residualx_out
x_in_scalex_out_scalec                 C   s>  | j \}}| j |j ksJ | d|dksJ t|}	tdt|	d}
t|f di d|d|d| d| dd|d|d	|dd
|d|d|durX|dn$dd|d|d|	d|dud|dud|duddd|
 dS d|d|d|	d|dud|dud|duddd|
 dS )zmIn-place RMS norm with fused residual addition.

    Computes `r = r + x`, followed by `x = rmsnorm(r)`.
    r   r   r   r   r   r   r   r   r   r   r   r   Nr   r   r   r   r   r   r   Tr   r    )r!   r&   r"   r#   r%   cdivr   )r   r*   r   r   r+   r,   r-   r   r   r'   r   r    r    r(   rms_norm_add_residual1   sx   



	

r/   )NN)NNN)
typingr   torchr"   flashinfer.triton.kernels.normr   Tensorfloatr)   r/   r    r    r    r(   <module>   sR    
-