o
    c۷i                     @   s  d dl Z d dlmZmZmZ d dlmZmZ d dlm	  m
Z d dlZd dlmZ d dlmZmZmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlm Z  d d	l!m"Z" d d
l#m$Z$ G dd de Z%ej&j'ddddd							d[dedee dedee dee dee dee dee de(de)ddfdd Z*e*j+							d[dedee dedee dee dee dee dee de(de)ddfd!d"Z,edd#d$d% Z-							d[dedee dee dee d&eej. d'eej. de(d(e)deeeee f fd)d*Z/d\d+d,Z0d]d-d.Z1G d/d0 d0e Z2d1e3d2ej4de3fd3d4Z5ej&j'd5h d6dd7d				d^dedee d8eded9ed:ee d;ee d<ee d=ee d>ee3 ddfd?d@Z6e6j+				d^dedee d8eded9ed:ee d;ee d<ee d=ee d>ee3 ddfdAdBZ7edd#dCdD Z8			d_dedee d8eded<ee dEe)dFe)deeee ee ee f fdGdHZ9G dIdJ dJej:j;Z<							d[dedee dee dee d&eej. d'eej. de(dKe)defdLdMZ=G dNdO dOej>j%Z?				d`dededee de(dPe)dQe)fdRdSZ@d]dedTede(defdUdVZAd]dejde(fdWdXZBdejdejfdYdZZCdS )a    N)OptionalTupleType)	lru_cachepartial)Float32Int32
const_expr)Tensor)make_fake_tensor)
row_reduce)ReductionBasecompile_and_cache)torch2cute_dtype_mapc                       s  e Zd Zddeej dedef fddZdd Z	d	d
 Z
ejdejdeej deej deej dejdeej deej deej dedejfddZejdejdeej deej deej dejdeej deej deej dedejdejdeje fddZ  ZS )RMSNormFdtypeNis_layernormc                    sD   t  j|||r	dndd || _||rdndkrd nd| _d| _d S )N      )stage @      smemF)super__init__r   reload_fromdelay_w_load)selfr   r   r   	__class__ C/home/ubuntu/vllm_env/lib/python3.10/site-packages/quack/rmsnorm.pyr      s   
zRMSNorm.__init__c                 C   (   | j }dD ]\}}||kr|  S qdS )N)@            )i       )i   r&   )r   r)      r   r   r   limitthreadsr"   r"   r#   _threads_per_row!      zRMSNorm._threads_per_rowc                 C   sT   | j }t| jjdkrg d}ng d}|D ]\}}||kr$|| _ d S qd| _d S )Nr*   ))r   r   )   r   )      )   r'   ))r3   r   )r4   r   )r6   r5   )i   r'   )r   r	   r   width	cluster_n)r   r   
thresholdsr/   clusterr"   r"   r#   _set_cluster_n(   s   

zRMSNorm._set_cluster_nmXmWmBmResmOmResOmRstdmMeanepsstreamc                    s  |j  jksJ    ttdd ||||||fD  }t jd| } j|d\}}|j	}fdd||fD \}} fdd||fD \}} 
|||||||||	||jt|jd d  jd	g|d	d	gt jd	kr{d	 jd	gnd |
d
 d S )Nc                 s        | ]}|d ur|j jV  qd S Nelement_typer7   .0tr"   r"   r#   	<genexpr>G       z#RMSNorm.__call__.<locals>.<genexpr>r)   vecsizec                    s0   g | ]}t |d urtj|d d dnd qS )Nr   dimsize)r	   layout_utilsexpandrK   mT)tiler_mnr"   r#   
<listcomp>L   s    "z$RMSNorm.__call__.<locals>.<listcomp>c                    s.   g | ]}t |d urtj|d jdnd qS )Nr   rQ   )r	   rT   rU   r   rV   r   r"   r#   rY   P        r   r   gridblockr:   rE   )rI   r   r;   r	   maxmathgcdr   _get_tiled_copyrS   kernellaunchcuteceil_divshaper8   )r   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   largest_dtype_widthrP   
tiled_copythreads_per_rownum_threadsr"   )r   rX   r#   __call__6   s.   




zRMSNorm.__call__rX   ri   rj   c           ?   
      s  t j \}}}t j \ }}t| jdkrtdnt j d |j}tj	 }|j
|jt jdddd}t|d urL|j
|jt jdddd}| ||\}}|j}t |} fdd|||||||fD \}}}}}}}fd	d||fD \}}||} t|d ur| |nd }!t|d ur| |nd }"| |}#| |}$t|d ur| |}%| |}&| |}'t|d ur| |}(t|d ur| |nd })t|d ur| |nd }*| |d
 }+t|d urt |!nd },t|d urt |"nd }-dd |#|'fD \}.}/t|d urt |%}0t |t jj }1| |||1 t|d d | j k}2|2sKtj| ||d dnd }3ttj|3d}4|+d d }5|5|d k rv|4|#|$dd t|d urv|4|%|&dd t j  t| j rt|d ur|4|!|, t|d ur|4|"|- t jd t |$|. |.  t j!}6t|d urt |&|0 |6|0  t j!7 }6t|d urt |(}7|7"|6 |7j |5|d k r|4|7|( d\}8}9t| j#rt$|6t j%j&||d t| jdkr|d nd dt| jdkrt jj'nd d}:|:|d  }8t|d urF|+d d dkrF|5|d k rF| jdksBt j( dkrF|8|*d< t| j)dkrtt |$|. |.  t j!}6t|d urst |&|0 |6|0  t j!7 }6n+t| j)dkr|4|#|. |.  t j!}6t|d ur|4|%|0 |6|0  t j!7 }6t$|6|8 |6|8  t j%j&||d t| jdkr|d nd dd};t j*j+|;|d  |	 dd}9n/td}8t$|6|6 t j%j&||d |dt| jdkrt jj'nd d}<t j*j+|<|d  |	 dd}9t|d ur&|+d d dkr&|5|d k r&| jdks"t j( dkr&|9|)d< t| jrDt|d ur8|4|!|, t|d urD|4|"|- t| j)dkpO| j)dkrt| j)dkrnt |$|. t|d urmt |&|0 n|4|#|. t|d ur|4|%|0 |.  t j!}6t|d ur|6|0  t j!7 }6t| j#r|6|8 |9 n|6|9 }=|=}>t|d ur|>|,  t j!9 }>t|d ur|>|-  t j!7 }>|/"|> |/j |5|d k r|4|/|' d S d S )Nr   r   r   r   orderr*   byte_alignmentc                    s*   g | ]}|d urt | fnd qS rG   re   
local_tilerV   bidx	cluster_yrX   r"   r#   rY          z"RMSNorm.kernel.<locals>.<listcomp>c                    s.   g | ]}t |d urt|d fnd qS )Nr   r	   re   rs   rV   rv   rX   r"   r#   rY      r[   )r   NNNc                 S   s   g | ]}t |qS r"   re   make_fragment_likerJ   r"   r"   r#   rY      s    r/   predTis_asyncNNNNr           )init_valhook_fnr   gmem)NNr   )r   )fastmath),re   arch
thread_idx	block_idxr	   r8   layout_tv_tiledcutlassutilsSmemAllocatorallocate_tensorrI   make_ordered_layout#_allocate_reduction_buffer_and_mbarrg   make_identity_tensor	get_slicepartition_Spartition_Dr|   rS   	WARP_SIZE_initialize_cluster
copy_utilspredicate_kr   copycp_async_commit_groupr   cp_async_wait_groupautovec_copyloadtor   storer   r   ReductionOpADDcluster_waitblock_idx_in_clusterr   r`   rsqrt)?r   r<   r=   r>   r?   r@   rA   rB   rC   rD   rX   ri   rj   tidx_	tv_layoutr   sXsResreduction_buffermbar_ptrrg   idXgXgResgOgResOgRstdgMeancXgWgB
thr_copy_XtXgWtXgBtXgXtXsXtXgRestXsRestXgOtXgResOtXrRstdtXrMeantXcXtXrWtXrBtXrXtXrOtXrRes	num_warps	is_even_NtXpXr   rowxtXrResOmeanrstdsum_xsum_sq_x_sub_meansum_sq_xx_hatyr"   rt   r#   rc   ]   s4  $















	

	



 zRMSNorm.kernel)F)__name__
__module____qualname__r   r   Numericintboolr   r1   r;   re   jitr
   r   r   cudaCUstreamrl   rc   Shape	TiledCopy	Constexpr__classcell__r"   r"   r    r#   r      sh    "	
&	
r   zquack::_rmsnorm_fwd)outr   r   residual_outr   z(Tensor x, Tensor? weight, Tensor(a2!) out, Tensor? bias, Tensor(a4!)? rstd, Tensor(a5!)? mean, Tensor? residual, Tensor(a7!)? residual_out, float eps=1e-6, bool is_layernorm=False) -> ())mutates_argsdevice_typesschemaư>Fr   weightr   biasr   r   residualr   rD   r   returnc
                 C   s   t jt jt jh}
| j|
v sJ d|dur|j|
v sJ d|dur+|j|
v s+J d| j\}}dd | |||||fD \}}}}}}t||||||||du|du|	
| ||||||||	 dS )aA  RMSNorm/LayerNorm forward pass.
    Args:
        x: Input tensor of shape (M, N)
        weight: Optional weight tensor of shape (N,)
        eps: Small value for numerical stability
        is_layernorm: If True, compute LayerNorm instead of RMSNorm
    Returns:
        Normalized output tensor of same shape as x
    Unsupported dtypeN+Weight must be float32, float16 or bfloat16.Residual must be float16, bfloat16, or float32c                 S   "   g | ]}|d urt |j nd qS rG   r   r   rJ   r"   r"   r#   rY   F      z _rmsnorm_fwd.<locals>.<listcomp>)torchfloat16bfloat16float32r   rg   _compile_rmsnorm_fwd)r   r   r   r   r   r   r   r   rD   r   supported_typesr   r   r   	out_dtypeweight_dtype
bias_dtype	res_dtyperes_out_dtyper"   r"   r#   _rmsnorm_fwd!  s0   
r   c
                 C   s   ddl m}
 |
rLt| dtjsN| d}dd | |||||fD \}}}}}}t||||||||d u|d u|	
 t||||||d u|||d u	 d S d S d S )Nr   COMPILE_ONLYr   c                 S   r   rG   r   rJ   r"   r"   r#   rY   j  r   z%_rmsnorm_fwd_fake.<locals>.<listcomp>)quack.cache_utilsr   
isinstancerS   r   SymIntr   _compile_rmsnorm_bwd)r   r   r   r   r   r   r   r   rD   r   r   r   r   r   r   r   r   r   r"   r"   r#   _rmsnorm_fwd_fakeX  s<   
r  )maxsizec
                    sB   d	 f}
 	f
dd}t |
|S )Nrmsnorm_fwdc            	         s   t   	
g} tjgdd | D R   fdd	
fD \}}}}fddfD \}}rCtt fnd }rMtt fnd }t jtd||||||||tdt jj	dd	d
dS )Nc                 s   "    | ]}|d urd|j  V  qd S Nr)   r7   rK   dtr"   r"   r#   rM          z9_compile_rmsnorm_fwd.<locals>._compile.<locals>.<genexpr>c                       g | ]
}t | fqS r"   fake_tensorr  r   	batch_symdivr"   r#   rY         z:_compile_rmsnorm_fwd.<locals>._compile.<locals>.<listcomp>c                    s   g | ]	}t | fqS r"   r  r  )r   r  r"   r#   rY     s    )r   r   Tuse_tvm_ffi_env_stream--enable-tvm-ffioptions)
re   sym_intr`   ra   r  r   compiler   runtimemake_fake_stream)	
all_dtypesx_cuteout_cuteres_cuteres_out_cuteweight_cute	bias_cute	rstd_cute	mean_cute
r   r   r   has_meanhas_rstdr   r   r   r   r   r  r  r#   _compile  s.   
z&_compile_rmsnorm_fwd.<locals>._compiler   )r   r   r   r   r   r   r   r)  r(  r   keyr+  r"   r'  r#   r     s   
r   r   residual_dtype
store_rstdc                 C   s   |d u r| j n|}tj| |d}|rtj| jd | jtjdnd }	|d ur(|j }|d us5|d urD|| j krDtj| |d ur>|n| j d}
nd }
t| ||||	d ||
|d
 |
d u rY| }
||
|	fS )Nr   r   devicer   F)r   r   
empty_likeemptyrg   r1  r   r   )r   r   r   r   r   r-  rD   r.  r   r   r   r"   r"   r#   r    s   "
r  c           	      C   s   |   }|d ur|  }||7 }|ttj| ddd|  }|d ur)|| n|}|d ur5||   }|d u r?|| jS || j||jfS )NTrR   keepdim)floatr   sqrtr   squarer   r   )	r   wr   r   rD   x_f32residual_f32x_normr   r"   r"   r#   rmsnorm_ref  s   "r>  c                 C   s   |   }||d }|dur|| }n|}|| jddd}|||  |d }	|durB|| jdd}
|	| j|
|jfS |	| jdfS )z3Reference implementation for RMSNorm backward pass.r   Nr4  Tr5  r   rR   )r7  	unsqueezer   sumr   r   )r   r:  doutr   rD   r;  r   wdyc1dxdwr"   r"   r#   rmsnorm_bwd_ref  s   
rG  c                       s  e Zd Zdejdef fddZdd Zdd Zd	d
 Z	e
jde
jdee
j de
jdee
j de
jde
jdee
j dee
j dee
j dedejfddZe
jde
jdee
j de
jdee
j de
jde
jdee
j dee
j dee
j de
jde
jdeje fddZ  ZS )RMSNormBackwardr   r   c                    sL   t  j||dtd |dkrd nd| _| jdkr"| jjdkr$tdd S d S )Nr   )r   reduction_dtyper   r   r6   r+   z?RMSNormBackward does not support N > 128k with dtype >= 32 bits)r   r   r   
reload_wdyr   r   r7   
ValueError)r   r   r   r    r"   r#   r     s
   zRMSNormBackward.__init__c                 C   s   | j dkrdS dS )N   r)   r,   r-   rZ   r"   r"   r#   _num_threads	  s   zRMSNormBackward._num_threadsc                 C   r$   )N)r%   r(   )r,   r+   )i   r&   )rL  r)   r,   r-   r.   r"   r"   r#   r1     r2   z RMSNormBackward._threads_per_rowc                 C   s2   | j }dD ]\}}||kr|| _ d S qd| _d S )N))r   r   )r   r   )r3   r5   )r4   r'   r*   )r   r8   )r   r   r/   r:   r"   r"   r#   r;     s   
zRMSNormBackward._set_cluster_nr<   r=   mdOmdResOrB   mdXmdWmdResmdBsm_countrE   c                 C   s   |j | jksJ |   ttdd ||||||fD  }t| jd| }| j|d\}}}|j	}t|d urBt
j|d|d dnd }|
}| ||||||||	||||j|| jdg|ddg| jdkrjd| jdgnd |d d S )	Nc                 s   rF   rG   rH   rJ   r"   r"   r#   rM   -  rN   z+RMSNormBackward.__call__.<locals>.<genexpr>r)   rO   r   rQ   r   r\   )rI   r   r;   r	   r_   r`   ra   r   rb   rS   rT   rU   rc   rd   r8   )r   r<   r=   rN  rO  rB   rP  rQ  rR  rS  rT  rE   rh   rP   ri   rX   rj   rk   
num_blocksr"   r"   r#   rl     s(   "

zRMSNormBackward.__call__rX   ri   rj   c           T   
      s  t j \}}}t j \ }}t j \}}}t| jdkr#tdnt j d |j}|j}|d |d }}t|d d | j k}t 	|}t
j }t jd d dfdd}|j|j|dd}|j|j|dd}| j||dd	\}}t|d ur||d }}nd
\}}||}fdd|||||	|fD \}} }!}"}#}$|d urt |dfnd }% fdd||fD \}&}'||}(||})|| }*||}+||"},t|d ur||!}-t|	d ur||#}.||$d }/dd |(|*|,fD \}0}1}2d }3t|d urt |-d }3d }4t|	d ur(t |.d }4|r-d ntj||$d |d d}5ttj|5d}6d
\}7}8d
\}9}:t|d ur\||&}7t |7t}8t|d urn||'}9t |9t}:t |t jj };| j|||;dd	 d }<t|d ur||%}=t |=}<t| r|<d |6|=|< |/d d d  f d d }>|>|k r|6|(d d d  f |)d dd |6|*d d d  f |+d dd n!td dkrtj|)d d |jj d tj|+d d |jj d t j!  t| jdkrt j"  t|d ur|8d t|d ur|:d t#d}?t#d}@t#d}At
$ t %|d |D ]}B|/d d d |Bf d d }>|>|d   |k r|6|(d d d |B| f |)d d d |?dA f dd |6|*d d d |B| f |+d d d |?dA f dd n-td dkrtj|)d d d |?dA f d |jj d tj|+d d d |?dA f d |jj d t j!  t
j&j }C|>|k sd dkr||> }Ct|d ur|>|k sوd dkr|6|-d d d |Bf |3 nd dkr|3d t j'd t (|)d d d |?f |0 |0) *t j}Dt (|+d d d |?f |1 |1) *t j}E|D|C }F|E}Gt|d ur5|G|<) *t9 }Gt| jdkrFt j+||? |@ t,|F|G t j-j.||d d |?f t| jdkr`||? nd |Add|d  }Ht| jdkrt j/  t j0  t j1 }I|I| jk rt jj2||? |Id t| j3dkrt (|+d d d |?f |1 |1) *t j}E|E}Gt|d ur|G|<) *t9 }G|G|F|H  |C }Jt|d ur|J|3) *t j7 }J|24|J*|2j |>|k sd dkr|6|2|,d d d |Bf  t|	d ur|44|J*|4j |>|k sd dkr|6|4|.d d d |Bf  t|d ur1|84|8) |E|F   t|d urA|:4|:) |E  |?dN }?|?dkrR|AdN }A|@dN }@q7td dkrSt|d urt 5t j6|j7t jdt jdd}K||K}Lt j8  |/d d d }>|>dkrt (|8|L t j8  |>dkrt
9dtd D ](}Mt |8}Nt 5|Lj7|M|Kj:d   |Lj;}Ot (|O|N |84|8) |N)   q|6|8|7 t j8  t|d urRt 5t j6|j7t jdt jdd}P||P}Qt j8  |/d d d }>|>dkrt (|:|Q t j8  |>dkrRt
9dtd D ](}Mt |:}Rt 5|Qj7|M|Pj:d   |Qj;}St (|S|R |:4|:) |R)   q$|6|:|9 nt|d ur_|6|8|7 t|d urk|6|:|9 t| jdkr|?dN }?|?dkr|@dN }@t j+||? |@ d S d S )Nr   r   r   )r   r   r   rn   r*   rp   T)is_persistentr   c                    s*   g | ]}|d urt |d  fnd qS rG   rr   rV   ry   r"   r#   rY   i  rw   z*RMSNormBackward.kernel.<locals>.<listcomp>c                    s6   g | ]}t |d urt|dd f fnd qS )Nr   rx   rV   
bidx_startrv   rX   r"   r#   rY   n  s    
)rz   NNNc                 S   s   g | ]	}t |d  qS )NNNr   r{   )rK   thrr"   r"   r#   rY     s    rY  r   r}   r~   r   r   )
fill_value)phaser   )peer_cta_rank_in_clusterr   r/  rm   )<re   r   r   r   grid_dimr	   r8   r   rg   r   r   r   r   r   r   rI   r   r   rs   r   r   r|   r   r   r   r   r   rS   r   r   fillfill_oobzeror   r   r   rangerf   Floatr   r   r   r   mbarrier_waitr   r   r   fence_view_async_shared	sync_warplane_idxmbarrier_arriverJ  r   make_tensor
recast_ptriteratorbarrierrange_constexprstridelayout)Tr   r<   r=   rN  rO  rB   rP  rQ  rS  rR  rX   ri   rj   r   r   gdimr   rg   Mr   r   r   r   smem_layoutr   sdOr   r   mbar_full_ptrmbar_empty_ptrr   r   gdOgdResOgdXgdResr   r   gdWgdBr   r   tXgdOtXsdOtXgdXtXgdResOtXgdResr   r   tXrdOtXrdXtXrdResOtXrdResr   r   tXgdWtXrdWtXgdBtXrdBr   r   r   r   r   producer_phaseconsumer_phaseru   r   r   rB  r   rC  mean_xhat_wdyrg  rE  sdWtXsdWitXrdW_othertXsdW_othersdBtXsdBtXrdB_othertXsdB_otherr"   rW  r#   rc   ?  s  $


















 



 

	





















zRMSNormBackward.kernel)r   r   r   r   r   r   r   rM  r1   r;   re   r   r
   r   r   r   r   rl   rc   r   r   r   r   r"   r"   r    r#   rH     sn    	
#	
rH  r   r1  c                 C   sv   | dkrdn| dkrdn| dkrdn| dkrdnd	}t j|j}| d
kr+|| }|S | dkr5|d }|S |d }|S )Nr,   r*   i   r'   i   r5   rL  r   r   r   r   )r   r   get_device_propertiesmulti_processor_count)r   r1  sm_count_multiplerT  r"   r"   r#   _get_sm_countZ  s   2r  zquack::_rmsnorm_bwd>   rE  	dresidual
db_partial
dw_partialz(Tensor x, Tensor? weight, Tensor dout, Tensor rstd, Tensor(a4!) dx, Tensor(a5!)? dw_partial, Tensor(a6!)? db_partial, Tensor? dresidual_out, Tensor(a8!)? dresidual, int? sm_count) -> ()rB  rE  r  r  dresidual_outr  rT  c
                 C   s  |   dks
J d| jsJ dtjtjtjh}
| j|
v s"J d|durN|  dks0J d| jd |jd	 ks>J d
|jsEJ d|j|
v sNJ d|durh|j| jksZJ |js_J |j|
v shJ d|dur|j| jkstJ |jsyJ |j|
v sJ d| d}|du r|du r|	dusJ n|dur|jd	 n|jd	 }	dd | |||||fD \}}}}}}t	||||||du|||du	| |||||||||	
 dS )a  RMSNorm backward pass.
    Args:
        x: Input tensor of shape (M, N)
        weight: Optional weight tensor of shape (N,)
        dout: Upstream gradients tensor of shape (M, N)
        rstd: Reciprocal standard deviation tensor of shape (M,)
    Returns:
        Tuple of (dx, dw) where:
        - dx: Input gradients tensor of same shape as x
        - dw: Weight gradients tensor of same shape as weight (or None if weight is None)
    r   Input must be 2Dz#Input tensor must be on CUDA devicer   Nr   Weight must be 1Dr4  r   z3Last dimension of input must match weight dimensionz$Weight tensor must be on CUDA devicer   r   c                 S   r   rG   r   rJ   r"   r"   r#   rY     r   z _rmsnorm_bwd.<locals>.<listcomp>)
rR   is_cudar   r   r   r   r   rg   rS   r  )r   r   rB  r   rE  r  r  r  r  rT  r   r   r   
dout_dtypedx_dtyper   
dres_dtypedres_out_dtyper"   r"   r#   _rmsnorm_bwdk  sN   



r  c
                 C   s   ddl m}
 |
rIt| dtjsK| d}|d u r$|d u r$|	d u r$d S dd | |||||fD \}}}}}}t||||||d u|||d u	 d S d S d S )Nr   r   r   c                 S   r   rG   r   rJ   r"   r"   r#   rY     r   z%_rmsnorm_bwd_fake.<locals>.<listcomp>)r  r   r  rS   r   r  r  )r   r   rB  r   rE  r  r  r  r  rT  r   r   r   r  r  r   r  r  r"   r"   r#   _rmsnorm_bwd_fake  s(   
r  c	              
      s>   d f
}	 f	dd}
t |	|
S )Nrmsnorm_bwdc                     s   t  t   } g}tjgdd |D R   fddfD \}}}}}t
f}tt f}	rJtt| fnd }	rVtt| fnd }
t jt|||||||	||
dt jj	dddd	S )
Nc                 s   r  r	  r
  r  r"   r"   r#   rM     r  z9_compile_rmsnorm_bwd.<locals>._compile.<locals>.<genexpr>c                    r  r"   r  r  r  r"   r#   rY     r  z:_compile_rmsnorm_bwd.<locals>._compile.<locals>.<listcomp>r   Tr  r  r  )
re   r  r`   ra   r  r   r  rH  r  r  )batch_partial_symr  r  	dout_cutedx_cutedres_out_cute	dres_cuter#  r%  dw_partial_cutedb_partial_cute	r   r  r  r  r   r  has_db_partialhas_dw_partialr   r*  r#   r+    s6   z&_compile_rmsnorm_bwd.<locals>._compiler   )r   r   r  r  r   r  r  r  r  r,  r+  r"   r  r#   r    s   
 r  has_biashas_residualc                 C   s   | j }| d}t| }	|d ur |j|	jkr tj| |jd}
nd }
t||}|d ur6tj|||tjd}nd }|rDtj|||tjdnd }t| ||||	||||
|
 |d ura|j	dd
|jnd }|ro|j	dd
|jnd }|ry|
d u ry|	}
|	|||
fS )Nr   r/  r0  r   r?  )r1  rS   r   r2  r   r  r3  r   r  rA  r   )r   r   rB  r   r  r  r  r1  r   rE  r  rT  r  r  rF  dbr"   r"   r#   r    s&   	


 r  c                   @   s2   e Zd Ze						dddZedd ZdS )	RMSNormFunctionNr   Fc	              
   C   s   |j }	|d|j d }|d ur|d|j d }t| jd d }
t||||||||
d\}}}| |d u r9|n||| |d u| _|| _|	| _|d urP|j	nd | _
|| _|d u s\|sa||	S ||	||	fS )Nr4     )r   r   r   r-  rD   r.  )rg   reshapeanyneeds_input_gradr  save_for_backwardr  rD   
x_shape_ogr   r-  prenorm)ctxr   r   r   r   r   r-  rD   r  r  	need_gradr   r   r   r"   r"   r#   forward:  s0   


zRMSNormFunction.forwardc              	   G   s   | j \}}}| j}| jr| jd ur|d }|d|jd }nd }| j}|d|jd }t||||||| jd ud\}	}
}}|	|}	|d urM||}|	|
||gd gd R S )Nr   r4  )r  r5   )	saved_tensorsr  r  r-  r  rg   r  viewr  )r  rB  argsr   r   r   r  r  r  rE  rF  r  r  r"   r"   r#   backwarda  s*   
	
zRMSNormFunction.backward)NNNNr   F)r   r   r   staticmethodr  r  r"   r"   r"   r#   r  9  s    &r  r  c              
   C   s   t | |||||||S )a  RMSNorm with automatic differentiation support.

    Args:
        x: Input tensor of shape (M, N)
        weight: Optional weight tensor of shape (N,)
        eps: Small value for numerical stability

    Returns:
        Normalized output tensor of same shape as x
    )r  apply)r   r   r   r   r   r-  rD   r  r"   r"   r#   rmsnorm}  s   r  c                       sD   e Zd ZdZ	ddededef fdd	Zd
edefddZ	  Z
S )QuackRMSNorma  RMSNorm module that behaves like torch.nn.RMSNorm.

    This class provides a drop-in replacement for torch.nn.RMSNorm that uses
    the quack.rmsnorm implementation under the hood.

    Args:
        dim (int): The dimension to normalize over
        eps (float, optional): A small constant for numerical stability. Default: 1e-6

    Attributes:
        weight (torch.nn.Parameter): The learnable weight parameter
        eps (float): A small constant for numerical stability
    r   TNrR   rD   elementwise_affinec                    s   t  j|||||d d S )Nr0  )r   r   )r   rR   rD   r  r1  r   r    r"   r#   r     s   zQuackRMSNorm.__init__r   r   c                 C   s   t || j| jdS )zApply RMSNorm to the input tensor.

        Args:
            x (Tensor): Input tensor

        Returns:
            Tensor: Normalized tensor
        )rD   )r  r   rD   )r   r   r"   r"   r#   r    s   	zQuackRMSNorm.forward)r   TNN)r   r   r   __doc__r   r7  r   r   r
   r  r   r"   r"   r    r#   r    s    r  return_rstdreturn_meanc                 C   s   |   dks
J d|  dksJ d| jtjtjtjfv s#J d|jtjks-J d|durE|  dks;J d|jtjksEJ d	| j\}}| j}t| }	|r]tj	||tjd
nd}
|rjtj	||tjd
nd}t
| ||	||
|dd|d
 |r|r|	|
|fS |r|	|
fS |r|	|fS |	S )au  LayerNorm forward pass using the unified RMSNorm/LayerNorm kernel.

    Args:
        x: Input tensor of shape (M, N)
        weight: Weight tensor of shape (N,). Must be float32.
        bias: Optional bias tensor of shape (N,). Must be float32.
        eps: Small value for numerical stability
        return_rstd: Whether to return the reciprocal standard deviation
        return_mean: Whether to return the mean

    Returns:
        Normalized output tensor of same shape as x
        If return_rstd is True, also returns rstd tensor of shape (M,)
        If return_mean is True, also returns mean tensor of shape (M,)
    r   r  r   r  r   zWeight must be float32NzBias must be 1DzBias must be float32r0  T)rR   r   r   r   r   r   rg   r1  r2  r3  r   )r   r   r   rD   r  r  rq  r   r1  r   r   r   r"   r"   r#   layernorm_fwd  s(   


r  r:  c                 C   s(   |   }tjj||j|d|| jS )z'Reference implementation for LayerNorm.N)r7  r   nn
functional
layer_normrg   r   r   )r   r:  rD   r;  r"   r"   r#   layernorm_ref  s    r  c                 C   s<   |   }|jddd}|| d jdd}dt||  S )Nr4  Tr5  r   r?  g      ?)r7  r   r   r8  )r   rD   r;  r   varr"   r"   r#   layernorm_rstd_ref  s   r  c                 C   s   |   jddS )Nr4  r?  )r7  r   )r   r"   r"   r#   layernorm_mean_ref  s   r  )NNNNNr   F)NNNr   )r   )NNNN)NFF)Nr   FF)Dr`   typingr   r   r   	functoolsr   r   cuda.bindings.driverbindingsdriverr   r   cutlass.cutere   r   r   r	   r   r
   quack.utilsr   quack.copy_utilsr   quack.layout_utilsrT   quack.compile_utilsr   r  quack.reducer   quack.reduction_baser   r  r   quack.cute_dsl_utilsr   r   library	custom_opr7  r   r   register_faker  r   r   r  r>  rG  rH  r   r1  r  r  r  r  r  autogradFunctionr  r  r  r  r  r  r  r  r"   r"   r"   r#   <module>   s    		
0	
.
9	


  \	
A	
$
@
%F	
#
0