o
    پi                     @   s  d dl Z d dlmZmZmZ d dlmZ d dlm  m	Z
 d dlZd dlmZ d dlmZmZmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	l m!Z! G d
d deZ"ej#j$ddddd							dQdedee dedee dee dee dee dee de%de&ddfddZ'i e'_(							dQdedee dee dee d eej) d!eej) de%d"e&deeeee f fd#d$Z*dRd%d&Z+dSd'd(Z,G d)d* d*eZ-d+e.d,ej/de.fd-d.Z0ej#j$d/h d0dd1d				dTdedee d2eded3ed4ee d5ee d6ee d7ee d8ee. ddfd9d:Z1i e1_(			dUdedee d2eded6ee d;e&d<e&deeee ee ee f fd=d>Z2G d?d@ d@ej3j4Z5							dQdedee dee dee d eej) d!eej) de%dAe&defdBdCZ6G dDdE dEej7j"Z8				dVdededee de%dFe&dGe&fdHdIZ9dSdedJede%defdKdLZ:dSdejde%fdMdNZ;dejdejfdOdPZ<dS )W    N)OptionalTupleType)partial)Float32Int32
const_expr)Tensor)make_fake_tensor)
row_reduce)ReductionBase)torch2cute_dtype_mapc                       s  e Zd Zddeej dedef fddZdd Z	d	d
 Z
ejdejdeej deej deej dejdeej deej deej dedejfddZejdejdeej deej deej dejdeej deej deej dedejdejdeje fddZ  ZS )RMSNormFdtypeNis_layernormc                    sD   t  j|||r	dndd || _||rdndkrd nd| _d| _d S )N      )stage @      smemF)super__init__r   reload_fromdelay_w_load)selfr   r   r   	__class__ A/home/ubuntu/.local/lib/python3.10/site-packages/quack/rmsnorm.pyr      s   
zRMSNorm.__init__c                 C   (   | j }dD ]\}}||kr|  S qdS )N)@            )i       )i   r#   )r   r&      r   r   r   limitthreadsr   r   r    _threads_per_row       zRMSNorm._threads_per_rowc                 C   sT   | j }t| jjdkrg d}ng d}|D ]\}}||kr$|| _ d S qd| _d S )Nr'   ))r   r   )   r   )      )   r$   ))r0   r   )r1   r   )r3   r2   )i   r$   )r   r   r   width	cluster_n)r   r   
thresholdsr,   clusterr   r   r    _set_cluster_n'   s   

zRMSNorm._set_cluster_nmXmWmBmResmOmResOmRstdmMeanepsstreamc                    s  |j  jksJ    ttdd ||||||fD  }t jd| } j|d\}}|j	}fdd||fD \}} fdd||fD \}} 
|||||||||	||jt|jd d  jd	g|d	d	gt jd	kr{d	 jd	gnd |
d
 d S )Nc                 s        | ]}|d ur|j jV  qd S Nelement_typer4   .0tr   r   r    	<genexpr>F       z#RMSNorm.__call__.<locals>.<genexpr>r&   vecsizec                    s0   g | ]}t |d urtj|d d dnd qS )Nr   dimsize)r   layout_utilsexpandrH   mT)tiler_mnr   r    
<listcomp>K   s    "z$RMSNorm.__call__.<locals>.<listcomp>c                    s.   g | ]}t |d urtj|d jdnd qS )Nr   rN   )r   rQ   rR   r   rS   r   r   r    rV   O        r   r   gridblockr7   rB   )rF   r   r8   r   maxmathgcdr   _get_tiled_copyrP   kernellaunchcuteceil_divshaper5   )r   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   largest_dtype_widthrM   
tiled_copythreads_per_rownum_threadsr   )r   rU   r    __call__5   s.   




zRMSNorm.__call__rU   rf   rg   c           ?   
      s  t j \}}}t j \ }}t| jdkrtdnt j d |j}tj	 }|j
|jt jdddd}t|d urL|j
|jt jdddd}| ||\}}|j}t |} fdd|||||||fD \}}}}}}}fd	d||fD \}}||} t|d ur| |nd }!t|d ur| |nd }"| |}#| |}$t|d ur| |}%| |}&| |}'t|d ur| |}(t|d ur| |nd })t|d ur| |nd }*| |d
 }+t|d urt |!nd },t|d urt |"nd }-dd |#|'fD \}.}/t|d urt |%}0t |t jj }1| |||1 t|d d | j k}2|2sKtj| ||d dnd }3ttj|3d}4|+d d }5|5|d k rv|4|#|$dd t|d urv|4|%|&dd t j  t| j rt|d ur|4|!|, t|d ur|4|"|- t jd t |$|. |.  t j!}6t|d urt |&|0 |6|0  t j!7 }6t|d urt |(}7|7"|6 |7j |5|d k r|4|7|( d\}8}9t| j#rt$|6t j%j&||d t| jdkr|d nd dt| jdkrt jj'nd d}:|:|d  }8t|d urF|+d d dkrF|5|d k rF| jdksBt j( dkrF|8|*d< t| j)dkrtt |$|. |.  t j!}6t|d urst |&|0 |6|0  t j!7 }6n+t| j)dkr|4|#|. |.  t j!}6t|d ur|4|%|0 |6|0  t j!7 }6t$|6|8 |6|8  t j%j&||d t| jdkr|d nd dd};t j*j+|;|d  |	 dd}9n/td}8t$|6|6 t j%j&||d |dt| jdkrt jj'nd d}<t j*j+|<|d  |	 dd}9t|d ur&|+d d dkr&|5|d k r&| jdks"t j( dkr&|9|)d< t| jrDt|d ur8|4|!|, t|d urD|4|"|- t| j)dkpO| j)dkrt| j)dkrnt |$|. t|d urmt |&|0 n|4|#|. t|d ur|4|%|0 |.  t j!}6t|d ur|6|0  t j!7 }6t| j#r|6|8 |9 n|6|9 }=|=}>t|d ur|>|,  t j!9 }>t|d ur|>|-  t j!7 }>|/"|> |/j |5|d k r|4|/|' d S d S )Nr   r   r   r   orderr'   byte_alignmentc                    s*   g | ]}|d urt | fnd qS rD   rb   
local_tilerS   bidx	cluster_yrU   r   r    rV          z"RMSNorm.kernel.<locals>.<listcomp>c                    s.   g | ]}t |d urt|d fnd qS )Nr   r   rb   rp   rS   rs   rU   r   r    rV      rX   )r   NNNc                 S   s   g | ]}t |qS r   rb   make_fragment_likerG   r   r   r    rV      s    r,   predTis_asyncNNNNr           )init_valhook_fnr   gmem)NNr   )r   )fastmath),rb   arch
thread_idx	block_idxr   r5   layout_tv_tiledcutlassutilsSmemAllocatorallocate_tensorrF   make_ordered_layout#_allocate_reduction_buffer_and_mbarrd   make_identity_tensor	get_slicepartition_Spartition_Dry   rP   	WARP_SIZE_initialize_cluster
copy_utilspredicate_kr   copycp_async_commit_groupr   cp_async_wait_groupautovec_copyloadtor   storer   r   ReductionOpADDcluster_waitblock_idx_in_clusterr   r]   rsqrt)?r   r9   r:   r;   r<   r=   r>   r?   r@   rA   rU   rf   rg   tidx_	tv_layoutr   sXsResreduction_buffermbar_ptrrd   idXgXgResgOgResOgRstdgMeancXgWgB
thr_copy_XtXgWtXgBtXgXtXsXtXgRestXsRestXgOtXgResOtXrRstdtXrMeantXcXtXrWtXrBtXrXtXrOtXrRes	num_warps	is_even_NtXpXr   rowxtXrResOmeanrstdsum_xsum_sq_x_sub_meansum_sq_xx_hatyr   rq   r    r`   \   s4  $















	

	



 zRMSNorm.kernel)F)__name__
__module____qualname__r   r   Numericintboolr   r.   r8   rb   jitr	   r   r   cudaCUstreamri   r`   Shape	TiledCopy	Constexpr__classcell__r   r   r   r    r      sh    "	
&	
r   zquack::_rmsnorm_fwd)outr   r   residual_outr   z(Tensor x, Tensor? weight, Tensor(a2!) out, Tensor? bias, Tensor(a4!)? rstd, Tensor(a5!)? mean, Tensor? residual, Tensor(a7!)? residual_out, float eps=1e-6, bool is_layernorm=False) -> ())mutates_argsdevice_typesschemaư>Fr   weightr   biasr   r   residualr   rA   r   returnc
                    s  t jt jt jh}
| j|
v sJ d|dur|j|
v sJ d|dur+|j|
v s+J d| j\} dd | |||||fD \}}}}}}|||||| |du|du|	f
}|tjvrt	 ||||||g}t
j gdd |D R   fd	d||||fD \}}}} fd
d||fD \}}|durttfnd}|durttfnd}tjt| |	d||||||||tdtjjddddtj|< tj| | ||||||||	 dS )aA  RMSNorm/LayerNorm forward pass.
    Args:
        x: Input tensor of shape (M, N)
        weight: Optional weight tensor of shape (N,)
        eps: Small value for numerical stability
        is_layernorm: If True, compute LayerNorm instead of RMSNorm
    Returns:
        Normalized output tensor of same shape as x
    Unsupported dtypeN+Weight must be float32, float16 or bfloat16.Residual must be float16, bfloat16, or float32c                 S   "   g | ]}|d urt |j nd qS rD   r   r   rG   r   r   r    rV   E      z _rmsnorm_fwd.<locals>.<listcomp>c                 s   "    | ]}|d urd|j  V  qd S Nr&   r4   rH   dtr   r   r    rJ   X       z_rmsnorm_fwd.<locals>.<genexpr>c                       g | ]
}t | fqS r   fake_tensorr   r   	batch_symdivr   r    rV   Y      c                    s   g | ]	}t | fqS r   r   r   )r   r   r   r    rV   ]  s    )r   r   Tuse_tvm_ffi_env_stream--enable-tvm-ffioptions)torchfloat16bfloat16float32r   rd   _rmsnorm_fwdcompile_cacherb   sym_intr]   r^   r   r   compiler   runtimemake_fake_stream)r   r   r   r   r   r   r   r   rA   r   supported_typesr   r   	out_dtypeweight_dtype
bias_dtype	res_dtyperes_out_dtypecompile_key
all_dtypesx_cuteout_cuteres_cuteres_out_cuteweight_cute	bias_cute	rstd_cute	mean_cuter   r   r    r     s`   


r  r  residual_dtype
store_rstdc                 C   s   |d u r| j n|}tj| |d}|rtj| jd | jtjdnd }	|d ur(|j }|d us5|d urD|| j krDtj| |d ur>|n| j d}
nd }
t| ||||	d ||
|d
 |
d u rY| }
||
|	fS )Nr   r   devicer   F)r   r  
empty_likeemptyrd   r   r  r  )r   r   r   r   r  r  rA   r  r   r   r   r   r   r    rmsnorm_fwdv  s   "
r#  c           	      C   s   |   }|d ur|  }||7 }|ttj| ddd|  }|d ur)|| n|}|d ur5||   }|d u r?|| jS || j||jfS )NTrO   keepdim)floatr  sqrtr   squarer   r   )	r   wr   r   rA   x_f32residual_f32x_normr   r   r   r    rmsnorm_ref  s   "r.  c                 C   s   |   }||d }|dur|| }n|}|| jddd}|||  |d }	|durB|| jdd}
|	| j|
|jfS |	| jdfS )z3Reference implementation for RMSNorm backward pass.r   Nr$  Tr%  r   rO   )r'  	unsqueezer   sumr   r   )r   r*  doutr   rA   r+  r   wdyc1dxdwr   r   r    rmsnorm_bwd_ref  s   
r7  c                       s  e Zd Zdejdef fddZdd Zdd Zd	d
 Z	e
jde
jdee
j de
jdee
j de
jde
jdee
j dee
j dee
j dedejfddZe
jde
jdee
j de
jdee
j de
jde
jdee
j dee
j dee
j de
jde
jdeje fddZ  ZS )RMSNormBackwardr   r   c                    sL   t  j||dtd |dkrd nd| _| jdkr"| jjdkr$tdd S d S )Nr   )r   reduction_dtyper   r   r3   r(   z?RMSNormBackward does not support N > 128k with dtype >= 32 bits)r   r   r   
reload_wdyr   r   r4   
ValueError)r   r   r   r   r   r    r     s
   zRMSNormBackward.__init__c                 C   s   | j dkrdS dS )N   r&   r)   r*   rW   r   r   r    _num_threads  s   zRMSNormBackward._num_threadsc                 C   r!   )N)r"   r%   )r)   r(   )i   r#   )r<  r&   r)   r*   r+   r   r   r    r.     r/   z RMSNormBackward._threads_per_rowc                 C   s2   | j }dD ]\}}||kr|| _ d S qd| _d S )N))r   r   )r   r   )r0   r2   )r1   r$   r'   )r   r5   )r   r   r,   r7   r   r   r    r8     s   
zRMSNormBackward._set_cluster_nr9   r:   mdOmdResOr?   mdXmdWmdResmdBsm_countrB   c                 C   s   |j | jksJ |   ttdd ||||||fD  }t| jd| }| j|d\}}}|j	}t|d urBt
j|d|d dnd }|
}| ||||||||	||||j|| jdg|ddg| jdkrjd| jdgnd |d d S )	Nc                 s   rC   rD   rE   rG   r   r   r    rJ     rK   z+RMSNormBackward.__call__.<locals>.<genexpr>r&   rL   r   rN   r   rY   )rF   r   r8   r   r\   r]   r^   r   r_   rP   rQ   rR   r`   ra   r5   )r   r9   r:   r>  r?  r?   r@  rA  rB  rC  rD  rB   re   rM   rf   rU   rg   rh   
num_blocksr   r   r    ri     s(   "

zRMSNormBackward.__call__rU   rf   rg   c           T   
      s,  t j \}}}t j \ }}t j \}}}t| jdkr#tdnt j d |j}|j}|d |d }}t|d d | j k}t 	|}t
j }t jd d dfdd}|j|j|dd}|j|j|dd}| j||dd	\}}t|d ur||d }}nd
\}}||}fdd|||||	|fD \}} }!}"}#}$|d urt |dfnd }% fdd||fD \}&}'||}(||})|| }*||}+||"},t|d ur||!}-t|	d ur||#}.||$d }/dd |(|*|,fD \}0}1}2d }3t|d urt |-d }3d }4t|	d ur(t |.d }4|r-d ntj||$d |d d}5ttj|5d}6d
\}7}8d
\}9}:t|d ur\||&}7t |7t}8t|d urn||'}9t |9t}:t |t jj };| j|||;dd	 d }<t|d ur||%}=t |=}<t| r|<d |6|=|< |/d d d  f d d }>|>|k r|6|(d d d  f |)d dd |6|*d d d  f |+d dd n!td dkrtj|)d d |jj d tj|+d d |jj d t j!  t| jdkrt j"  t|d ur|8d t|d ur|:d t#d}?t#d}@t#d}At
$ t %|d |D ]$}B|/d d d |Bf d d }>|>|d   |k r|6|(d d d |B| f |)d d d |?dA f dd |6|*d d d |B| f |+d d d |?dA f dd n-td dkrtj|)d d d |?dA f d |jj d tj|+d d d |?dA f d |jj d t j!  t
j&j }C|>|k sd dkr||> }Ct|d ur|>|k sوd dkr|6|-d d d |Bf |3 nd dkr|3d t j'd t (|)d d d |?f |0 |0) *t j}Dt (|+d d d |?f |1 |1) *t j}E|D|C }F|E}Gt|d ur5|G|<) *t9 }Gt| jdkrFt j+||? |@ t,|F|G t j-j.||d d |?f t| jdkr`||? nd |Add|d  }Ht| jdkrt jj/t jj0j1t jj2j3d t j4  t j5 }I|I| jk rt jj6||? |Id t| j7dkrt (|+d d d |?f |1 |1) *t j}E|E}Gt|d ur|G|<) *t9 }G|G|F|H  |C }Jt|d ur|J|3) *t j7 }J|28|J*|2j |>|k sd dkr|6|2|,d d d |Bf  t|	d ur(|48|J*|4j |>|k sd dkr(|6|4|.d d d |Bf  t|d ur:|88|8) |E|F   t|d urJ|:8|:) |E  |?dN }?|?dkr[|AdN }A|@dN }@q7td dkr\t|d urt 9t j:|j;t jdt jdd}K||K}Lt j<  |/d d d }>|>dkrt (|8|L t j<  |>dkrt
=dtd D ](}Mt |8}Nt 9|Lj;|M|Kj>d   |Lj?}Ot (|O|N |88|8) |N)   q|6|8|7 t j<  t|d ur[t 9t j:|j;t jdt jdd}P||P}Qt j<  |/d d d }>|>dkrt (|:|Q t j<  |>dkr[t
=dtd D ](}Mt |:}Rt 9|Qj;|M|Pj>d   |Qj?}St (|S|R |:8|:) |R)   q-|6|:|9 nt|d urh|6|8|7 t|d urt|6|:|9 t| jdkr|?dN }?|?dkr|@dN }@t j+||? |@ d S d S )Nr   r   r   )r   r   r   rk   r'   rm   T)is_persistentr   c                    s*   g | ]}|d urt |d  fnd qS rD   ro   rS   rv   r   r    rV      rt   z*RMSNormBackward.kernel.<locals>.<listcomp>c                    s6   g | ]}t |d urt|dd f fnd qS )Nr   ru   rS   
bidx_startrs   rU   r   r    rV   %  s    
)rw   NNNc                 S   s   g | ]	}t |d  qS )NNNr   rx   )rH   thrr   r   r    rV   7  s    rI  r   rz   r{   r   r}   )
fill_value)phaser   )space)peer_cta_rank_in_clusterr   r  rj   )@rb   r   r   r   grid_dimr   r5   r   rd   r   r   r   r   r   r   rF   r   r   rp   r   r   ry   r   r   r   r   r   rP   r   r   fillfill_oobzeror   r   r   rangerc   Floatr   r   r   r   mbarrier_waitr   r   r   fence_proxy	ProxyKindasync_sharedSharedSpace
shared_cta	sync_warplane_idxmbarrier_arriver:  r   make_tensor
recast_ptriteratorbarrierrange_constexprstridelayout)Tr   r9   r:   r>  r?  r?   r@  rA  rC  rB  rU   rf   rg   r   r   gdimr   rd   Mr   r   r   r   smem_layoutr   sdOr   r   mbar_full_ptrmbar_empty_ptrr   r   gdOgdResOgdXgdResr   r   gdWgdBr   r   tXgdOtXsdOtXgdXtXgdResOtXgdResr   r   tXrdOtXrdXtXrdResOtXrdResr   r   tXgdWtXrdWtXgdBtXrdBr   r   r   r   r   producer_phaseconsumer_phaserr   r   r   r2  r   r3  mean_xhat_wdyr\  r5  sdWtXsdWitXrdW_othertXsdW_othersdBtXsdBtXrdB_othertXsdB_otherr   rG  r    r`     s  $


















 



 

	




















zRMSNormBackward.kernel)r   r   r   r   r   r   r   r=  r.   r8   rb   r   r	   r   r   r   r   ri   r`   r   r   r   r   r   r   r   r    r8    sn    	
#	
r8  r   r   c                 C   sv   | dkrdn| dkrdn| dkrdn| dkrdnd	}t j|j}| d
kr+|| }|S | dkr5|d }|S |d }|S )Nr)   r'   i   r$   i   r2   r<  r   r   r   r   )r  r   get_device_propertiesmulti_processor_count)r   r   sm_count_multiplerD  r   r   r    _get_sm_count  s   2r  zquack::_rmsnorm_bwd>   r5  	dresidual
db_partial
dw_partialz(Tensor x, Tensor? weight, Tensor dout, Tensor rstd, Tensor(a4!) dx, Tensor(a5!)? dw_partial, Tensor(a6!)? db_partial, Tensor? dresidual_out, Tensor(a8!)? dresidual, int? sm_count) -> ()r2  r5  r  r  dresidual_outr  rD  c
                    s  |   dks
J d| jsJ dtjtjtjh}
| j|
v s"J d|durN|  dks0J d| jd |jd	 ks>J d
|jsEJ d|j|
v sNJ d|durh|j| jksZJ |js_J |j|
v shJ d|dur|j| jkstJ |jsyJ |j|
v sJ d| d |du r|du r|	dusJ n|dur|jd	 n|jd	 }	dd | |||||fD \}}}}}} |||||du||f}|t	j
vrDt t }|||||g}tj gdd |D R   fdd|||||fD \}}}}}t| f}ttf}|durtt| fnd}|dur%tt| fnd}tjt| ||||||||||	tjjddddt	j
|< t	j
| | |||||||||	
 dS )a  RMSNorm backward pass.
    Args:
        x: Input tensor of shape (M, N)
        weight: Optional weight tensor of shape (N,)
        dout: Upstream gradients tensor of shape (M, N)
        rstd: Reciprocal standard deviation tensor of shape (M,)
    Returns:
        Tuple of (dx, dw) where:
        - dx: Input gradients tensor of same shape as x
        - dw: Weight gradients tensor of same shape as weight (or None if weight is None)
    r   Input must be 2Dz#Input tensor must be on CUDA devicer   Nr   Weight must be 1Dr$  r   z3Last dimension of input must match weight dimensionz$Weight tensor must be on CUDA devicer   r   c                 S   r   rD   r   rG   r   r   r    rV   [  r   z _rmsnorm_bwd.<locals>.<listcomp>c                 s   r   r   r   r   r   r   r    rJ   l  r   z_rmsnorm_bwd.<locals>.<genexpr>c                    r   r   r   r   r   r   r    rV   m  r   Tr   r   r   )rO   is_cudar  r  r  r  r   rd   rP   _rmsnorm_bwdr  rb   r  r]   r^   r   r   r	  r8  r
  r  )r   r   r2  r   r5  r  r  r  r  rD  r  r   
dout_dtypedx_dtyper  
dres_dtypedres_out_dtyper  batch_partial_symr  r  	dout_cutedx_cutedres_out_cute	dres_cuter  r  dw_partial_cutedb_partial_cuter   r   r    r  $  s   



r  has_biashas_residualc                 C   s   | j }| d}t| }	|d ur |j|	jkr tj| |jd}
nd }
t||}|d ur6tj|||tjd}nd }|rDtj|||tjdnd }t| ||||	||||
|
 |d ura|j	dd
|jnd }|ro|j	dd
|jnd }|ry|
d u ry|	}
|	|||
fS )Nr   r  r  r   r/  )r   rP   r  r!  r   r  r"  r  r  r1  r   )r   r   r2  r   r  r  r  r   r   r5  r  rD  r  r  r6  dbr   r   r    rmsnorm_bwd  s&   	


 r  c                   @   s2   e Zd Ze						dddZedd ZdS )	RMSNormFunctionNr   Fc	              
   C   s   |j }	|d|j d }|d ur|d|j d }t| jd d }
t||||||||
d\}}}| |d u r9|n||| |d u| _|| _|	| _|d urP|j	nd | _
|| _|d u s\|sa||	S ||	||	fS )Nr$     )r   r   r  r  rA   r  )rd   reshapeanyneeds_input_gradr#  save_for_backwardr  rA   
x_shape_ogr   r  prenorm)ctxr   r   r   r   r  r  rA   r  r  	need_gradr   r   r   r   r   r    forward  s0   


zRMSNormFunction.forwardc              	   G   s   | j \}}}| j}| jr| jd ur|d }|d|jd }nd }| j}|d|jd }t||||||| jd ud\}	}
}}|	|}	|d urM||}|	|
||gd gd R S )Nr   r$  )r  r2   )	saved_tensorsr  r  r  r  rd   r  viewr  )r  r2  argsr   r   r   r  r  r  r5  r6  r  r  r   r   r    backward  s*   
	
zRMSNormFunction.backward)NNNNr   F)r   r   r   staticmethodr  r  r   r   r   r    r    s    &r  r  c              
   C   s   t | |||||||S )a  RMSNorm with automatic differentiation support.

    Args:
        x: Input tensor of shape (M, N)
        weight: Optional weight tensor of shape (N,)
        eps: Small value for numerical stability

    Returns:
        Normalized output tensor of same shape as x
    )r  apply)r   r   r   r   r  r  rA   r  r   r   r    rmsnorm  s   r  c                       sD   e Zd ZdZ	ddededef fdd	Zd
edefddZ	  Z
S )QuackRMSNorma  RMSNorm module that behaves like torch.nn.RMSNorm.

    This class provides a drop-in replacement for torch.nn.RMSNorm that uses
    the quack.rmsnorm implementation under the hood.

    Args:
        dim (int): The dimension to normalize over
        eps (float, optional): A small constant for numerical stability. Default: 1e-6

    Attributes:
        weight (torch.nn.Parameter): The learnable weight parameter
        eps (float): A small constant for numerical stability
    r   TNrO   rA   elementwise_affinec                    s   t  j|||||d d S )Nr  )r   r   )r   rO   rA   r  r   r   r   r   r    r     s   zQuackRMSNorm.__init__r   r   c                 C   s   t || j| jdS )zApply RMSNorm to the input tensor.

        Args:
            x (Tensor): Input tensor

        Returns:
            Tensor: Normalized tensor
        )rA   )r  r   rA   )r   r   r   r   r    r  $  s   	zQuackRMSNorm.forward)r   TNN)r   r   r   __doc__r   r'  r   r   r	   r  r   r   r   r   r    r    s    r  return_rstdreturn_meanc                 C   s   |   dks
J d|  dksJ d| jtjtjtjfv s#J d|jtjks-J d|durE|  dks;J d|jtjksEJ d	| j\}}| j}t| }	|r]tj	||tjd
nd}
|rjtj	||tjd
nd}t
| ||	||
|dd|d
 |r|r|	|
|fS |r|	|
fS |r|	|fS |	S )au  LayerNorm forward pass using the unified RMSNorm/LayerNorm kernel.

    Args:
        x: Input tensor of shape (M, N)
        weight: Weight tensor of shape (N,). Must be float32.
        bias: Optional bias tensor of shape (N,). Must be float32.
        eps: Small value for numerical stability
        return_rstd: Whether to return the reciprocal standard deviation
        return_mean: Whether to return the mean

    Returns:
        Normalized output tensor of same shape as x
        If return_rstd is True, also returns rstd tensor of shape (M,)
        If return_mean is True, also returns mean tensor of shape (M,)
    r   r  r   r  r   zWeight must be float32NzBias must be 1DzBias must be float32r  T)rO   r   r  r  r  r  rd   r   r!  r"  r  )r   r   r   rA   r  r  rf  r   r   r   r   r   r   r   r    layernorm_fwd0  s(   


r  r*  c                 C   s(   |   }tjj||j|d|| jS )z'Reference implementation for LayerNorm.N)r'  r  nn
functional
layer_normrd   r   r   )r   r*  rA   r+  r   r   r    layernorm_ref`  s    r  c                 C   s<   |   }|jddd}|| d jdd}dt||  S )Nr$  Tr%  r   r/  g      ?)r'  r   r  r(  )r   rA   r+  r   varr   r   r    layernorm_rstd_reff  s   r  c                 C   s   |   jddS )Nr$  r/  )r'  r   )r   r   r   r    layernorm_mean_refm  s   r  )NNNNNr   F)NNNr   )r   )NNNN)NFF)Nr   FF)=r]   typingr   r   r   	functoolsr   cuda.bindings.driverbindingsdriverr   r   cutlass.cuterb   r   r   r   r  r	   quack.utilsr   quack.copy_utilsr   quack.layout_utilsrQ   quack.compile_utilsr
   r   quack.reducer   quack.reduction_baser   quack.cute_dsl_utilsr   r   library	custom_opr'  r   r  r  r   r#  r.  r7  r8  r   r   r  r  r  autogradFunctionr  r  r  r  r  r  r  r  r   r   r   r    <module>   s|    		
L	


  ^	
b
%F	
#
0