o
    پiJ                  	   @   s  d Z ddlZddlmZmZmZ ddlZddlmZ ddl	m  m
Z ddlmZmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZmZmZ e Ze Ze Z e Z!ed	obe Z"e Z#e Z$e Z%d
Z&esre%rerz
ddl'm(Z( dZ&W n e)e*fy   d
Z&Y nw d
Z&ddl+m,Z,m-Z-m.Z.m/Z/ d
Z0e"rddl1m2Z3 ddl1m4Z5 dZ0ne rzddl6m5Z5m3Z3 dZ0W n e)y   d
Z0Y nw e7e8Z9e!rddl:Z:G dd deZ;G dd deZ<G dd deZ=G dd deZ>dS )z)Fused operators for normalization layers.    N)OptionalTupleUnion)is_batch_invariant_mode_enabledrms_norm_batch_invariant)envs)MultiPlatformOp)get_global_server_args)cpu_has_amx_supportget_bool_env_varis_cpuis_cudais_flashinfer_availableis_hipis_npuis_xpuSGLANG_USE_AITERF)	layernormT)fused_add_rmsnormgemma_fused_add_rmsnormgemma_rmsnormrmsnorm)rmsnorm2d_fwd)rmsnorm2d_fwd_with_add)fused_add_rms_normrms_normc                       sv  e Zd Z							d#dededee ded	ed
edededdf fddZ		d$dej	deej	 deej	 de
ej	eej	ej	f f fddZ		d$dej	deej	 deej	 de
ej	eej	ej	f f fddZ		d$dej	deej	 deej	 de
ej	eej	ej	f f fddZ		d$dej	deej	 deej	 de
ej	eej	ej	f f fddZ		d$dej	deej	 deej	 de
ej	eej	ej	f f fddZ		d$dej	deej	 deej	 de
ej	eej	ej	f f fddZ		d$dej	deej	 deej	 de
ej	eej	ej	f f fdd Z		d$dej	deej	 deej	 de
ej	eej	ej	f f fd!d"Z  ZS )%RMSNormư>NFThidden_sizeepsvar_hidden_sizecast_x_before_out_mulfp32_residual
has_weightweight_dtypeoverride_orig_dtypereturnc	           	         s   t    || _|| _|| _|| _| jr ttj	||d| _
ntj	||d| _
|| _|| _||kr4d n|| _tr?| j| _d S d S N)dtype)super__init__r#   r!   r"   r%   nn	Parametertorchonesweightvariance_epsilonr   variance_size_override
_use_aiterforward_aiter_forward_method)	selfr   r   r    r!   r"   r#   r$   r%   	__class__ O/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/layernorm.pyr*   Z   s   
zRMSNorm.__init__xresidualpost_residual_additionc                 C   s   |  dkr|S | jd ur| |||S t r1|d us!t jdkr(| |||S t|| jj| j	S |d urK|d ur=|| }t
||| jj| j	 ||fS t|| jj| j	}|S )Nr   fsdp)numelr1   forward_nativer   r	   rl_on_policy_targetr   r/   datar0   r   r   r5   r:   r;   r<   outr8   r8   r9   forward_cudav   s(   
zRMSNorm.forward_cudac                 C   sT   |d ur|d ur|| }t ||| jj| j\}}}||fS t || jj| jd S )Nr   )	torch_npunpu_add_rms_normr/   rA   r0   npu_rms_norm)r5   r:   r;   r<   rC   _residual_outr8   r8   r9   forward_npu   s   
zRMSNorm.forward_npuc                 C   s^   |d ur&t |}t |}|d ur|| }t||||| jj| j ||fS t|| jj| jS N)r-   
empty_liker   r/   rA   r0   r   )r5   r:   r;   r<   rI   outputr8   r8   r9   r3      s   

zRMSNorm.forward_aiterc                 C   s   t s	| |||S | s| }|d ur7t|}t|}|d ur'|| }t||||| jj| j	 ||fS t|}t
||| jj| j	 |S rK   )_has_vllm_rms_normr?   is_contiguous
contiguousr-   rL   r   r/   rA   r0   r   )r5   r:   r;   r<   rC   rI   r8   r8   r9   forward_hip   s    


zRMSNorm.forward_hipc                 C   sB  |  s| }| jp|j}|tj}|d ur9||tj }|d ur,||tj }| jr4| }n||}|j	d }|| j
krNtd| j
 d| | jd u rV|}n|| jk rftd| j d| |dd | jf }|djddd}|t|| j  }| jr| j|| }n|| j |}|d u r|S ||fS )	NzExpected hidden_size to be z, but found: z$Expected hidden_size to be at least .   Tdimkeepdim)rO   rP   r%   r(   tor-   float32r"   cloneshaper   
ValueErrorr1   powmeanrsqrtr0   r!   r/   )r5   r:   r;   r<   
orig_dtyper   x_varvariancer8   r8   r9   r?      sN   





zRMSNorm.forward_nativec                 C   d   t r+|d ur|d ur|| }tjj||| jj| j ||fS tjj|| jj| jS | 	|||S rK   )
_is_cpu_amx_availabler-   ops
sgl_kernelfused_add_rmsnorm_cpur/   rA   r0   rmsnorm_cpur?   r5   r:   r;   r<   r8   r8   r9   forward_cpu  s   zRMSNorm.forward_cpuc                 C   sb   | j d ur| |||S |d ur&|d ur|| }t||| jj| j ||fS t|| jj| j}|S rK   )r1   r?   r   r/   rA   r0   r   rB   r8   r8   r9   forward_xpu  s   
zRMSNorm.forward_xpuc                 C   sl   |dur/ddl m} ddlm} | dkr/|dur|| }|||| j| jd}|d dur/|S | |||S )z`
        Forward method with allreduce fusion, prioritizing flashinfer fused operations
        Nr   )$get_tensor_model_parallel_world_size)%flashinfer_allreduce_residual_rmsnorm   )input_tensorr;   r/   r   )sglang.srt.distributedrk   (sglang.srt.layers.flashinfer_comm_fusionrl   r/   r0   forward)r5   r:   r;   r<   rk   rl   fused_resultr8   r8   r9   forward_with_allreduce_fusion,  s   	
z%RMSNorm.forward_with_allreduce_fusion)r   NFFTNNNN)__name__
__module____qualname__intfloatr   boolr*   r-   Tensorr   r   rD   rJ   r3   rQ   r?   ri   rj   rs   __classcell__r8   r8   r6   r9   r   Y   s    	

$



5

r   c                       s   e Zd Zdddejfdededededejdd	f fd
dZ	dej
dej
fddZdej
dej
fddZdej
dej
fddZdej
dej
fddZdej
dej
fddZ  ZS )	LayerNormr   Tr   r   elementwise_affinebiasr(   r&   Nc                    s\   t    || _|| _|| _|| _|| _tt	j
|| jd| _tt	j|| jd| _d S r'   )r)   r*   r   r0   r~   use_biasr(   r+   r,   r-   zerosr   r.   r/   )r5   r   r   r~   r   r(   r6   r8   r9   r*   K  s   
zLayerNorm.__init__r:   c                 C   s:   t r|jtjkr| jtjkrt|| j| j| jS | 	|S rK   )
_flashinfer_layernorm_availabler(   r-   bfloat16rX   r   r/   r   r0   r?   r5   r:   r8   r8   r9   rD   ]  s   
zLayerNorm.forward_cudac                 C   sR   | j r| jnd }| jr| jnd }|j}|| j}tj|| jf||| j	d|S )N)r/   r   r   )
r~   r/   r   r   r(   rW   F
layer_normr   r0   )r5   r:   r/   r   r_   r8   r8   r9   r?   j  s   zLayerNorm.forward_nativec                 C   
   |  |S rK   r?   r   r8   r8   r9   rQ   z     
zLayerNorm.forward_hipc                 C   r   rK   r   r   r8   r8   r9   rJ     r   zLayerNorm.forward_npuc                 C   s&   t rtjj|| jj| jS | |S rK   )	rc   r-   rd   re   layernorm_cpur/   rA   r0   r?   r   r8   r8   r9   ri     s
   
zLayerNorm.forward_cpu)ru   rv   rw   r-   rX   rx   ry   rz   r(   r*   r{   rD   r?   rQ   rJ   ri   r|   r8   r8   r6   r9   r}   J  sV    



r}   c                       s  e Zd Z	ddededdf fddZ		ddejd	eej d
eej de	eje
ejejf f fddZ		ddejd	eej d
eej de	eje
ejejf f fddZ		ddejd	eej d
eej de	eje
ejejf f fddZ		ddejd	eej d
eej de	eje
ejejf f fddZ		ddejd	eej d
eej de	eje
ejejf f fddZ		ddejd	eej d
eej de	eje
ejejf f fddZ  ZS )GemmaRMSNormr   r   r   r&   Nc                    s6   t    tt|| _|| _tr| j	| _
d S d S rK   )r)   r*   r+   r,   r-   r   r/   r0   _is_hipr?   r4   )r5   r   r   r6   r8   r9   r*     s   
zGemmaRMSNorm.__init__r:   r;   r<   c                 C   sJ   |d ur|d ur|| }t ||| jj| j ||fS t|| jj| j}|S rK   )r   r/   rA   r0   r   rB   r8   r8   r9   _forward_impl  s   zGemmaRMSNorm._forward_implc                 C   s   |j }|d ur|d ur|| }|| }|}| }|djddd}|t|| j  }|d| j   }||}|d u rA|S ||fS )NrS   rR   TrT         ?)	r(   ry   r\   r]   r-   r^   r0   r/   rW   )r5   r:   r;   r<   r_   ra   r8   r8   r9   r?     s   
zGemmaRMSNorm.forward_nativec                 C      |  |||S rK   r   rh   r8   r8   r9   rD        zGemmaRMSNorm.forward_cudac                 C   rb   rK   )
rc   r-   rd   re   gemma_fused_add_rmsnorm_cpur/   rA   r0   gemma_rmsnorm_cpur?   rh   r8   r8   r9   ri     s   zGemmaRMSNorm.forward_cpuc                 C   sd   t j r| ||S |d ur|d ur|| }|| }|}t|| j| j\}}|d u r.|S ||fS rK   )r   (SGLANG_NPU_FORWARD_NATIVE_GEMMA_RMS_NORMgetr?   rE   npu_gemma_rms_normr/   r0   )r5   r:   r;   r<   rH   r8   r8   r9   rJ     s   
zGemmaRMSNorm.forward_npuc                 C   r   rK   r   rh   r8   r8   r9   rj     r   zGemmaRMSNorm.forward_xpur   rt   )ru   rv   rw   rx   ry   r*   r-   r{   r   r   r   r   r?   rD   ri   rJ   rj   r|   r8   r8   r6   r9   r     s    




r   c                       sX   e Zd Zddedef fddZdd Zdd	 Zd
d Zdd Z	dd Z
dd Z  ZS )Gemma3RMSNormr   rU   r   c                    s&   t    || _tt|| _d S rK   )r)   r*   r   r+   r,   r-   r   r/   )r5   rU   r   r6   r8   r9   r*     s   
zGemma3RMSNorm.__init__c                 C   s$   |t |djddd| j  S )NrS   rR   T)rV   )r-   r^   r\   r]   r   r   r8   r8   r9   _norm   s   $zGemma3RMSNorm._normc                 C   s*   |  | }|d| j   }||S )Nr   )r   ry   r/   type_as)r5   r:   rM   r8   r8   r9   r?     s   
zGemma3RMSNorm.forward_nativec                 C   s2   t r|ddkrtjj|| j| jS | |S )NrR   rm   )	rc   strider-   rd   re   gemma3_rmsnorm_cpur/   r   r?   r   r8   r8   r9   ri   
  s   
zGemma3RMSNorm.forward_cpuc                 C   r   rK   r   r   r8   r8   r9   rD     s   
zGemma3RMSNorm.forward_cudac                 C   s   t || j| j\}}|S rK   )rE   r   r/   r   )r5   r:   rM   rH   r8   r8   r9   rJ     s   zGemma3RMSNorm.forward_npuc                 C   s   t | jj d| j S )Nz, eps=)tupler/   rZ   r   )r5   r8   r8   r9   
extra_repr  s   zGemma3RMSNorm.extra_reprr   )ru   rv   rw   rx   ry   r*   r   r?   ri   rD   rJ   r   r|   r8   r8   r6   r9   r     s    r   )?__doc__loggingtypingr   r   r   r-   torch.nnr+   torch.nn.functional
functionalr   sglang.srt.batch_invariant_opsr   r   sglang.srt.environr   sglang.srt.layers.utilsr   sglang.srt.server_argsr	   sglang.srt.utilsr
   r   r   r   r   r   r   r   _is_cuda_is_flashinfer_availabler   _is_npur2   rc   _is_cpu_is_xpur   flashinfer.normr   ImportErrorAttributeErrorre   r   r   r   r   rN   aiterr   r   r   r   vllm._custom_ops	getLoggerru   loggerrE   r   r}   r   r   r8   r8   r8   r9   <module>   sd   (
 rHg