o
    پiAH                     @   s  d Z ddlmZmZmZ ddlZddlmZ ddlm  m	Z
 ddlmZ e Ze Zer7ddlmZmZ er=ddlZddlmZmZ ddlmZ ddlmZ dd	lmZmZ dd
lm Z m!Z!m"Z" ddl#m$Z$ ddl%m&Z& e$'dG dd de$Z(e$'dG dd de$Z)G dd dej)Z*deej+ deej+ fddZ,G dd de$Z-G dd de-Z.G dd de-Z/G dd  d e$Z0G d!d" d"e0Z1G d#d$ d$e0Z2	%d5d&ej+d'ej+d(dd)dd*e3d+e4deej+ej+f fd,d-Z5d.ej+d/ddej+fd0d1Z6d.ej+d2e1fd3d4Z7dS )6zCustom normalization layers.    )OptionalTupleUnionN)current_platform)fused_add_rmsnormrmsnorm)
norm_inferrms_norm_fn)triton_one_pass_rms_norm)fuse_scale_shift_kernel)can_use_fused_inplace_qknormfused_inplace_qknorm)get_tensor_model_parallel_rank$get_tensor_model_parallel_world_sizeget_tp_group)CustomOp)get_bool_env_varrms_normc                       s  e Zd ZdZdejdfdededejde	e ddf
 fd	d
Z
ddejde	ej fddZ	ddejde	ej deejeejejf f fddZ	ddejde	ej deejeejejf f fddZ	ddejde	ej deejeejejf f fddZ	ddejde	ej deejeejejf f fddZ	ddejde	ej deejeejejf f fddZdefddZ  ZS )RMSNormzRoot mean square normalization.

    Computes x -> w * x / sqrt(E[x^2] + eps) where w is the learned weight.
    Refer to https://arxiv.org/abs/1910.07467
    ư>Nhidden_sizeepsdtypevar_hidden_sizereturnc                    sR   t    tt|| _|| _|| _||krd n|| _	t
dr'| j| _d S d S )N%SGLANG_ENABLE_DETERMINISTIC_INFERENCE)super__init__nn	Parametertorchonesweightvariance_epsilonr   variance_size_overrider   forward_native_forward_method)selfr   r   r   r   	__class__ b/home/ubuntu/.local/lib/python3.10/site-packages/sglang/multimodal_gen/runtime/layers/layernorm.pyr   -   s   
zRMSNorm.__init__xresidualc                 C   s   t || jd || jdS )N)biasr-   r   )r	   r"   r#   r'   r,   r-   r*   r*   r+   forward_triton>   s   zRMSNorm.forward_tritonc                 C   s  |j }|j}|d|d }|d ur|j }|d|d }|jtjkrB| ||}|d ur;|d ||d |fS ||}|S | jd urM| 	||S |d uret
||| jj| j ||||fS |j d dkrvt|| jj| j}n	t|| jj| j}||}|S )Nr         )shapedevicereshapeviewr   r    floatr0   r$   r%   r   r"   datar#   r
   r   )r'   r,   r-   r4   r5   residual_shapeoutr*   r*   r+   forward_cudaC   s0   


zRMSNorm.forward_cudac                 C   s   |  s| }|j}|tj}|d ur"||tj }||}|jd }|| jkr7td| j d| | j	d u r?|}n|| j	k rOtd| j	 d| |dd | j	f }|
djddd}|t|| j  }|| j |}|d u rz|S ||fS )	Nr1   zExpected hidden_size to be z, but found: z$Expected hidden_size to be at least .   Tdimkeepdim)is_contiguous
contiguousr   tor    float32r4   r   
ValueErrorr$   powmeanrsqrtr#   r"   )r'   r,   r-   
orig_dtyper   x_varvariancer*   r*   r+   r%   f   sB   




zRMSNorm.forward_nativec                 C      |  ||S Nr%   r/   r*   r*   r+   forward_cpu      zRMSNorm.forward_cpuc                 C   sD   |d urt ||| jj| j\}}}||fS t || jj| jd S )Nr   )	torch_npunpu_add_rms_normr"   r9   r#   npu_rms_norm)r'   r,   r-   r;   _residual_outr*   r*   r+   forward_npu   s   
zRMSNorm.forward_npuc                 C   rL   rM   rN   r/   r*   r*   r+   forward_hip   s   zRMSNorm.forward_hipc                 C   (   d| j jd }|d| j 7 }|S Nzhidden_size=r   z, eps=r"   r9   sizer#   r'   sr*   r*   r+   
extra_repr      zRMSNorm.extra_reprrM   )__name__
__module____qualname____doc__r    rD   intr8   r   r   r   Tensorr0   r   r   r<   r%   rO   rV   rW   strr^   __classcell__r*   r*   r(   r+   r   %   st    	
&
*



r   
layer_normc                       s  e Zd Z					ddededdf fddZd	ejdejfd
dZd	ejfddZ	d	ejde
ejeejejf f fddZejde d	dd	ejdeej de
ejeejejf f fddZ	dd	ejdeej de
ejeejejf f fddZdefddZ  ZS )	LayerNormh㈵>TNr   r.   r   c                    s   t    || _||d}|| _|r7tjtj|fi || _|r2tjtj|fi || _	d S d | _	d S | 
dd  | 
dd  d | _d S )Nr5   r   r"   r.   )r   r   r   r   r    r   r   emptyr"   r.   register_parameter_weight_fallback_cache)r'   r   r   r.   elementwise_affiner5   r   factory_kwargsr(   r*   r+   r      s   
	

zLayerNorm.__init__r,   c                 C   sZ   t | dd }|d u s|j|jks|j|jks| | jkr+tj| j|j|jd}|| _|S )Nrn   rk   )getattrr5   r   numelr   r    r!   rn   )r'   r,   wfr*   r*   r+   _get_weight_fallback   s   zLayerNorm._get_weight_fallbackc                 C   s*   t |d| j| j| j| jdd|jS Nr1   F)r   is_rms_norm)r   r7   r   r"   r.   r   r4   )r'   r,   r*   r*   r+   r0      s   zLayerNorm.forward_tritonc                 C   s$   |j }|d| j}| ||S )Nr1   )r4   r7   r   r0   )r'   r,   r4   r*   r*   r+   r<      s   zLayerNorm.forward_cudainductor)backenddisabler-   c                 C   sv   |j }|jddd}|| djddd}|| t|| j  }| jd ur,| j| }| jd ur6|| j }||S )Nr1   T)r@   r=   )	r   rG   rF   r    rH   r   r"   r.   rC   )r'   r,   r-   input_dtyperG   rK   r*   r*   r+   r%      s   




zLayerNorm.forward_nativec                 C   rL   rM   rN   r/   r*   r*   r+   rO      rP   zLayerNorm.forward_cpuc                 C   rX   rY   rZ   r\   r*   r*   r+   r^     r_   zLayerNorm.extra_repr)rj   TTNNrM   )r`   ra   rb   rd   boolr   r    re   rt   r0   r   r   r<   compiler   is_npur   r%   rO   rf   r^   rg   r*   r*   r(   r+   ri      sL    


ri   c                   @   s"   e Zd ZdejdejfddZdS )FP32LayerNorminputsr   c                 C   sd   |j }|j}t| | j| jd ur| j j|dnd | jd ur*| j j|dnd | j	|S )N)r5   )
r   r5   Frh   r8   normalized_shaper"   rC   r.   r   )r'   r   origin_dtyper5   r*   r*   r+   forward  s   zFP32LayerNorm.forwardN)r`   ra   rb   r    re   r   r*   r*   r*   r+   r~   
  s    r~   tensorr   c                 C   s   | d ur|   S d S rM   )rB   )r   r*   r*   r+   _ensure_contiguous  s   r   c                       s   e Zd ZU dZeed< ddejdfdede	de
d	ejd
ef
 fddZdejdejdejeB dejdejdeejejf fddZdd ZdejdejdejeB dejdejdeejejf fddZ  ZS )_ScaleResidualNormScaleShiftz
    Fused kernel that combines:
    1. residual_out = residual + gate * x
    2. normed = layernorm(residual_out) or rmsnorm(residual_out)
    3. out = normed * (1 + scale) + shift
    compute_dtype is always fp32 for higher precision.
    	norm_typer   F r   r   ro   r   prefixc                    sf   t    || _|| _| jdkrt|||d| _d S | jdkr*t||||d| _d S td| j dNrms)r   r   layer)ro   r   r   z
Norm type z not implemented)	r   r   r   r   r   r   normr~   NotImplementedErrorr'   r   r   ro   r   r   r(   r*   r+   r   )  s   


z%_ScaleResidualNormScaleShift.__init__r-   r,   gateshiftscaler   c              
   C   s   |j d d dkr$|j d dkr$dd l}|jddd | |||||S ddlm} t|tr:|d	kr:td
| ||	 |	 t|t
jrK|	 nd tt| jdd tt| jdd |	 |	 | j| j	S )Nr1      r       zJFusedScaleResidualNormScaleShift cuda not available, using native fallbackr=   
stacklevel)%fused_scale_residual_norm_scale_shiftr2   z8Only gate value of 1 is supported for int type, but got r"   r.   )r4   warningswarnr%   Csglang.jit_kernel.diffusion.cutedsl.scale_residual_norm_scale_shiftr   
isinstancerd   rE   rB   r    re   r   rq   r   r   r   )r'   r-   r,   r   r   r   r   r   r*   r*   r+   r<   =  s.    z)_ScaleResidualNormScaleShift.forward_cudac                 O      | j |i |S rM   rN   r'   argskwargsr*   r*   r+   rW   c     z(_ScaleResidualNormScaleShift.forward_hipc                 C   s   t |tr|dksJ || }n;t |tjrA| dkr:|jd }|jd | }||jd||fd| dd }n|||  }n
tdt	| d| 
|}	t|	||}
|
|fS )Nr2      )r?   sizesr=   z
Gate type z not supported)r   rd   r    re   r?   r4   	unflattenflattenrE   typer   r   )r'   r-   r,   r   r   r   residual_output
num_framesframe_seqlen
normalized	modulatedr*   r*   r+   r%   h  s    
	


z+_ScaleResidualNormScaleShift.forward_native)r`   ra   rb   rc   rf   __annotations__r    rD   rd   r8   r{   r   r   re   tupler<   rW   r%   rg   r*   r*   r(   r+   r     sZ   
 
&r   c                   @      e Zd ZdZdS ) ScaleResidualLayerNormScaleShiftr   Nr`   ra   rb   r   r*   r*   r*   r+   r         r   c                   @   r   )ScaleResidualRMSNormScaleShiftr   Nr   r*   r*   r*   r+   r     r   r   c                       s   e Zd ZU dZeed< ddejdfdede	de
d	ejd
ef
 fddZdejdejdejdejfddZdd ZdejdejdejdejfddZ  ZS )_NormScaleShiftz
    Fused kernel that combines:
    1. normed = layernorm(x) or rmsnorm(x)
    2. out = normed * (1 + scale) + shift
    compute_dtype is always fp32 for higher precision.
    r   r   Fr   r   r   ro   r   r   c                    s`   t    || _| jdkrt|||d| _d S | jdkr't||||d| _d S td| j dr   )r   r   r   r   r   r   r~   r   r   r(   r*   r+   r     s   


z_NormScaleShift.__init__r,   r   r   r   c                 C   s   |j d d dkr"|j d dkr"dd l}|jddd | |||S ddlm} || tt| j	d	d tt| j	d
d | | | j
| jS )Nr1   r   r   r   z=FusedNormScaleShift cuda not available, using native fallbackr=   r   )fused_norm_scale_shiftr"   r.   )r4   r   r   r%   r   r   rB   r   rq   r   r   r   )r'   r,   r   r   r   r   r*   r*   r+   r<     s"    z_NormScaleShift.forward_cudac                 O   r   rM   rN   r   r*   r*   r+   rW     r   z_NormScaleShift.forward_hipc                 C   s"   |  |}t|||}||jS rM   )r   r   rC   r   )r'   r,   r   r   r   r   r*   r*   r+   r%     s   
z_NormScaleShift.forward_native)r`   ra   rb   rc   rf   r   r    rD   rd   r8   r{   r   r   re   r<   rW   r%   rg   r*   r*   r(   r+   r     sJ   
 
r   c                   @   r   )LayerNormScaleShiftr   Nr   r*   r*   r*   r+   r     r   r   c                   @   r   )RMSNormScaleShiftr   Nr   r*   r*   r*   r+   r     r   r   Tqkq_normk_normhead_dimallow_inplacec                 C   s   |  d}|j}|j}tr3|r3||kr3t|| jr3t| |d|||d||j|j||d | |fS | j}	|j}
|| d||	}||d||
}||fS )zApply QK normalization for query and key tensors.

    Uses JIT fused inplace kernel when available, falls back to standard RMSNorm.
    r   r1   )r   r   q_weightk_weightr   r   )	r[   r#   _is_cudar   r   r   r7   r"   r4   )r   r   r   r   r   r   
batch_sizeq_epsk_epsq_shapek_shapeq_outk_outr*   r*   r+   apply_qk_norm  s0   

r   r,   r   c           	      C   s~   t  }t }| j}|j||  }|  }|djddd}t j	|t
jjjjd}|t
||j  | }|j|dS )Nr=   r1   Tr>   )op)r   )r   r   r   r"   tensor_splitr8   rF   rG   r   
all_reducer    _C_distributed_c10dReduceOpAVGrH   r#   rC   )	r,   r   tp_ranktp_size	src_dtyper"   x_fp32rK   outputr*   r*   r+   tensor_parallel_rms_norm  s   r   layernorm_scale_shiftc                 C   s2   t | d| jd |jj|jj|jdd| jS ru   )r   r7   r4   r   r"   r.   r   )r,   r   r*   r*   r+   apply_layernorm_only  s   r   )T)8rc   typingr   r   r   r    torch.nnr   torch.nn.functional
functionalr   'sglang.multimodal_gen.runtime.platformsr   is_cudar   r}   _is_npu
sgl_kernelr   r   rQ   'sglang.jit_kernel.diffusion.triton.normr   r	   2sglang.jit_kernel.diffusion.triton.rmsnorm_onepassr
   .sglang.jit_kernel.diffusion.triton.scale_shiftr   sglang.jit_kernel.normr   r   8sglang.multimodal_gen.runtime.distributed.parallel_stater   r   r   .sglang.multimodal_gen.runtime.layers.custom_opr   *sglang.multimodal_gen.runtime.utils.commonr   registerr   ri   r~   re   r   r   r   r   r   r   r   rd   r{   r   r   r   r*   r*   r*   r+   <module>   sd    
ZiD

(