"""Custom normalization layers."""

import torch
import torch.nn as nn
import torch.nn.functional as F

from vllm._aiter_ops import rocm_aiter_ops
from vllm.model_executor.custom_op import CustomOp
from vllm.model_executor.layers.batch_invariant import (
    rms_norm_batch_invariant,
    vllm_is_batch_invariant,
)
from vllm.platforms import current_platform


def rms_norm(
    x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float
) -> torch.Tensor:
    # Imported lazily so that importing this module does not pull in the
    # compiled custom-op extension.
    from vllm import _custom_ops as ops

    if vllm_is_batch_invariant():
        return rms_norm_batch_invariant(x, weight, variance_epsilon)
    out = torch.empty_like(x)
    ops.rms_norm(out, x, weight, variance_epsilon)
    return out
r   residualc                 C   sB   ddl m} t rt| | ||| | fS || ||| | |fS r   )r   r   r   r   fused_add_rms_norm)r   r   r   r	   r   r   r   r   r   #   s   
r   biasc                 C   s,   ddl m} t| }||| ||| |S r   )r   r   r   r   	poly_norm)r   r   r   r	   r   r   r   r   r   r   8   s   
r   Fwith_fused_adddtype	use_aiterc                 C   s8   |o	|t jt jfv }|r| rtjS |rtjS | rtS tS N)r   float16bfloat16r   rms_norm2d_with_addr   r   r   r   r   r   r   r   dispatch_rocm_rmsnorm_funcH   s   r!   c                       s  e Zd ZdZ				ddedededB ded	ejdB d
df fddZ	e
			d dejdededejdejdB dejdB dedB d
ejeejejf B fddZ	d!dejdejdB d
ejeejejf B fddZ	d!dejdejdB d
ejeejejf B fddZ	d!dejdejdB d
ejeejejf B fddZ	d!dejdejdB d
ejeejejf B fddZd
efddZ  ZS )"RMSNormzRoot mean square normalization.

    Computes x -> w * x / sqrt(E[x^2] + eps) where w is the learned weight.
    Refer to https://arxiv.org/abs/1910.07467
    ư>NThidden_sizeepsvar_hidden_size
has_weightr   r
   c                    s   t    || _|| _||krd n|| _|pt }|| _tj||d| _	| jr/t
| j	| _	t rIt }td||d| _td||d| _d S d S )Nr   Fr    T)super__init__r$   r	   variance_size_overrider   get_default_dtyper'   onesr   nn	Parameterr   is_rocmr   is_rmsnorm_enabledr!   rocm_norm_funcrocm_norm_func_with_add)selfr$   r%   r&   r'   r   weight_dtypeaiter_rmsnorm_enabled	__class__r   r   r*   f   s*   
zRMSNorm.__init__r   r	   
orig_dtyper   r   r+   c           	      C   s   |  tj} |dur| | } |  |}| jd |kr'td| d| jd  |du r.| }n||k r<td| d| | ddddd|f }|djddd}| t||  } |  |} |duri| | } |du ro| S | |fS )	6PyTorch-native implementation equivalent to forward().NzExpected hidden_size to be z, but found: z$Expected hidden_size to be at least    Tdimkeepdim)tor   float32shape
ValueErrorpowmeanrsqrt)	r   r	   r$   r9   r   r   r+   x_varvariancer   r   r   forward_static   s6   

zRMSNorm.forward_staticc              	   C   s,   |  || j| j|j| jr| jjnd|| jS r:   N)rI   r	   r$   r   r'   r   datar+   r4   r   r   r   r   r   forward_native   s   zRMSNorm.forward_nativec                 C   sH   | j d ur| ||S |d u}|rt||| jj| jS t|| jj| jS r   )r+   rM   r   r   rK   r	   r   r4   r   r   add_residualr   r   r   forward_cuda   s   
zRMSNorm.forward_cudac                 C   sL   | j d ur| ||S |d u}|r| ||| jj| jS | || jj| jS r   )r+   rM   r3   r   rK   r	   r2   rN   r   r   r   forward_hip   s   
zRMSNorm.forward_hipc                 C   s   |  ||S r   )rP   rL   r   r   r   forward_xpu   s   zRMSNorm.forward_xpuc                 C   s(   d| j jd }|d| j 7 }|S )Nzhidden_size=r   z, eps=)r   rK   sizer	   )r4   sr   r   r   
extra_repr   s   zRMSNorm.extra_repr)r#   NTN)NNNr   )__name__
__module____qualname____doc__intfloatboolr   r   r*   staticmethodTensortuplerI   rM   rP   rQ   rR   strrU   __classcell__r   r   r7   r   r"   \   s     0



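

# Usage sketch (our addition, for illustration only, assuming CustomOp
# subclasses can be instantiated outside a full vLLM runtime): the native path
# mirrors the numerics of the fused CUDA/HIP kernels and runs on CPU. With a
# residual input, the layer returns both the normalized output and the
# updated residual stream (x + residual) in the original dtype.
def _demo_rms_norm() -> None:
    layer = RMSNorm(hidden_size=8, dtype=torch.float32)
    x = torch.randn(2, 4, 8)
    out = layer.forward_native(x)
    out_fused, new_residual = layer.forward_native(x, torch.randn(2, 4, 8))
    assert out.shape == out_fused.shape == new_residual.shape == x.shape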


@CustomOp.register("gemma_rms_norm")
class GemmaRMSNorm(CustomOp):
    """RMS normalization for Gemma.

    Two differences from the above RMSNorm:
        1. x * (1 + w) instead of x * w.
        2. (x * w).to(orig_dtype) instead of x.to(orig_dtype) * w.
    """

    def __init__(self, hidden_size: int, eps: float = 1e-6) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.zeros(hidden_size))
        self.variance_epsilon = eps

    @staticmethod
    def _forward_static_no_residual(
        weight: torch.Tensor,
        variance_epsilon: float,
        x: torch.Tensor,
    ) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward() without residual."""
        orig_dtype = x.dtype
        x = x.float()
        variance = x.pow(2).mean(dim=-1, keepdim=True)
        x = x * torch.rsqrt(variance + variance_epsilon)
        # Gemma computes (x * w).to(orig_dtype) with w offset by 1, rather
        # than x.to(orig_dtype) * w as in RMSNorm above.
        x = x * (1.0 + weight.float())
        x = x.to(orig_dtype)
        return x

    @staticmethod
    def _forward_static_with_residual(
        weight: torch.Tensor,
        variance_epsilon: float,
        x: torch.Tensor,
        residual: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """PyTorch-native implementation equivalent to forward() with residual."""
        orig_dtype = x.dtype
        if orig_dtype == torch.float16:
            # float16 inputs are accumulated in float32 to avoid overflow in
            # the residual addition.
            x = x.float() + residual.float()
        else:
            x = x + residual
        residual = x
        x = x.float()
        variance = x.pow(2).mean(dim=-1, keepdim=True)
        x = x * torch.rsqrt(variance + variance_epsilon)
        x = x * (1.0 + weight.float())
        x = x.to(orig_dtype)
        return x, residual

    def forward_native(
        self,
        x: torch.Tensor,
        residual: torch.Tensor | None = None,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        """PyTorch-native implementation equivalent to forward()."""
        if residual is None:
            return self._forward_static_no_residual(
                self.weight.data, self.variance_epsilon, x
            )
        return self._forward_static_with_residual(
            self.weight.data, self.variance_epsilon, x, residual
        )

    def forward_cuda(
        self,
        x: torch.Tensor,
        residual: torch.Tensor | None = None,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        if torch.compiler.is_compiling():
            return self.forward_native(x, residual)

        if not getattr(self, "_is_compiled", False):
            self._forward_static_no_residual = torch.compile(  # type: ignore
                self._forward_static_no_residual
            )
            self._forward_static_with_residual = torch.compile(  # type: ignore
                self._forward_static_with_residual
            )
            self._is_compiled = True
        return self.forward_native(x, residual)


@CustomOp.register("rms_norm_gated")
class RMSNormGated(CustomOp):
    """RMS Normalization with optional gating.

    This is a native PyTorch implementation that supports:
    - Standard RMS normalization
    - Group RMS normalization
    - Optional gating with SiLU activation
    """

    def __init__(
        self,
        hidden_size: int,
        eps: float = 1e-5,
        group_size: int | None = None,
        norm_before_gate: bool = False,
        device: torch.device | None = None,
        dtype: torch.dtype | None = None,
    ):
        """Initialize RMSNormGated.

        Args:
            hidden_size: Size of the hidden dimension
            eps: Epsilon for numerical stability
            group_size: If not None, do GroupNorm with each group
                        having group_size elements.
                        group_size=None is equivalent to group_size=hidden_size
                        (i.e. there's only 1 group).
            norm_before_gate: If True and z is provided: out = norm(x) * silu(z)
                              If False and z is provided: out = norm(x * silu(z))
            device: Device to create parameters on
            dtype: Data type for parameters
        """
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
        # No bias: registered as None so state-dict consumers still see the slot.
        self.register_parameter("bias", None)
        self.group_size = group_size
        self.norm_before_gate = norm_before_gate
        self.reset_parameters()

    def reset_parameters(self):
        torch.nn.init.ones_(self.weight)

    def forward_native(
        self,
        x: torch.Tensor,
        z: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """
        Native PyTorch implementation of RMS normalization with gating.

        Args:
            x: Input tensor
            z: Optional gating tensor

        Returns:
            Normalized (and optionally gated) tensor

        If z is not None:
            - norm_before_gate=True: out = norm(x) * silu(z)
            - norm_before_gate=False: out = norm(x * silu(z))
        """
        if z is not None and not self.norm_before_gate:
            x = x * F.silu(z)

        if self.group_size is None:
            variance = x.pow(2).mean(dim=-1, keepdim=True)
            x_normed = x * torch.rsqrt(variance + self.eps)
            out = x_normed * self.weight
        else:
            from einops import rearrange

            x_group = rearrange(x, "... (g d) -> ... g d", d=self.group_size)
            variance = x_group.pow(2).mean(dim=-1, keepdim=True)
            x_normed = x_group * torch.rsqrt(variance + self.eps)
            out = rearrange(x_normed, "... g d -> ... (g d)") * self.weight

        if z is not None and self.norm_before_gate:
            out = out * F.silu(z)
        return out

    def forward_cuda(
        self,
        x: torch.Tensor,
        z: torch.Tensor | None = None,
    ) -> torch.Tensor:
        from vllm.model_executor.layers.fla.ops.layernorm_guard import rmsnorm_fn

        return rmsnorm_fn(
            x,
            self.weight,
            self.bias,
            z=z,
            eps=self.eps,
            group_size=self.group_size,
            norm_before_gate=self.norm_before_gate,
        )
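

# Gating-order sketch (our addition, for illustration only), following the
# docstring above: with norm_before_gate=False the gate is applied to the
# input before normalization; with norm_before_gate=True it scales the
# normalized output.
def _demo_rms_norm_gated() -> None:
    x = torch.randn(2, 16)
    z = torch.randn(2, 16)
    pre = RMSNormGated(16, norm_before_gate=False)
    post = RMSNormGated(16, norm_before_gate=True)
    assert torch.allclose(
        pre.forward_native(x, z), pre.forward_native(x * F.silu(z))
    )
    assert torch.allclose(
        post.forward_native(x, z), post.forward_native(x) * F.silu(z)
    )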


class LayerNorm(nn.Module):
    """
    Layer Normalization.
    """

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.dim = dim
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim, dtype=torch.float32))
        self.bias = nn.Parameter(torch.zeros(dim, dtype=torch.float32))

    def forward(self, x: torch.Tensor):
        # Normalize in float32 and cast back to the input dtype at the end.
        return F.layer_norm(
            x.float(), (self.dim,), self.weight, self.bias, self.eps
        ).type_as(x)