"""Custom activation functions."""

import math

import torch
import torch.nn as nn
import torch.nn.functional as F

from vllm.distributed import (
    divide,
    get_tensor_model_parallel_rank,
    get_tensor_model_parallel_world_size,
)
from vllm.logger import init_logger
from vllm.model_executor.custom_op import CustomOp
from vllm.model_executor.utils import set_weight_attrs
from vllm.platforms import current_platform
from vllm.triton_utils import tl, triton
from vllm.utils.collection_utils import LazyDict

logger = init_logger(__name__)

@triton.jit
def _swiglustep_and_mul_kernel(
    o_ptr,
    o_stride,
    x_ptr,
    x_stride,
    d,
    limit,
    BLOCK_SIZE: tl.constexpr,
):
    # One program per (row, block-of-columns) pair.
    i = tl.program_id(axis=0).to(tl.int64)
    j = tl.program_id(axis=1)
    o_row_ptr = o_ptr + o_stride * i
    x_row_ptr = x_ptr + x_stride * i
    offsets = j * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < d

    # Each input row is laid out as [gate (d), up (d)].
    gate = tl.load(x_row_ptr + offsets, mask=mask).to(tl.float32)
    up = tl.load(x_row_ptr + offsets + d, mask=mask).to(tl.float32)

    gate_silu = gate * tl.sigmoid(gate)
    gate_clamped = tl.minimum(gate_silu, limit)
    up_clamped = tl.maximum(tl.minimum(up, limit), -limit)

    result = gate_clamped * up_clamped
    result = result.to(o_ptr.dtype.element_ty)
    tl.store(o_row_ptr + offsets, result, mask=mask)


def swiglustep_and_mul_triton(
    output: torch.Tensor,
    input: torch.Tensor,
    limit: float = 7.0,
) -> None:
    b, n = input.shape
    assert input.ndim == 2
    assert n % 2 == 0
    d = n // 2

    def grid(meta):
        return (b, triton.cdiv(d, meta["BLOCK_SIZE"]))

    _swiglustep_and_mul_kernel[grid](
        output,
        output.stride(0),
        input,
        input.stride(0),
        d,
        limit,
        BLOCK_SIZE=1024,
    )

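# Illustrative sketch (not part of the upstream module): a quick way to check the
# Triton swiglu-step path against a plain PyTorch reference. The helper name, sizes,
# and tolerances are assumptions chosen for the example; it only runs when invoked
# explicitly and needs a CUDA device for the Triton launch.
def _check_swiglustep_triton_against_reference(
    num_tokens: int = 16, d: int = 128, limit: float = 7.0
) -> bool:
    if not torch.cuda.is_available():
        return False
    x = torch.randn(num_tokens, 2 * d, dtype=torch.float16, device="cuda")
    out = torch.empty(num_tokens, d, dtype=x.dtype, device=x.device)
    swiglustep_and_mul_triton(out, x, limit)
    # Reference: silu(gate) clamped from above, up clamped to [-limit, limit].
    gate, up = x.float().chunk(2, dim=-1)
    ref = F.silu(gate).clamp(max=limit) * up.clamp(min=-limit, max=limit)
    return torch.allclose(out.float(), ref, atol=1e-2, rtol=1e-2)
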
dZ	  Z
S )FatreluAndMula5  An activation function for FATReLU.

    The function computes x -> FATReLU(x[:d]) * x[d:] where
    d = x.shape[-1] // 2.
    This is used in openbmb/MiniCPM-S-1B-sft.

    Shapes:
        x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
        return: (num_tokens, d) or (batch_size, seq_len, d)
            	thresholdc                    s@   t    || _t rtjjj| _	d S t
 r| j| _d S d S N)super__init__rB   r   is_cuda_aliketorchops_Cr?   opis_cpuforward_native_forward_method)selfrB   	__class__r.   r/   rE   ]   s   
zFatreluAndMul.__init__xr   c                 C   sF   |j d d }|dd |f }|d|d f }t|| jd}|| S )Nr4   .rA   )r:   FrB   )rN   rQ   r   x1x2r.   r.   r/   rL   e   s
   zFatreluAndMul.forward_nativec                 C   sJ   |j d d }|j d d |f }tj||j|jd}| ||| j |S NrR   r4   )r   device)r:   rG   emptyr   rW   rJ   rB   rN   rQ   r   output_shapeoutr.   r.   r/   forward_cudal   s
   zFatreluAndMul.forward_cuda)rA   )__name__
__module____qualname____doc__floatrE   rG   TensorrL   r\   __classcell__r.   r.   rO   r/   r@   N   s
    r@   silu_and_mulc                       sr   e Zd ZdZdddef fddZedejdejfd	d
Z	dejdejfddZ
dejdejfddZ  ZS )
@CustomOp.register("silu_and_mul")
class SiluAndMul(CustomOp):
    """An activation function for SwiGLU.

    The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2.

    Shapes:
        x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
        return: (num_tokens, d) or (batch_size, seq_len, d)
    """

    def __init__(self, compile_native: bool = True):
        super().__init__(compile_native=compile_native)
        if current_platform.is_cuda_alike() or current_platform.is_xpu():
            self.op = torch.ops._C.silu_and_mul
        elif current_platform.is_cpu():
            self._forward_method = self.forward_native

    @staticmethod
    def forward_native(x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        d = x.shape[-1] // 2
        return F.silu(x[..., :d]) * x[..., d:]

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        d = x.shape[-1] // 2
        output_shape = x.shape[:-1] + (d,)
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        self.op(out, x)
        return out

    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
        return self.forward_cuda(x)

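# Illustrative sketch (not part of the upstream module): typical use of SiluAndMul on
# the concatenated [gate, up] output of a fused gate/up projection. The sizes are
# made-up example values; forward_native is the device-agnostic reference path.
def _example_silu_and_mul(num_tokens: int = 4, d: int = 8) -> torch.Tensor:
    gate_up = torch.randn(num_tokens, 2 * d)
    out = SiluAndMul.forward_native(gate_up)  # -> (num_tokens, d)
    assert out.shape == (num_tokens, d)
    assert torch.allclose(out, F.silu(gate_up[..., :d]) * gate_up[..., d:])
    return out
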
@CustomOp.register("mul_and_silu")
class MulAndSilu(CustomOp):
    """An activation function for SwiGLU.

    The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2.

    Shapes:
        x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
        return: (num_tokens, d) or (batch_size, seq_len, d)
    """

    def __init__(self):
        super().__init__()
        if current_platform.is_cuda_alike() or current_platform.is_xpu():
            self.op = torch.ops._C.mul_and_silu
        elif current_platform.is_cpu():
            self._forward_method = self.forward_native

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        d = x.shape[-1] // 2
        return x[..., :d] * F.silu(x[..., d:])

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        d = x.shape[-1] // 2
        output_shape = x.shape[:-1] + (d,)
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        self.op(out, x)
        return out

    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
        return self.forward_cuda(x)

@CustomOp.register("gelu_and_mul_sparse")
class GeluAndMulSparse(CustomOp):
    """An activation function for GeluAndMulSparse.
    This activation function is used in Gemma3n. It computes:
        up_proj = self.up_proj(x)
        gate_proj = self.gate_proj(x)
        gate_proj = self._gaussian_topk(gate_proj) # sparsity
        activations = self.act_fn(gate_proj) # gelu
        down_proj = self.down_proj(activations * up_proj)
    Shapes:
        x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
        return: (num_tokens, d) or (batch_size, seq_len, d)
    """

    def __init__(self, activation_sparsity: float, approximate: str = "none"):
        super().__init__()

        self.approximate = approximate
        if approximate not in ("none", "tanh"):
            raise ValueError(f"Unknown approximate mode: {approximate}")
        if current_platform.is_rocm() and approximate == "tanh":
            logger.warning_once(
                "[ROCm] Pytorch's native GELU with tanh approximation is "
                "currently unstable and produces garbage. Fallback to "
                "'none' approximation."
            )
            self.approximate = "none"
        if activation_sparsity == 0.0:
            raise ValueError("activation_sparsity is 0.0. Please use GeluAndMul.")

        target_sparsity_tensor = torch.tensor(activation_sparsity, dtype=torch.float32)
        normal_dist = torch.distributions.normal.Normal(0, 1)
        self.std_multiplier = normal_dist.icdf(target_sparsity_tensor)

    def _gaussian_topk(self, x: torch.Tensor) -> torch.Tensor:
        """Get % sparse percentile of the Gaussian distribution."""
        mean = torch.mean(x, dim=-1, keepdim=True)
        std = torch.std(x, dim=-1, keepdim=True, unbiased=False)
        cutoff_x = mean + std * self.std_multiplier
        return nn.functional.relu(x - cutoff_x)

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        d = x.shape[-1] // 2
        out = self._gaussian_topk(x[..., :d])
        out = F.gelu(out, approximate=self.approximate)
        return out * x[..., d:]

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        return self.forward_native(x)

@CustomOp.register("gelu_and_mul")
class GeluAndMul(CustomOp):
    """An activation function for GeGLU.

    The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2.

    Shapes:
        x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d)
        return: (batch_size, seq_len, d) or (num_tokens, d)
    """

    def __init__(self, approximate: str = "none"):
        super().__init__()
        self.approximate = approximate
        if approximate not in ("none", "tanh"):
            raise ValueError(f"Unknown approximate mode: {approximate}")
        if (
            current_platform.is_cuda_alike()
            or current_platform.is_cpu()
            or current_platform.is_xpu()
        ):
            if approximate == "none":
                self.op = torch.ops._C.gelu_and_mul
            elif approximate == "tanh":
                self.op = torch.ops._C.gelu_tanh_and_mul
        if current_platform.is_rocm() and approximate == "tanh":
            logger.warning_once(
                "[ROCm] PyTorch's native GELU with tanh approximation is "
                "unstable with torch.compile. For native implementation, "
                "fallback to 'none' approximation. The custom kernel "
                "implementation is unaffected."
            )

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        approximate = self.approximate
        if current_platform.is_rocm() and approximate == "tanh":
            approximate = "none"
        d = x.shape[-1] // 2
        return F.gelu(x[..., :d], approximate=approximate) * x[..., d:]

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        d = x.shape[-1] // 2
        output_shape = x.shape[:-1] + (d,)
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        self.op(out, x)
        return out

    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
        return self.forward_cuda(x)

    def extra_repr(self) -> str:
        return f"approximate={repr(self.approximate)}"

@CustomOp.register("swigluoai_and_mul")
class SwigluOAIAndMul(CustomOp):
    def __init__(self, alpha: float = 1.702, limit: float = 7.0):
        super().__init__()
        self.alpha = alpha
        self.limit = limit

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        # Gate and up values are interleaved along the last dimension.
        gate, up = x[..., ::2], x[..., 1::2]
        gate = gate.clamp(min=None, max=self.limit)
        up = up.clamp(min=-self.limit, max=self.limit)
        glu = gate * torch.sigmoid(gate * self.alpha)
        gated_output = (up + 1) * glu
        return gated_output

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        d = x.shape[-1] // 2
        output_shape = x.shape[:-1] + (d,)
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        torch.ops._C.swigluoai_and_mul(out, x, self.alpha, self.limit)
        return out

    def extra_repr(self) -> str:
        return f"alpha={repr(self.alpha)}, limit={repr(self.limit)}"

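# Illustrative sketch (not part of the upstream module): unlike SiluAndMul, which splits
# the last dimension into contiguous [gate, up] halves, SwigluOAIAndMul.forward_native
# reads gate and up interleaved along the last dimension (x[..., ::2] and x[..., 1::2]),
# matching the GPT-OSS projection layout. Sizes below are arbitrary example values.
def _example_swigluoai_layout(num_tokens: int = 2, d: int = 4) -> torch.Tensor:
    act = SwigluOAIAndMul()  # alpha=1.702, limit=7.0 by default
    x = torch.randn(num_tokens, 2 * d)
    gate = x[..., ::2].clamp(max=act.limit)
    up = x[..., 1::2].clamp(min=-act.limit, max=act.limit)
    expected = (up + 1) * (gate * torch.sigmoid(gate * act.alpha))
    out = act.forward_native(x)
    assert out.shape == (num_tokens, d)
    assert torch.allclose(out, expected)
    return out
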
dZ	de
fddZ  ZS )SwigluStepAndMula*  An activation function for SwiGLU with clamping.

    Computes x -> silu(x[:d]).clamp(max=limit) * x[d:].clamp(-limit, limit)
    where d = x.shape[-1] // 2.

    Shapes:
        x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
        return: (num_tokens, d) or (batch_size, seq_len, d)
    r1   r   c                    s$   t    |d u rtd|| _d S )Nz*SwigluStepAndMul requires limit to be set.)rD   rE   r   r   )rN   r   rO   r.   r/   rE   a  s   

zSwigluStepAndMul.__init__rQ   r   c                 C   sF   |j ddd\}}t|}|j| jd}|j| j | jd}|| S )rj   r4   rR   )r   )r   r   )chunkrS   rl   r   r   )rN   rQ   r(   r)   r.   r.   r/   rL   g  s
   
zSwigluStepAndMul.forward_nativec                 C   sH   |j d d }|j d d |f }tj||j|jd}t||| j |S rV   )r:   rG   rX   r   rW   r>   r   rY   r.   r.   r/   r\   o  s
   zSwigluStepAndMul.forward_cudac                 C   r   )Nzlimit=)r   r   ry   r.   r.   r/   r   v  r   zSwigluStepAndMul.extra_reprr1   )r]   r^   r_   r`   ra   rE   rG   rb   rL   r\   r   r   rc   r.   r.   rO   r/   r   U  s    
r   gelu_newc                       ^   e Zd Z fddZdejdejfddZdejdejfddZdejdejfd	d
Z  Z	S )NewGELUc                    6   t    t st st rtjjj	| _
d S d S rC   )rD   rE   r   rF   rK   rh   rG   rH   rI   r   rJ   ry   rO   r.   r/   rE        
zNewGELU.__init__rQ   r   c                 C   s:   t dt j }d| dt||dt|d     S )rj   g       @      ?      ?Hm?g      @)mathsqrtpirG   r   pow)rN   rQ   cr.   r.   r/   rL     s   *zNewGELU.forward_nativec                 C      t |}| || |S rC   rG   
empty_likerJ   rN   rQ   r[   r.   r.   r/   r\        
zNewGELU.forward_cudac                 C   rp   rC   rq   rr   r.   r.   r/   rs     rt   zNewGELU.forward_xpu
r]   r^   r_   rE   rG   rb   rL   r\   rs   rc   r.   r.   rO   r/   r   {  s
    	r   	gelu_fastc                       r   )FastGELUc                    r   rC   )rD   rE   r   rF   rK   rh   rG   rH   rI   r   rJ   ry   rO   r.   r/   rE     r   zFastGELU.__init__rQ   r   c                 C   s*   d| dt |d dd| |     S )rj   r   r   g3E?r   )rG   r   rr   r.   r.   r/   rL     s   *zFastGELU.forward_nativec                 C   r   rC   r   r   r.   r.   r/   r\     r   zFastGELU.forward_cudac                 C   rp   rC   rq   rr   r.   r.   r/   rs     rt   zFastGELU.forward_xpur   r.   r.   rO   r/   r     s
    	r   
@CustomOp.register("quick_gelu")
class QuickGELU(CustomOp):
    def __init__(self):
        super().__init__()
        if (
            current_platform.is_cuda_alike()
            or current_platform.is_cpu()
            or current_platform.is_xpu()
        ):
            self.op = torch.ops._C.gelu_quick

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        return x * torch.sigmoid(1.702 * x)

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        out = torch.empty_like(x)
        self.op(out, x)
        return out

    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
        return self.forward_cuda(x)


@CustomOp.register("relu2")
class ReLUSquaredActivation(CustomOp):
    """
    Applies the relu^2 activation introduced in https://arxiv.org/abs/2109.08668v2
    """

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        return torch.square(F.relu(x))

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        return self.forward_native(x)

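# Illustrative sketch (not part of the upstream module): relu^2 is simply
# max(x, 0) squared, applied elementwise; the input values are arbitrary.
def _example_relu_squared() -> torch.Tensor:
    x = torch.tensor([-2.0, -0.5, 0.0, 0.5, 2.0])
    out = ReLUSquaredActivation().forward_native(x)
    assert torch.allclose(out, x.clamp(min=0.0) ** 2)  # -> [0, 0, 0, 0.25, 4]
    return out
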
ejdef fddZ	dej
dej
fddZdej
dej
fddZdej
dej
fddZdej
dej
fddZ  ZS )XIELUz
    Applies the xIELU activation function introduced in https://arxiv.org/abs/2411.13010
    If the user has installed the nickjbrowning/XIELU, we import xIELU CUDA
    Otherwise, we emit a single warning and use xIELU Python
    g?r   gưFalpha_p_initalpha_n_initbetaepsr   with_vector_loadsc              
      s  t    ttttj||dd d| _	ttttj|| |dd d| _
| dtj||d | dtj||d || _t| j    | _t| j    | _d | _zFdd l}tjj | _d}zddlm}	 |	| j| _|d7 }W n ty }
 z|d	|
 d
7 }| j| _W Y d }
~
nd }
~
ww t | W d S  ty }
 zt dt!|
 W Y d }
~
d S d }
~
ww )Nr   r   r   r   r   zUsing experimental xIELU CUDA.)allow_in_graphz& Enabled torch._dynamo for xIELU CUDA.z+ Could not enable torch._dynamo for xIELU (z*) - this may result in slower performance.u   CUDA-fused xIELU not available (%s) – falling back to a Python version.
For CUDA xIELU (experimental), `pip install git+https://github.com/nickjbrowning/XIELU`)"rD   rE   r   	ParameterrG   logexpr   	unsqueezealpha_palpha_nregister_bufferr   ra   r   detachcpuitem_beta_scalarr   _eps_scalar_xielu_cuda_obj	xielu.opsclassesr   r   torch._dynamor   _xielu_cuda_xielu_cuda_fn	Exceptionr   r   r   )rN   r   r   r   r   r   r   r   msgr   errrO   r.   r/   rE     sR   
	
zXIELU.__init__rQ   r   c              
   C   sh   t j| j}| jt j| j }t|dk|| | | j|  tt	|| j
| | | j|  S Nr   )r   r   softplusr   r   r   rG   whereexpm1r   r   )rN   rQ   r   r   r.   r.   r/   _xielu_python  s   $zXIELU._xielu_pythonc                 C   s   | j dus	J d|j}| dk r|d}| dk s| dkr-|dd|d}||jkr:td||j | j || j	| j
| j| j| j}||S )z>Firewall function to prevent torch.compile from seeing .item()Nz"XIELU CUDA object must not be None   r   rR   r   z_Warning: xIELU input tensor expects 3 dimensions but got (shape: %s). Reshaping to (shape: %s).)r   r:   r   r   viewsizer   r   forwardr   r   r   r   r   )rN   rQ   original_shaper-   r.   r.   r/   r   (  s,   



zXIELU._xielu_cudar3   c                 C   s8   | j d ur|jrtj s| |S td | |S )Nz:torch._dynamo is compiling, using Python version of xIELU.)	r   is_cudarG   _dynamois_compilingr   r   r   r   rN   r3   r.   r.   r/   rL   D  s   


zXIELU.forward_nativec                 C   rp   rC   r   r   r.   r.   r/   r\   N  rt   zXIELU.forward_cuda)r]   r^   r_   r`   rG   bfloat16ra   r   ru   rE   rb   r   r   rL   r\   rc   r.   r.   rO   r/   r     s2    
5	
r   c                
       sl   e Zd ZdZ		ddejdededej	dB f fdd	Z
d
ejdejfddZdejdejfddZ  ZS )ScaledActivationzqAn activation function with post-scale parameters.

    This is used for some quantization methods like AWQ.
    TN
act_moduleintermediate_sizeinput_is_parallelparams_dtypec                    sl   t    || _|| _|rt }t||}n|}|d u r t }t	tj
||d| _t| jd| ji d S )Nr   weight_loader)rD   rE   actr   r   r   rG   get_default_dtyper   r   rX   scalesr   r   )rN   r   r   r   r   tp_sizeintermediate_size_per_partitionrO   r.   r/   rE   X  s   
zScaledActivation.__init__rQ   r   c                 C   s   |  || j S rC   )r   r  rr   r.   r.   r/   r   n  r   zScaledActivation.forwardparamloaded_weightc                 C   sP   |j }| jrt }|jd }|| }|d||}|j|jks!J || d S r   )datar   r   r:   narrowcopy_)rN   r  r  
param_datatp_rank
shard_size	start_idxr.   r.   r/   r   q  s   
zScaledActivation.weight_loader)TN)r]   r^   r_   r`   r   Moduleintru   rG   r   rE   rb   r   r   r   rc   r.   r.   rO   r/   r   R  s    	r   c                   C      t  S rC   )r   GELUr.   r.   r.   r/   <lambda>~      r  c                   C      t  S rC   )r   r.   r.   r.   r/   r        c                   C   r  rC   )r   r.   r.   r.   r/   r    r  c                   C   s.   t  rtdtjddfd S tjddS )Nzk[ROCm] PyTorch's native GELU with tanh approximation is unstable. Falling back to GELU(approximate='none').r|   r   r   r   )r   r   r   r   r   r  r.   r.   r.   r/   r    s   
	c                   C   r  rC   )r   ReLUr.   r.   r.   r/   r    r  c                   C   r  rC   )r   r.   r.   r.   r/   r    r  c                   C   r  rC   )r   SiLUr.   r.   r.   r/   r    r  c                   C   r  rC   )r   r.   r.   r.   r/   r    r  c                   C   r  rC   )r   Tanhr.   r.   r.   r/   r    r  c                   C   r  rC   )r   Sigmoidr.   r.   r.   r/   r    r  c                   C   r  rC   )r   r.   r.   r.   r/   r    r  )r   r   r   gelu_pytorch_tanhr   r   rl   r   r   r   r   act_fn_namec                 C   sT   |   } | dr| dd }|dkrt S |} | tvr&td| dt|  S )z#Get an activation function by name.ztorch.nn.modules..rR   identityActivation function  is not supported.)lower
startswithsplitr   Identity_ACTIVATION_REGISTRYr   )r  activation_namer.   r.   r/   
get_act_fn  s   
r&  c                   C   r  rC   r   r.   r.   r.   r/   r    r  c                   C   r  rC   )re   r.   r.   r.   r/   r    r  c                   C   r  rC   r'  r.   r.   r.   r/   r    r  c                  O   s   t | i |S rC   )r   )argskwargsr.   r.   r/   r    s    )r   rl   geglu	swigluoaic                 C   s(   |   } | tvrtd| dt|  S )z=Get an activation-and-mul (i.e. SiluAndMul) function by name.r  r  )r   _ACTIVATION_AND_MUL_REGISTRYr   )r  r.   r.   r/   get_act_and_mul_fn  s   r-  r   )5r`   r   rG   torch.nnr   torch.nn.functionalr   rS   vllm.distributedr   r   r   vllm.loggerr   vllm.model_executor.custom_opr   vllm.model_executor.utilsr   vllm.platformsr   vllm.triton_utilsr	   r
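# Illustrative sketch (not part of the upstream module): how the two registries are
# typically consumed. The names passed here are example values; LazyDict constructs
# each module lazily on first lookup.
def _example_registry_lookup() -> tuple[nn.Module, nn.Module]:
    act = get_act_fn("gelu_new")              # -> NewGELU()
    act_and_mul = get_act_and_mul_fn("silu")  # -> SiluAndMul()
    return act, act_and_mul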