o
    i53                     @   s  d dl Z d dlZd dlmZ d dlZd dlmZmZ ddlmZ ddl	m
Z
 ddlmZ e
eZedG d	d
 d
ejZedG dd dejZedG dd dejZedG dd dejZedG dd dejZedG dd dejZG dd dejZG dd dejZG dd dejZG d d! d!ejZG d"d# d#ejZG d$d% d%ejZG d&d' d'eZG d(d) d)ejZi d*ed+ed,d-d.fd/ed0ed1ed2d3ifd4ed5ed6d3ifd7ed8ed9ejd:ed;ed<ed=ej d>ed?ej!d@ej"eej#ej$ej%edAZ&ee&Z'dBdC Z(e(d1Z)e(d0Z*e(d*Z+e(d/Z,e(d<Z-e(dDZ.e(d;Z/e(d:Z0dS )E    N)OrderedDict)Tensornn   )use_kernel_forward_from_hub)logging)is_torchdynamo_compilingGeluTanhc                       L   e Zd ZdZddef fddZdedefdd	Zdedefd
dZ  Z	S )GELUTanha&  
    A fast C implementation of the tanh approximation of the GeLU activation function. See
    https://huggingface.co/papers/1606.08415.

    This implementation is equivalent to NewGELU and FastGELU but much faster. However, it is not an exact numerical
    match due to rounding errors.
    Fuse_gelu_tanh_pythonc                    s2   t    |r| j| _d S tjtjjdd| _d S )Ntanh)approximate)	super__init___gelu_tanh_pythonact	functoolspartialr   
functionalgelu)selfr   	__class__ U/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/activations.pyr   (   s   
zGELUTanh.__init__inputreturnc                 C   s6   |d dt tdtj |dt |d     S N      ?      ?       @Hm?g      @torchr   mathsqrtpipowr   r   r   r   r   r   /      6zGELUTanh._gelu_tanh_pythonc                 C   
   |  |S Nr   r)   r   r   r   forward2      
zGELUTanh.forwardF)
__name__
__module____qualname____doc__boolr   r   r   r.   __classcell__r   r   r   r   r      s
    r   NewGELUc                   @   "   e Zd ZdZdedefddZdS )NewGELUActivationz
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
    the Gaussian Error Linear Units paper: https://huggingface.co/papers/1606.08415
    r   r   c                 C   s6   d| dt tdtj |dt |d     S r   r#   r)   r   r   r   r.   =   r*   zNewGELUActivation.forwardNr1   r2   r3   r4   r   r.   r   r   r   r   r9   6   s    r9   GeLUc                       r
   )GELUActivationa  
    Original Implementation of the GELU activation function in Google BERT repo when initially created. For
    information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional
    Also see the Gaussian Error Linear Units paper: https://huggingface.co/papers/1606.08415
    Fuse_gelu_pythonc                    s(   t    |r| j| _d S tjj| _d S r,   )r   r   _gelu_pythonr   r   r   r   )r   r=   r   r   r   r   J   s   
zGELUActivation.__init__r   r   c                 C   s    |d dt |td   S )Nr   r    r!   )r$   erfr%   r&   r)   r   r   r   r>   Q   s    zGELUActivation._gelu_pythonc                 C   r+   r,   r-   r)   r   r   r   r.   T   r/   zGELUActivation.forwardr0   )
r1   r2   r3   r4   r5   r   r   r>   r.   r6   r   r   r   r   r<   A   s
    r<   SiLUc                   @   r8   )SiLUActivationa  
    See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear
    Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function
    Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated
    Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with
    later.
    r   r   c                 C   s   t j|S r,   )r   r   silur)   r   r   r   r.   b   s   zSiLUActivation.forwardNr:   r   r   r   r   rA   X   s    rA   FastGELUc                   @   r8   )FastGELUActivationz}
    Applies GELU approximation that is slower than QuickGELU but more accurate. See: https://github.com/hendrycks/GELUs
    r   r   c                 C   s*   d| dt |d dd| |     S )Nr   r    g3E?r"   )r$   r   r)   r   r   r   r.   l   s   *zFastGELUActivation.forwardNr:   r   r   r   r   rD   f       rD   	QuickGELUc                   @   r8   )QuickGELUActivationzr
    Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs
    r   r   c                 C   s   |t d|  S )NgZd;?)r$   sigmoidr)   r   r   r   r.   v   s   zQuickGELUActivation.forwardNr:   r   r   r   r   rG   p   rE   rG   c                       s<   e Zd ZdZdedef fddZdedefdd	Z  ZS )
ClippedGELUActivationa  
    Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purpose, as
    it allows mapping negatives values in the GeLU spectrum. For more information on this trick, please refer to
    https://huggingface.co/papers/2004.09602.

    Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when
    initially created.

    For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 +
    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). See https://huggingface.co/papers/1606.08415
    minmaxc                    s8   ||krt d| d| dt   || _|| _d S )Nzmin should be < max (got min: z, max: ))
ValueErrorr   r   rJ   rK   )r   rJ   rK   r   r   r   r      s
   

zClippedGELUActivation.__init__xr   c                 C   s   t t|| j| jS r,   )r$   clipr   rJ   rK   )r   rN   r   r   r   r.         zClippedGELUActivation.forward)	r1   r2   r3   r4   floatr   r   r.   r6   r   r   r   r   rI   z   s    rI   c                       s2   e Zd ZdZ fddZdedefddZ  ZS )AccurateGELUActivationz
    Applies GELU approximation that is faster than default and more accurate than QuickGELU. See:
    https://github.com/hendrycks/GELUs

    Implemented along with MEGA (Moving Average Equipped Gated Attention)
    c                    s    t    tdtj | _d S )N   )r   r   r%   r&   r'   precomputed_constantr   r   r   r   r      s   
zAccurateGELUActivation.__init__r   r   c                 C   s,   d| dt | j|dt |d     S )Nr   r   r"      )r$   r   rT   r(   r)   r   r   r   r.      s   ,zAccurateGELUActivation.forward)r1   r2   r3   r4   r   r   r.   r6   r   r   r   r   rR      s    rR   c                       sD   e Zd ZdZ fddZdedefddZdedefdd	Z  ZS )
MishActivationz
    See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://huggingface.co/papers/1908.08681). Also
    visit the official repository for the paper: https://github.com/digantamisra98/Mish
    c                    s   t    tjj| _d S r,   )r   r   r   r   mishr   rU   r   r   r   r      s   
zMishActivation.__init__r   r   c                 C   s   |t tj| S r,   )r$   r   r   r   softplusr)   r   r   r   _mish_python   rP   zMishActivation._mish_pythonc                 C   r+   r,   r-   r)   r   r   r   r.      r/   zMishActivation.forward)	r1   r2   r3   r4   r   r   rZ   r.   r6   r   r   r   r   rW      s
    rW   c                   @   r8   )LinearActivationz[
    Applies the linear activation function, i.e. forwarding input directly to output.
    r   r   c                 C   s   |S r,   r   r)   r   r   r   r.      s   zLinearActivation.forwardNr:   r   r   r   r   r[      s    r[   c                   @   s   e Zd ZdZdddZdS )LaplaceActivationz
    Applies elementwise activation based on Laplace function, introduced in MEGA as an attention activation. See
    https://huggingface.co/papers/2209.10655

    Inspired by squared relu, but with bounded range and gradient for better stability
    绹۞? ^/?c                 C   s*   ||  |td }ddt|  S )Nr!   r   r    )divr%   r&   r$   r?   )r   r   musigmar   r   r   r.      s   zLaplaceActivation.forwardN)r]   r^   r1   r2   r3   r4   r.   r   r   r   r   r\      s    r\   c                   @   s   e Zd ZdZdd ZdS )ReLUSquaredActivationz`
    Applies the relu^2 activation introduced in https://huggingface.co/papers/2109.08668v2
    c                 C   s   t j|}t|}|S r,   )r   r   relur$   square)r   r   relu_appliedsquaredr   r   r   r.      s   
zReLUSquaredActivation.forwardNrb   r   r   r   r   rc      s    rc   c                       s   e Zd Z fddZ  ZS )ClassInstantierc                    s4   t  |}t|tr|n|i f\}}|di |S )Nr   )r   __getitem__
isinstancetuple)r   keycontentclskwargsr   r   r   ri      s   zClassInstantier.__getitem__)r1   r2   r3   ri   r6   r   r   r   r   rh      s    rh   c                       sf   e Zd ZdZddddejdf fdd	Zded	efd
dZded	efddZ	ded	efddZ
  ZS )XIELUActivationz
    Applies the xIELU activation function introduced in https://arxiv.org/abs/2411.13010

    If the user has installed the nickjbrowning/XIELU wheel, we import xIELU CUDA
    Otherwise, we emit a single warning and use xIELU Python
    g?r   gưFc              
      s  t    ttttj||dd| _	ttttj|| |dd| _
| dtj||d | dtj||d || _t| j    | _t| j    | _d | _zFdd l}tjj | _d}zddlm}	 |	| j| _|d7 }W n ty }
 z|d|
 d	7 }| j| _W Y d }
~
nd }
~
ww t | W d S  ty }
 zt d
t!|
 W Y d }
~
d S d }
~
ww )N)dtyper   betaepszUsing experimental xIELU CUDA.)allow_in_graphz& Enabled torch._dynamo for xIELU CUDA.z+ Could not enable torch._dynamo for xIELU (z*) - this may result in slower performance.u   CUDA-fused xIELU not available (%s) – falling back to a Python version.
For CUDA xIELU (experimental), `pip install git+https://github.com/nickjbrowning/XIELU`)"r   r   r   	Parameterr$   logexpm1tensor	unsqueezealpha_palpha_nregister_bufferwith_vector_loadsrQ   rr   detachcpuitem_beta_scalarrs   _eps_scalar_xielu_cuda_obj	xielu.opsclassesxieluXIELUtorch._dynamort   _xielu_cuda_xielu_cuda_fn	Exceptionloggerwarning_oncestr)r   alpha_p_initalpha_n_initrr   rs   rq   r}   r   msgrt   errr   r   r   r      s@   
	("zXIELUActivation.__init__rN   r   c              
   C   sh   t j| j}| jt j| j }t|dk|| | | j|  tt	|| j
| | | j|  S )Nr   )r   r   rY   rz   rr   r{   r$   whererw   rJ   rs   )r   rN   rz   r{   r   r   r   _xielu_python  s   $zXIELUActivation._xielu_pythonc                 C   s   |j }| dk r|d}| dk s	| dkr$|dd|d}||j kr1td||j  | j|| j	
|j| j
|j| j| j| j}||S )zDFirewall function to prevent torch.compile from seeing .item() callsrV   r   r   z_Warning: xIELU input tensor expects 3 dimensions but got (shape: %s). Reshaping to (shape: %s).)shapedimry   viewsizer   r   r   r.   rz   torq   r{   r   r   r}   )r   rN   original_shaperesultr   r   r   r     s*   


	zXIELUActivation._xielu_cudar   c                 C   s4   | j d ur|jrt s| |S td | |S )Nz:torch._dynamo is compiling, using Python version of xIELU.)r   is_cudar   r   r   r   r   r)   r   r   r   r.   1  s
   


zXIELUActivation.forward)r1   r2   r3   r4   r$   bfloat16r   r   r   r   r.   r6   r   r   r   r   rp      s    	+	rp   r   gelu_10i
   )rJ   rK   	gelu_fastgelu_newgelu_pythonr=   Tgelu_pytorch_tanhgelu_python_tanhr   gelu_accuratelaplace
leaky_relulinearrX   
quick_gelurd   relu2relu6rH   )rB   swishr   prelur   c                 C   s,   | t v rt |  S td|  dtt   )Nz	function z not found in ACT2FN mapping )ACT2FNKeyErrorlistkeys)activation_stringr   r   r   get_activationU  s   r   rB   )1r   r%   collectionsr   r$   r   r   integrations.hub_kernelsr   utilsr   utils.import_utilsr   
get_loggerr1   r   Moduler   r9   r<   rA   rD   rG   rI   rR   rW   r[   r\   rc   rh   rp   	LeakyReLUReLUReLU6Sigmoidr@   TanhPReLUACT2CLSr   r   r   r   r   r   r   rB   rX   
linear_actr   r   r   r   <module>   s   

			^	
