o
    i                     @   s   d dl Zd dlZd dlmZ d dlm  mZ d dlm	Z	m
Z
 d dlmZmZ d dlmZmZ G dd dejjZejZG dd dejZdS )	    N)
custom_fwd
custom_bwd)
sqrelu_bwd
sqrelu_fwd)triton_dgrad_acttriton_linear_actc                   @   s.   e Zd ZeedddZeedd ZdS )FusedDenseSqreluDenseFuncr   c                    sp  t  rt    fdd|||||fD \}}}}}|jt jk}|dv s'J | }| }| }| }| }|jdd |jd }}	| }
|r^t	|
|
|	||}t|}n|dk}t|
|
|	||d|d}|rv|\}}n|}t	|||}|| _|d	kr| |||||| n|d
kr| ||||| n|dkr| |||| |j
g ||jd R  S )zcheckpoint_lvl:
        0: no recomputation in the bwd
        1: recompute gelu_out in the bwd
        2: recompute act_input and gelu_out in the bwd
        c                    s   g | ]}|j  d qS )dtype)to).0ar	    O/home/ubuntu/vllm_env/lib/python3.10/site-packages/flash_attn/ops/triton/mlp.py
<listcomp>   s    z5FusedDenseSqreluDenseFunc.forward.<locals>.<listcomp>r         Nr   squared_relu
activationsave_act_inputr   r   )torchis_autocast_enabledget_autocast_gpu_dtyper
   bfloat16
contiguousshapenumelfused_dense_cudalinear_bias_forwardreshaper   r   checkpoint_lvlsave_for_backward)ctxxweight1bias1weight2bias2r#   is_bf16batch_shapen	batch_dim	act_inputoutput1r   resultoutput2r   r	   r   forward   sN   



z!FusedDenseSqreluDenseFunc.forwardc                 C   sz  |  }| j}| j^}}}}}|jd d |jd }}	| }
|jtjk}|dkr/|\}}n1|dkr;|\}t|}n%|dkr`|rQt	
||
|	||}t|}nt||
|	||ddd\}}|r||
|jd }t	||\}}|| }t||}t	||
|	||\}}}n'||
|jd }t	||\}}t||d|d}t	||
|	||\}}}||||||d fS )	Nr   r   r   r   r   Tr   )r   r/   )r   r#   saved_tensorsr   r   r
   r   r   r   r    r!   r"   r   linear_bias_wgradr   linear_bias_backwardr   
reshape_as)r%   grad_outputr#   r&   r'   r(   r)   restr,   r-   r.   r+   r/   r0   grad_weight2
grad_bias2grad_output1grad_act_input
grad_inputgrad_weight1
grad_bias1r   r   r   backward@   sR   






z"FusedDenseSqreluDenseFunc.backwardN)r   )__name__
__module____qualname__staticmethodr   r3   r   rA   r   r   r   r   r      s    0r   c                       s4   e Zd Z							d fdd	Zdd Z  ZS )	FusedDenseSqreluDenseNTr   c	           
         s   |dv sJ ||d}	t    |p|}|p|d }|dks"J d|dks*J d|| _tj||fd|i|	| _tj||fd|i|	| _dS )z
        checkpoint_lvl (increasing lvl means slower but more memory saving):
            0: no recomputation in the bwd
            1: recompute gelu_out in the bwd
            2: recompute gelu_in and gelu_out in the bwd
        r   )devicer
      Tz?DenseSqreluDense module without bias is currently not supportedbiasN)super__init__r#   nnLinearfc1fc2)
selfin_featureshidden_featuresout_featuresr(   r*   r#   rG   r
   factory_kwargs	__class__r   r   rK   u   s   

zFusedDenseSqreluDense.__init__c                 C   s.   |j sJ t|| jj| jj| jj| jj| jS )N)is_cuda!fused_dense_sqrelu_dense_functionrN   weightrI   rO   r#   )rP   r&   r   r   r   r3      s   
zFusedDenseSqreluDense.forward)NNTTr   NN)rB   rC   rD   rK   r3   __classcell__r   r   rU   r   rF   t   s    rF   )fused_dense_libr    r   torch.nnrL   torch.nn.functional
functionalFflash_attn.utils.torchr   r   flash_attn.ops.activationsr   r   flash_attn.ops.triton.linearr   r   autogradFunctionr   applyrX   ModulerF   r   r   r   r   <module>   s   d