o
    im                     @   s  d dl mZ d dlmZ d dlZd dlZd dlmZ d dl	m  m
Z d dlmZ d dlmZ d dlmZmZ d dlmZmZmZmZ d dlmZmZmZmZmZ G d	d
 d
ejjZ				d,dededee de dee de fddZ!G dd dej"Z#G dd dej"Z$G dd dej"Z%G dd dejjZ&						 	 		d-dededed ee d!ee d"e'd#e de d$e(d%e(dee de fd&d'Z)G d(d) d)ej*Z+G d*d+ d+ej*Z,dS ).    )partial)OptionalN)Tensor)ProcessGroup)
custom_fwd
custom_bwd)gelu_bwdrelu_bwd
sqrelu_bwd
sqrelu_fwd)all_gather_raw
all_reduceall_reduce_rawreduce_scatterreduce_scatter_rawc                   @   s0   e Zd Zee	dddZeedd ZdS )	FusedDenseFuncFNTc                 C   s,  |j | _|| _|| _|| _t r|jt d}|	 }|dur-|r-t
||dd\}}n|}t rI|jt d}|durG|jt dnd}|	 }|durW|rW|  |jdd |jd }	}
|	 }t||
g|jR  dkrxtdt|||}| jr| || n| | |s|S ||fS )z
        If process_group is not None and sequence_parallel=True, we're doing Tensor Parallel
        with sequence parallelism: we do an all_gather_raw of x before doing the matmul.
        dtypeNTasync_op +fused_dense only supports matrix dims <= 2M)requires_gradcompute_weight_gradientreturn_residualprocess_groupsequence_paralleltorchis_autocast_enabledtoget_autocast_gpu_dtype
contiguousr   waitshapenumelminRuntimeErrorFlinearsave_for_backward)ctxxweightbiasr   r   r   total_xhandle_xbatch_shapen	batch_dimoutput r5   P/home/ubuntu/vllm_env/lib/python3.10/site-packages/flash_attn/ops/fused_dense.pyforward   s2   	
zFusedDenseFunc.forwardc                 G   s  |  }| jr|\}|  }| j}| j}| jr/| j\}}|d ur,|r,t||dd\}}	n	|}n| j\}d }|jd d }
|
 }|	||jd }| j
d r| jsZt|| }nt|	||jd ||}|j	g |
|jd R  }|d ur|r}tnt}|||dd\}}nd }| j
d r| jsJ |d ur|r|	  t|	||jd || j
d \}}nd }| j
d r|nd }|d ur| j
d r|  |||d d d fS )NTr   r   r         )r"   r   r   r   r   saved_tensorsr   r$   r%   reshapeneeds_input_gradr(   r)   tr   addmmr   r   r#   fused_dense_cudalinear_bias_wgrad)r+   grad_outputargs
grad_inputr   r   r,   r-   r/   r0   r1   r3   	reduce_fnhandle_grad_inputgrad_weight	grad_biasr5   r5   r6   backwardE   sP   




zFusedDenseFunc.backward)FNT__name__
__module____qualname__staticmethodr   r7   r   rH   r5   r5   r5   r6   r      s    'r   FTr,   r-   r.   r   r   r   c                 C   s   | j tjtjfv p| j tjkot }| jr,|jr,|d u s |jr,|r,t| |||||S |d u s2J t	
| ||}|s=|S || fS N)r   r   float16bfloat16float32r   is_cudar   applyr(   r)   )r,   r-   r.   r   r   r   dtype_eligibleoutr5   r5   r6   fused_dense_funcv   s   rV   c                       sF   e Zd Z				ddededededdf
 fd	d
ZdddZ  ZS )
FusedDenseTFNin_featuresout_featuresr.   r   returnc                    s    t  j|||||d || _d S )Nr.   devicer   )super__init__r   )selfrX   rY   r.   r   r\   r   	__class__r5   r6   r^      s   	
zFusedDense.__init__c                 C   s   t || j| j| j|dS )z
        If process_group is not None, we're doing Tensor Parallel with sequence parallelism:
        we do an all_gather of x before doing the matmul.
        )r   r   )rV   r-   r.   r   )r_   r,   r   r5   r5   r6   r7      s   zFusedDense.forward)TFNNrN   )rJ   rK   rL   intboolr^   r7   __classcell__r5   r5   r`   r6   rW      s"    rW   c                       F   e Zd Z					ddededededdf
 fd	d
Zdd Z  ZS )ColumnParallelLinearTr8   NrX   rY   r   r.   rZ   c	                    s   t j|}	|| rtd| d| || }
|
|	 }|
|	 }|tt j||k  }t j||| |||d || _|| _	d S )Nzout_features () must be a multiple of r[   )
r   distributedget_world_size
ValueErrorrb   get_rankr]   r^   r   r   )r_   rX   rY   r   r.   r   multiple_ofr\   r   
world_sizemultipledivmodlocal_multipler`   r5   r6   r^      s   
zColumnParallelLinear.__init__c                 C   s   t || j| j| j| jdS )N)r   r   )rV   r-   r.   r   r   )r_   r,   r5   r5   r6   r7      s   zColumnParallelLinear.forwardTTr8   NN	rJ   rK   rL   rb   r   rc   r^   r7   rd   r5   r5   r`   r6   rf      s$    
rf   c                       re   )RowParallelLinearTr8   NrX   rY   r   r.   rZ   c	                    s   t j|}	t j|}
|| rtd| d| || }||	 }||	 }|tt j||k  }t j|| ||o>|
dk||d || _|| _	d S )Nzin_features (rg   r   r[   )
r   rh   ri   rk   rj   rb   r]   r^   r   r   )r_   rX   rY   r   r.   r   rl   r\   r   rm   rankrn   ro   rp   rq   r`   r5   r6   r^      s"   

zRowParallelLinear.__init__c                 C   s*   t || j| j}| jrtnt}||| jS )z
        We're doing Tensor Parallel with sequence parallelism: we do the matmul and then
        a reduce_scatter of the result.
        )rV   r-   r.   r   r   r   r   )r_   r,   rU   rD   r5   r5   r6   r7      s   zRowParallelLinear.forwardrr   rs   r5   r5   r`   r6   rt      s$    
 rt   c                   @   s<   e Zd Zee							d
ddZeedd	 ZdS )FusedMLPFuncgelu_approxTFr   Nc                    s  d|
  krdksJ  J |dv sJ |dkr|
dksJ |s"d}	|	dv s(J || _ || _|| _|	| _|| _|
| _t rF|jt	 d}|
 }|durZ|rZt||d	d
\}}n|}t rt	   fdd||fD \}}|dur{|j dnd}|dur|j dnd}|
 }|dur|
 nd}|
 }|dur|
 nd}|dur|r|  |jdd |jd }}| }t||g|j|jR  dkrtd|
dkrt|||}|dkrttjddn|dkrtntj}tjd ||}W d   n	1 s	w   Y  n|dk}t||||||||
^}}|r*|d }t|||}|	dks@|	dkrJ|dkrJ| ||||| n|	dkrX| |||| n|	dkre| |||| |jg ||jd R  }|sw|S ||fS )a  
        If process_group is not None and sequence_parallel=True, we're doing Tensor Parallel
        with sequence parallelism: we do an all_gather of x before doing the matmul.
        If sequence_parallel=False, then the input is already gathered.

        checkpoint_lvl:
        0: no recomputation in the bwd
        1: recompute gelu_out / relu_out in the bwd
        2: recompute pre_act and gelu_out / relu_out in the bwd
        r      rw   relusqrelur{   r9   r   r8   r9   r   NTr   c                    s   g | ]}|j  d qS )r   )r    ).0ar   r5   r6   
<listcomp>.  s    z(FusedMLPFunc.forward.<locals>.<listcomp>r   r   rw   tanhapproximatefuser2r   r8   rz   )r   r   r   checkpoint_lvl
activation	heuristicr   r   r    r!   r"   r   r#   r$   r%   r&   r'   r(   r)   r   gelur   rz   jitfuserr?   linear_act_forwardr;   r*   )r+   r,   weight1bias1weight2bias2r   save_pre_actr   r   r   r   r   r/   r0   r1   r2   r3   pre_actactivation_fnoutput1is_gelurestoutput2r5   r   r6   r7      sv   



zFusedMLPFunc.forwardc                 G   sl  |  }| j}| j}|dkrttjddn|dkrtntj}| jr(|\}|  }| j	}| j
}| j^}	}
}}|d u s;|s=|	}|jd d }| }|dv r|d ur[|r[t|	|dd\}}|d	ksg|d
krl|dkrl|\}}ns|d
kr|\}tjd ||}W d    n1 sw   Y  nQ|dkr|\}|d ur|rt|	|\}}| jdkrt||
|}tjd ||}W d    n1 sw   Y  nt|||jd |
||dkd| j\}}|||jd }|||jd }|||jd }| jd rt||| jd \}}nd }| jd r|nd }| jdkrTt|| }|dkr.tn|dkr5tnt}tjd |||}W d    n	1 sNw   Y  nt||||dk| j\}}| jd sjd }| jd	 r| js}t||
 }nt|||jd ||
}|jg ||jd R  }|d ur|rtnt }|||dd\}}nd }| jdkr| jd
 r|d ur|r|dkr|!  t|||jd || jd \}}n8d }| jd r|nd }n+| jd
 r|d ur|r|dkr|!  t| |||jd  }nd }|d ur(| jd	 r(|!  |||||d d d d d d d fS )Nrw   r   r   r{   r   )r   r8   Tr   r   r8   rz   r   r9      rx   )"r"   r   r   r   r(   r   r   rz   r   r   r   r:   r$   r%   r   r   r   r   r   r)   r?   r   r;   r<   r@   r=   r   r
   r	   bias_act_linear_dgrad_bgradr>   r   r   r#   )r+   rA   rB   r   r   r   rC   r   r   r,   r   r   r   r/   r1   r3   r0   r   r   r   _grad_weight2
grad_bias2grad_output1activation_grad_fngrad_pre_act
grad_bias1rD   rE   grad_weight1r5   r5   r6   rH   [  s   



	



zFusedMLPFunc.backward)rw   TFr   r   NTrI   r5   r5   r5   r6   rv      s    _rv   rw   r   r   r   r   r   r   r   r   c                 C   s  |dv sJ | j tjtjfv p| j tjkot }| p*| jd |dkr&dnd dk}| jrV|jrV|jrV|d u s;|jrV|d u sB|jrV|rV|rVt	| |||||||||	|
|S |
d u s\J t
| ||}|dkrntt
jdd	ntt
jd
d}||}t
|||}|s|S || fS )Nry   r   rz         r   rw   r   r   T)inplace)r   r   rO   rP   rQ   r   r$   rR   rv   rS   r(   r)   r   r   rz   )r,   r   r   r   r   r   r   r   r   r   r   r   rT   dim_eligibler   r   r   r   r5   r5   r6   fused_mlp_func  sP   $r   c                       s<   e Zd Z										d fdd	Zdd	d
Z  ZS )FusedMLPNTrw   Fr   autoc                    s   |dv sJ |dv sJ |
|d}t    |p|}|p|d }|| _|| _|| _|dkr/|	nd| _tj||fd|i|| _tj||fd|i|| _	dS )	a  
        If process_group is not None, we're doing Tensor Parallel with sequence parallelism:
        we do an all_gather of x before doing the matmul, gelu, then matmul.
        Finally we do a reduce_scatter of the output.

        checkpoint_lvl (increasing lvl means slower but more memory saving):
            0: no recomputation in the bwd
            1: recompute gelu_out in the bwd
            2: recompute pre_act and gelu_out in the bwd
        heuristic:
            -1: don't fuse gemm + gelu (separate kernel)
            0..4: use this heuristic for the algo section in the fused gemm + gelu
            'auto': heuristic will be picked automatically:
                For CUDA >= 11.8, we set heuristic=0 for both fp16 and bf16 for best perf.
                For CUDA <= 11.7, we set heuristic=1 for fp16 and heuristic=-1 for bf16.
                For H100, we set heuristic=-1 for both fp16 and bf16 as the fused cuBlasLt implementation
                is slower than the unfused version.
        return_residual: whether to return the input x along with the output. This is for
            performance reason: for post-norm architecture, returning the input allows us
            to fuse the backward of nn.Linear with the residual connection.
        r|   ry   r\   r   rx   r{   r   r.   N)
r]   r^   r   r   r   r   nnLinearfc1fc2)r_   rX   hidden_featuresrY   r   r   r   r   r   r   r\   r   factory_kwargsr`   r5   r6   r^     s   #

zFusedMLP.__init__c                 C   s   t  s|jnt  }| jdkr?| jdkr<t jddkr d}n"tt	t
t jjd}|dkr2dn|t jkr9d	nd}nd}n| j}t|| jj| jj| jj| jj| j| j| j| j||d
}| jrd|\}}|d urmt||}| jsr|S ||fS )Nr   rw   cuda)	   r   r   .   r   r   r8   )r   r   r   r   r   r   )r   r   r   r!   r   r   r   get_device_capabilitytuplemaprb   versionsplitrO   r   r   r-   r   r.   trainingr   r   r   )r_   r,   r   r   r   cuda_verrU   r5   r5   r6   r7   D  s6   

 
zFusedMLP.forward)
NNTTrw   Fr   r   NNrN   )rJ   rK   rL   r^   r7   rd   r5   r5   r`   r6   r     s    0r   c                       sB   e Zd Z											ddef fddZd	d
 Z  ZS )ParallelFusedMLPNrw   Tr   r   r   c                    s   |	dv sJ |dv sJ |dusJ ||d}t    |p|}|p%|d }|| _|| _|| _|	| _|dkr8|
nd| _t|||fd|i|| _t	|||fd|i|| _
dS )	aT  
        process_group is required. We're doing Tensor Parallel with sequence parallelism:
        we do an all_gather of x before doing the matmul, gelu, then matmul.
        Finally we do a reduce_scatter of the output.

        checkpoint_lvl (increasing lvl means slower but more memory saving):
            0: no recomputation in the bwd
            1: recompute gelu_out in the bwd
            2: recompute pre_act and gelu_out in the bwd
        heuristic:
            -1: don't fuse gemm + gelu (separate kernel)
            0..4: use this heuristic for the algo section in the fused gemm + gelu
            'auto': heuristic will be picked automatically:
                For CUDA >= 11.8, we set heuristic=0 for both fp16 and bf16 for best perf.
                For CUDA <= 11.7, we set heuristic=1 for fp16 and heuristic=-1 for bf16.
        r|   ry   Nr   rx   r{   r   r.   )r]   r^   r   r   r   r   r   rf   r   rt   r   )r_   rX   r   rY   r   r   r   r   r   r   r   r\   r   r   r`   r5   r6   r^   f  s4   

zParallelFusedMLP.__init__c                 C   s   t  s|jnt  }| jdkr4| jdkr1tttt j	j
d}|dkr'dn|t jkr.dnd}nd}n| j}t|| jj| jj| jj| jj| j| j| j|| j| jd}| jrXtnt}||| jS )	Nr   rw   r   r   r   r8   r   )r   r   r   r   r   r   )r   r   r   r!   r   r   r   r   rb   r   r   r   rO   r   r   r-   r   r.   r   r   r   r   r   r   )r_   r,   r   r   r   rU   rD   r5   r5   r6   r7     s,   

 zParallelFusedMLP.forward)NNrw   NTTTr   r   NN)rJ   rK   rL   r   r^   r7   rd   r5   r5   r`   r6   r   e  s     2r   )NFNT)	NNrw   TFr   r   NT)-	functoolsr   typingr   fused_dense_libr?   r   torch.nnr   torch.nn.functional
functionalr(   r   torch.distributedr   flash_attn.utils.torchr   r   flash_attn.ops.activationsr   r	   r
   r   flash_attn.utils.distributedr   r   r   r   r   autogradFunctionr   rc   rV   r   rW   rf   rt   rv   strrb   r   Moduler   r   r5   r5   r5   r6   <module>   s   	^
(+ g	

8R