o
    iW                     @   s  d dl Z d dlZd dlmZ d$ddZ		d%ddZ	d&d	d
Z		d%ddZ	d&ddZ		d%ddZ		d&ddZ
G dd dejjZG dd dejjZG dd dejjZdd Z					d'ddZ					 			d(ddZ			d)d d!ZG d"d# d#ejjZdS )*    N)init   c                 C   s   |   | dkr
| S |  S )zBAssume that x already has last dim divisible by alignment_in_bytesr   )data_ptrclone)xalignment_in_bytes r   O/home/ubuntu/vllm_env/lib/python3.10/site-packages/flash_attn/ops/layer_norm.pymaybe_align	   s   r
   Fc
                 C   s   |  }
| d|
f}|dur|d|
fnd}|dur!|dnd}t||||||dd||ddd||	\}}}}}||durB|n||||fS )<Assume that arguments are contiguous and aligned to 16 bytesN      ?r   numelviewdropout_layer_normdropout_add_ln_fwd)x0residualgammabetarowscalecolscale	dropout_pepsilonresidual_in_fp32is_rms_normhidden_sizex0matresidualmatzmatxmatdmaskmursigmar   r   r	   _dropout_add_layer_norm_forward   s,   r%   c                 C   s   |  }|d|f}| |j}|dur||jnd}|dur(|d|fnd}|dur3|dnd}|	durA|dusAJ dt||||||||||	dd|
dd||^}}}}}}}|	du rg||||fS |d }|||||fS )Assume that arguments are contiguous and aligned to 16 bytes
    dx == None means that it was a post-norm architecture
    (x = drop(x0) + residual was not returned in the fwd).
    x0 must not be None if we have colscale.
    r   N2x0 is required to compute the gradient of colscaler   r   r   r   shaper   dropout_add_ln_bwd)dzdxr   r   r"   r#   r$   r   r   r   r   has_residualr   r   r!   dzmatdxmatr   dx0matdresidualmatdgammadbeta_rest	dcolscaler   r   r	    _dropout_add_layer_norm_backward7   s>   r7   c                 C   s   |  }| d|f}|dur|d|fnd}|dur!|dnd}|dur,|dnd}t||||d||||||	|
d||\}}}}}||durM|n||||fS r   r   Nr   )r   r   r   r   r   	x0_subset
out_subsetr   r   rowscale_constout_numrowsr   r   r   r   r   r    r!   r"   r#   r$   r   r   r	   &_dropout_add_layer_norm_subset_forwardn   s.   r=   c                 C   s   |  }|d|f}| d|}|dur||jnd}|dur(|d|fnd}|	dur3|	dnd}	|
dur>|
dnd}
|durL|dusLJ dt||||||||d||	|
|||||^}}}}}}}|du rr||||fS |d }|||||fS )r&   r   Nr'   r   r(   )r+   r,   r   r   r"   r#   r$   r   r   r9   r:   r   r;   
x0_numrowsr-   r   r   r!   r.   r/   r   r0   r1   r2   r3   r4   r5   r6   r   r   r	   '_dropout_add_layer_norm_subset_backward   s@   r?   c                 C   s   |  }| d|f}|dur|d|fnd}|dur#|d|fnd}t|||||||||d|	|
\}}}}}}}|||durD|n|||||fS r8   )r   r   r   $dropout_add_ln_parallel_residual_fwd)r   x1r   gamma0beta0gamma1beta1r   r   r   r   r   r   x1matr   z0matz1matr!   dmask0dmask1r#   r$   r   r   r	   1_dropout_add_layer_norm_parallel_residual_forward   s6   	rK   c                 C   s   |  }|d|f}| |j}|dur||jnd}|dur'||jnd}t||||||||||	|
|||^}}}}}}}}|||||||fS )zAssume that arguments are contiguous and aligned to 16 bytes
    dx == None means that it was a post-norm architecture
    (x = drop(x0) + residual was not returned in the fwd).
    r   N)r   r   r)   r   $dropout_add_ln_parallel_residual_bwd)dz0dz1r,   r   rI   rJ   r#   r$   rB   rD   r   has_x1r-   r   r   r!   dz0matdz1matr/   r0   dx1matr1   dgamma0dbeta0dgamma1dbeta1r5   r   r   r	   2_dropout_add_layer_norm_parallel_residual_backward  s>   
rW   c                   @   .   e Zd Ze				dddZedd ZdS )DropoutAddLayerNormFnFc                 C   s  t | d}|d urt | dnd }t | d}|d ur&t | dnd }|d ur3t | dnd }|d ur@t | dnd }t|||||||||	|
\}}}}}|d urZ|nd }| ||j||||||| |
| _|| _|d u| _|| _	|d u| _
|s|
s||jS ||j||jfS |dkr||jn
tj|jtj|jd}| | |
s||j|fS ||j||j|fS Nr           dtypedevice)r
   
contiguousr%   save_for_backwardr   r)   prenormr   r-   r   has_betatorchonesuint8r^   mark_non_differentiable)ctxr   r   r   r   r   r   r   r   r   ra   r   return_dmaskr    r!   r"   r#   r$   x0_savedr   r   r	   forward8  sT   


zDropoutAddLayerNormFn.forwardc                 G   s   t | d}| jrt |d  dnd }| j\}}}}}}	}
}| j}| j}t|||||||	||
|||| j^}}}}}||j	}|d urL||j	nd }|d urV|d nd }|||| j
r`|nd d |d d d d d d fS Nr   r   )r
   r_   ra   saved_tensorsr   r-   r7   r   r   r)   rb   )rg   r+   argsr,   r   r   r"   r   r#   r$   r   r   r   r-   r0   r1   r2   r3   r5   dx0	dresidualr6   r   r   r	   backwardu  sH   zDropoutAddLayerNormFn.backwardNFFFF__name__
__module____qualname__staticmethodrj   rp   r   r   r   r	   rY   7  s    <rY   c                   @   rX   )DropoutAddLayerNormSubsetFnFc                 C   s  t | d}|d urt | dnd }t | d}|d ur&t | dnd }|d ur3t | dnd }t|||||||||	|
|||\}}}}}|d urP|nd }dg|jdd  R }| ||||||||||	 || _|| _|
| _|jd d 	 | _
|d u| _|| _|d u| _dg|jdd  R }|s|s||S ||||jfS ||}|dkr||jn
tj|jtj|jd}| | |s||fS ||||fS )Nr   r      r[   r\   )r
   r_   r=   r)   r`   r   ra   r   r;   r   r>   r-   r   rb   rc   rd   re   r^   rf   )rg   r   r   r   r   r   r9   r:   r   r   r;   r<   r   ra   r   rh   r    r!   r"   r#   r$   ri   x_shapez_shapezr   r   r	   rj     sT   

$

z#DropoutAddLayerNormSubsetFn.forwardc                 G   s   t | d}| jrt |d  dnd }| j\	}}}}}}	}
}}| j}| j}t|||||||	||
|||| j| j|| j	^}}}}}|j
dg|jdd  R  }|d urZ|
|jnd }|
d urd|d nd }|||| jrn|nd |d d d d d d d d d d fS )Nr   r   r   rx   )r
   r_   ra   rl   r   r-   r?   r;   r>   r   r   r)   rb   )rg   r+   rm   r,   r   r   r"   r   r#   r$   r   r9   r:   r   r-   r0   r1   r2   r3   r5   rn   ro   r6   r   r   r	   rp     sT   z$DropoutAddLayerNormSubsetFn.backwardNrq   rr   r   r   r   r	   rw     s    @rw   c                   @   rX   )%DropoutAddLayerNormParallelResidualFnFc                 C   s   t | d}|d urt | dnd }|d urt | dnd }t | d}|d ur3t | dnd }|d ur@t | dnd }|d urMt | dnd }t|||||||||	|
|\}}}}}}}| ||j|||||| || _|| _|d u| _|d u| _	|| _
|d u| _||j|d ur||jnd f}|s|s|S g |||jR S |dkr||jn
tj|jtj|jd}|dkr|d ur||jn
tj|jtj|jd}| | | | |sg |||R S g |||j||R S rZ   )r
   r_   rK   r`   r   r)   ra   r   rO   r-   r   rb   rc   rd   re   r^   rf   )rg   r   rA   r   rB   rC   rD   rE   r   r   r   ra   r   rh   rG   rH   r!   rI   rJ   r#   r$   r{   r   r   r	   rj     sh   	


$

z-DropoutAddLayerNormParallelResidualFn.forwardc                 G   s
  t | d}|d urt | dnd }| jr t |d  dnd }| j\}}}}}	}
}| j}| j}| j}t|||||||
|||	|||| j\}}}}}}}|	|j
}|d ur^|	|j
nd }|d urj|	|j
nd }||||| jru|nd || jr||nd d d d d d d fS rk   )r
   r_   ra   rl   r   rO   r-   rW   r   r   r)   rb   )rg   rM   rN   rm   r,   r   rI   rJ   rB   rD   r#   r$   r   rO   r-   r0   rR   r1   rS   rT   rU   rV   rn   dx1ro   r   r   r	   rp   \  s`   	z.DropoutAddLayerNormParallelResidualFn.backwardNrq   rr   r   r   r   r	   r|     s    Gr|   c                 C   s   t | d ||d d d|d	S )Nr[   FrY   apply)r   weightbiasr   r   r   r	   
layer_norm  s   r   c                 C   s    t | ||||||||	|d|
S zmresidual_in_fp32 only has an effect if residual is None.
    Otherwise residual dtype is residual.dtype.
    Fr~   )r   r   r   r   r   r   r   
layerscalera   r   return_dropout_maskr   r   r	   dropout_add_layer_norm  s   r   r   c                 C   s&   t | |||||||||	|
||d|S r   )rw   r   )r   r   r   r   r   r   r   r9   r:   r;   r<   ra   r   r   r   r   r	   dropout_add_layer_norm_subset  s"   r   c                 C   s"   t | |||||||||
|	d|S r   )r|   r   )r   rA   r   weight0bias0weight1bias1r   r   ra   r   r   r   r   r	   (dropout_add_layer_norm_parallel_residual  s   r   c                       s<   e Zd Z						d fdd	Zdd Zdd	d
Z  ZS )DropoutAddLayerNormFr[   h㈵>Nc           	         sp   ||d}t    || _|| _|| _|| _tjtj	|fi || _
tjtj	|fi || _|   d S )N)r^   r]   )super__init__ra   pepsr   rc   nn	Parameteremptyr   r   reset_parameters)	selfr   ra   r   r   r   r^   r]   factory_kwargs	__class__r   r	   r     s   


zDropoutAddLayerNorm.__init__c                 C   s   t | j t | j d S N)r   ones_r   zeros_r   )r   r   r   r	   r     s   z$DropoutAddLayerNorm.reset_parametersc              
   C   s.   t ||| j| j| jr| jnd| j| j| jdS )Nr[   )ra   r   )r   r   r   trainingr   r   ra   r   )r   r   r   r   r   r	   rj     s   zDropoutAddLayerNorm.forward)Fr[   r   FNNr   )rs   rt   ru   r   r   rj   __classcell__r   r   r   r	   r     s    r   )r   )FF)F)NNFFF)NNNr   r   FFF)FFF)r   rc   torch.nnr   r
   r%   r7   r=   r?   rK   rW   autogradFunctionrY   rw   r|   r   r   r   r   r   Moduler   r   r   r   r	   <module>   sV   

4
C
;
E
;
6is~
'
0
"