o
    iC                     @   s   d dl mZ d dlmZ d dlZd dlmZ d dlm  mZ	 d dlm
Z
 d dlmZ d dlmZ d dlmZ z
d dlmZmZ W n eyN   d	\ZZY nw G d
d dejZG dd dejZdS )    )partial)OptionalN)Tensor)StochasticDepth)MHA)Mlp)layer_norm_fnRMSNormNNc                       sf   e Zd Zddejejddddddddddf fdd	ZdddZ			dd	ed
e	e fddZ
  ZS )BlockNT        Fc                    s  t    || _|| _|| _|| _| jr| jsJ d|du r'tt|d d}|du r3ttd| d}||| _	||| _
t|	dd| _||| _||| _t| jtjsf||| _t|
dd| _||| _| jrtdusqJ d	t| jtjtfrt| j
tjsJ |r| j D ]}d
|_qt| dr| j D ]}d
|_q|r| j D ]}d
|_qt| dr| j D ]	}d
|_qdS dS dS )a  
        For prenorm=True, this Block has a slightly different structure compared to a regular
        prenorm Transformer block.
        The standard block is: LN -> MHA -> Dropout -> Add -> LN -> MLP -> Dropout -> Add.
        [Ref: https://arxiv.org/abs/2002.04745]
        Here we have: Dropout -> Add -> LN -> MHA -> Dropout -> Add -> LN -> MLP, returning both
        the hidden_states (output of the MLP) and the residual.
        This is for performance reasons, as we can fuse the dropout, add and LayerNorm.
        The residual needs to be provided (except for the very first block).

        For prenorm=False, this Block has the same structure as a regular postnorm Transformer
        block: MHA -> Dropout -> Add -> LN -> MLP -> Dropout -> Add -> LN.

        return_residual: whether each of the sub-layers (mixer and mlp) will return the residual.
        This is for performance reason: for post-norm architecture, returning the input allows us
        to fuse the backward of nn.Linear with the residual connection.
        z5residual_in_fp32 is only compatible with prenorm=TrueN@   	num_heads   hidden_featuresrow)modeTriton is not installedTnorm2)super__init__prenormfused_dropout_add_lnreturn_residualresidual_in_fp32r   r   r   mixerdropout1r   
drop_path1norm1mlp
isinstancennIdentitydropout2
drop_path2r   r   	LayerNormr	   Dropout
parameters_sequence_parallelhasattr_shared_params)selfdim	mixer_clsmlp_clsnorm_clsdropout_clsr   resid_dropout1resid_dropout2r   r&   r   r   r   sequence_parallelmark_shared_paramsp	__class__ N/home/ubuntu/vllm_env/lib/python3.10/site-packages/flash_attn/modules/block.pyr      sP   
#







zBlock.__init__c                 K      | j j||fd|i|S Ndtyper   allocate_inference_cacher-   
batch_size
max_seqlenr>   kwargsr:   r:   r;   r@   i      zBlock.allocate_inference_cachehidden_statesresidualc           
      C   s  | j r| js/| | |}|dur|| n|}| |j| jjjd}| jr.|t	j
}n@| jjdks8| js;d}n| t	j|jdd |j|jd}t|| jj| jj|| jj| jr`| jjnd|d| jt| jtd
\}}|du rui }|dur}||d	< | j|fi |}|dur|dd|f }t| jtjs
| js| | |}|dur|| n|}| |j| jjjd}| jr|t	j
}n@| jjdks| jsd}n| t	j|jdd |j|jd}t|| jj| jj|| jj| jr| jjnd|d| jt| jtd
\}}| |}||fS |du sJ | j|fi |dur!|ni }| jr-|\}}| jsF| | | || j| jjjd}n?| jjdksQ| jsTd}n| t	j|jdd |j|jd}t|| jj| jj|| jj| jrz| jjnd|d
t| jtd	}t| jtjs| |}	| jr|	\}	}| js| | | |	| j| jjjd}|S | jjdks| jsd}n| t	j|	jdd |	j|	jd}t|	| jj| jj|| jj| jr| jjnd|d
t| jtd	}|S )a  Pass the input through the encoder layer.

        Args:
            hidden_states: the sequence to the encoder layer (required).
            residual: if postnorm, residual=None, If prenorm, hidden_states = Attn/MLP(LN(residual))
            mixer_subset: for cross-attention only. If not None, will take a subset of x
                before applying the query projection. Useful for e.g., ViT where we only care
                about the CLS token in the last layer.
        Nr>   r   )devicer>   r   T)rG   eps	dropout_prowscaler   r   is_rms_normmixer_subsetF)rG   rK   rL   rM   r   rN   )r   r   r   r   r    toweightr>   r   torchfloat32r7   trainingonesshaperJ   r   biasrK   r"   r	   r   r!   r#   r$   r&   r%   r   r   )
r-   rF   rG   rO   mixer_kwargsdropped	rowscale1	rowscale2	mixer_outmlp_outr:   r:   r;   forwardl   s  







zBlock.forwardNNNN)__name__
__module____qualname__r#   r'   r(   r   r@   r   r   r^   __classcell__r:   r:   r8   r;   r      s2    
Sr   c                       sl   e Zd ZdZddejejdddddddf fdd	ZdddZ			dd	e	d
e
e	 de
e	 fddZ  ZS )ParallelBlockziThe attention (mixer) and MLP blocks are done in parallel, similar to GPT-J, GPT-NeoX,
    and PaLM.
    Nr   Fc                    sH  t    || _|	| _|
| _|du rtt|d d}|du r&ttd| d}||| _||| _	||| _
||| _||| _| jsG||| _| jrdtdusRJ dt| j
tjtfrbt| j	tjsdJ |r| j
 D ]}d|_qkt| dr| j D ]}d|_q{|r| j
 D ]}d|_qt| dr| j D ]	}d|_qdS dS dS )	a  
        This Block has a slightly different structure compared to a regular
        prenorm Transformer block.
        The standard block is: LN -> MHA / MLP -> Dropout -> Add.
        [Ref: https://arxiv.org/abs/2002.04745]
        Here we have: Dropout -> Add -> LN -> MHA / MLP, returning both
        the hidden_states (output1 of the MHA / MLP) and the residual.
        This is for performance reasons, as we can fuse the dropout, add and LayerNorm.
        The residual needs to be provided (except for the very first block).
        Nr   r   r   r   r   Tr   )r   r   	tied_normr   r   r   r   r   r   r   r    r!   r%   r   r   r"   r#   r'   r	   r(   r)   r*   r+   r,   )r-   r.   r/   r0   r1   r2   r3   r4   rf   r   r   r5   r6   r7   r8   r:   r;   r     sF   








zParallelBlock.__init__c                 K   r<   r=   r?   rA   r:   r:   r;   r@   L  rE   z&ParallelBlock.allocate_inference_cachehidden_states1hidden_states2rG   c           
      C   sP  | j sQ| |}|dur | |}|dur|| | n|| }n
|dur(|| n|}| |j| jjjd}| jsE| |j| jjjdn|}| j	rP|t
j}n>| js\| jj| jjfnd\}}t|| jj| jj||||| jj| jrv| jjndd| j	t| jtd^}}	}| jr|}n|	\}|du ri }| j|fi |}| |}|||fS )a  Pass the input through the encoder layer.

        Args:
            hidden_states1: the output of the previous attention (mixer) or embedding layer.
            hidden_states2: the output of the previous MLP layer (if None, will use hidden_states1).
            residual.
        NrH   r
   r   T)	rG   x1weight1bias1rK   rL   r   r   rN   )r   r   r%   r    rP   rQ   r>   rf   r   r   rR   rS   rW   r   rK   rT   r7   r"   r	   r   r!   )
r-   rg   rh   rG   rX   dropped1dropped2weight2bias2restr:   r:   r;   r^   O  sR   




zParallelBlock.forwardr_   r`   )ra   rb   rc   __doc__r#   r'   r(   r   r@   r   r   r^   rd   r:   r:   r8   r;   re     s2    
Dre   )	functoolsr   typingr   rR   torch.nnr#   torch.nn.functional
functionalFr   torchvision.opsr   flash_attn.modules.mhar   flash_attn.modules.mlpr    flash_attn.ops.triton.layer_normr   r	   ImportErrorModuler   re   r:   r:   r:   r;   <module>   s"    o