o
    i|                     @   sd  d dl Z d dlmZ d dlZd dlmZ d dlmZmZ d dl	m
Z
 zd dlmZmZmZmZmZ W n eyE   d\ZZd\ZZdZY nw z
d dlmZmZ W n ey]   d\ZZY nw zd dlmZ W n eyq   dZY nw d	d
 ZG dd dejZG dd dejZG dd dejZG dd dejZdd ZG dd dejZG dd dejZdS )    N)partial)	rearrangerepeat)get_dim_for_local_rank)flash_attn_kvpacked_funcflash_attn_qkvpacked_funcflash_attn_varlen_kvpacked_func flash_attn_varlen_qkvpacked_funcflash_attn_with_kvcacheNN)ColumnParallelLinearRowParallelLinear)RotaryEmbeddingc                 C   s\   dd }t |  r|| S dt t |  }||td| dd d d | |   S )Nc                    s6   ddt | d       fddt| D S )N      c                    s   g | ]} |  qS  r   ).0iratiostartr   L/home/ubuntu/vllm_env/lib/python3.10/site-packages/flash_attn/modules/mha.py
<listcomp>)   s    zCget_alibi_slopes.<locals>.get_slopes_power_of_2.<locals>.<listcomp>)mathlog2range)nheadsr   r   r   get_slopes_power_of_2&   s   z/get_alibi_slopes.<locals>.get_slopes_power_of_2r   r   )r   r   
is_integerfloorget_alibi_slopes)r   r   closest_power_of_2r   r   r   r    %   s    r    c                       s8   e Zd ZdZ						d
 fdd	Zddd	Z  ZS )FlashSelfAttention|  Implement the scaled dot product attention with softmax.
    Arguments
    ---------
        softmax_scale: The temperature to use for the softmax attention.
                      (default: 1/sqrt(d_keys) where d_keys is computed at
                      runtime)
        attention_dropout: The dropout rate to apply to the attention
                           (default: 0.0)
    FN        r&   c                    sb   t    td usJ dtd usJ d|| _|| _t|| _| j	d|dd || _
|| _d S NzFlashAttention is not installedalibi_slopesF)
persistent)super__init__r	   r   causalsoftmax_scalennDropoutdropregister_bufferwindow_sizedeterministic)selfr,   r-   attention_dropoutr2   r(   r3   	__class__r   r   r+   @      
	
zFlashSelfAttention.__init__c                 C   s   |j tjtjfv sJ |jsJ |du r| jn|}|du}| jdur*| jtj| _|rY|j tj	ks4J |dus:J t
|tsAJ t|||| jrL| jjnd| j|| j| j| jd	S t|| jrb| jjnd| j|| j| j| jdS )ao  Implements the multihead softmax attention.
        Arguments
        ---------
            qkv: The tensor containing the query, key, and value.
                If cu_seqlens is None and max_seqlen is None, then qkv has shape (B, S, 3, H, D).
                If cu_seqlens is not None and max_seqlen is not None, then qkv has shape
                (total, 3, H, D), where total is the sum of the sequence lengths in the batch.
            causal: if passed, will override self.causal
            cu_seqlens: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
                of the sequences in the batch, used to index into qkv.
            max_seqlen: int. Maximum sequence length in the batch.
        Returns:
        --------
            out: (total, H, D) if cu_seqlens is not None and max_seqlen is not None,
                else (B, S, H, D).
        Nr$   r-   r,   r(   r2   r3   )dtypetorchfloat16bfloat16is_cudar,   r(   tofloat32int32
isinstanceintr	   trainingr0   pr-   r2   r3   r   )r4   qkvr,   
cu_seqlens
max_seqlenunpaddedr   r   r   forwardS   s<   

zFlashSelfAttention.forward)FNr$   r%   NF)NNN__name__
__module____qualname____doc__r+   rJ   __classcell__r   r   r6   r   r"   5   s    r"   c                       sB   e Zd ZdZ						d
 fdd	Z					ddd	Z  ZS )FlashCrossAttentionr#   FNr$   r%   c                    sb   t    td usJ dtd usJ d|| _|| _t|| _| j	d|dd || _
|| _d S r'   )r*   r+   r   r   r,   r-   r.   r/   r0   r1   r2   r3   )r4   r,   r-   r5   r(   r2   r3   r6   r   r   r+      r8   zFlashCrossAttention.__init__c                 C   sh  |j tjtjfv sJ |jr|jsJ |du r| jn|}|du}| jdur-| jtj| _|rz|j tj	ks7J |dus=J t
|tsDJ |dusJJ |j tj	ksRJ |dusXJ t
|ts_J t||||||| jrm| jjnd| j|| j| j| jdS |jd |jd }	}
|jd }|jd |	kr|jd |jd ksJ t||| jr| jjnd|| j| j| j| jdS )	a  Implements the multihead softmax attention.
        Arguments
        ---------
            q: The tensor containing the query. (B, Sq, H, D)
            kv: The tensor containing the key and value. (B, Sk, 2, H_k, D)
            causal: if passed, will override self.causal
            cu_seqlens: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
                of the sequences in the batch, used to index into q.
            max_seqlen: int. Maximum sequence length in the batch of q.
            cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
                of the sequences in the batch, used to index into kv.
            max_seqlen_k: int. Maximum sequence length in the batch of k and v.
        Nr$   r9   r         r   )r,   r-   r(   r2   r3   )r:   r;   r<   r=   r>   r,   r(   r?   r@   rA   rB   rC   r   rD   r0   rE   r-   r2   r3   shaper   )r4   qkvr,   rG   rH   cu_seqlens_kmax_seqlen_krI   
batch_sizeseqlen_qseqlen_kr   r   r   rJ      sR   

&zFlashCrossAttention.forward)FNr$   Nr%   F)NNNNNrK   r   r   r6   r   rQ      s    rQ   c                       ,   e Zd ZdZd	 fdd	Zd
ddZ  ZS )SelfAttentionr#   FNr$   c                    &   t    || _|| _t|| _d S Nr*   r+   r,   r-   r.   r/   r0   r4   r,   r-   r5   r6   r   r   r+         
zSelfAttention.__init__c                 C   s  |j d |j d }}|du r| jn|}|jdd\}}}| jp)dt|j d  }	td|||	 }
|durQtj||fd	|
j	|
j
d
}||d |
t|d }
|rkttj||fd	|
j
dd}|
|j|
j	d }
tj|
d|j	d}| |}td||}|S )au  Implements the multihead softmax attention.
        Arguments
        ---------
            qkv: The tensor containing the query, key, and value. (B, S, 3, H, D)
            causal: if passed, will override self.causal
            key_padding_mask: boolean mask to apply to the attention weights. True means to keep,
                False means to mask out. (B, S)
        r   rR   Nr   dim      ?r&   bthd,bshd->bhts     r:   devicer$   b s -> b 1 1 sri   )r:   rd   r:   bhts,bshd->bthd)rT   r,   unbindr-   r   sqrtr;   einsumfullr:   ri   masked_fill_r   triur?   softmaxr0   )r4   rF   r,   key_padding_maskrY   seqlenrU   kvr-   scorespadding_maskcausal_mask	attentionattention_dropoutputr   r   r   rJ      s(   	
zSelfAttention.forwardFNr$   r   rK   r   r   r6   r   r]          
r]   c                       r\   )CrossAttentionr#   FNr$   c                    r^   r_   r`   ra   r6   r   r   r+   %  rb   zCrossAttention.__init__c                 C   s  |j d |j d }}|du r| jn|}|j d }|j d |kr*|j d |j d ks,J |j d |j d krEt|d|j d |j d  d}|jdd	\}}	| jpYd
t|j d  }
td|||
 }|durtj	||fd|j
|jd}||d |t|d }|rttj||jtjdd}tj||jtjd}|du r|nt|dd}||| | k}||d}tj|d|	j
d}| |}td||	}|S )a  Implements the multihead softmax attention.
        Arguments
        ---------
            q: The tensor containing the query. (B, Sq, H, D)
            kv: The tensor containing the key and value. (B, Sk, 2, H_k, D)
            causal: if passed, will override self.causal
            key_padding_mask: boolean mask to apply to the attention weights. True means to keep,
                False means to mask out. (B, Sk)
        r   rR   NrS   r   r   z... hkv d -> ... (hkv g) d)grc   re   r&   rf   rg   rh   r$   rj   ri   r:   zs -> s 1zb -> b 1 1 1rl   rm   )rT   r,   r   rn   r-   r   ro   r;   rp   rq   r:   ri   rr   r   arangelongsummasked_fillrt   r0   )r4   rU   rV   r,   ru   rY   rZ   r[   rw   rx   r-   ry   rz   row_idxcol_idxskr{   r|   r}   r~   r   r   r   rJ   +  s<   

&
zCrossAttention.forwardr   r   rK   r   r   r6   r   r     r   r   c           
   	   C   s   | j dd \}}||jvr$tj|j|jd||| j| jd}||j|< n|j| }|j}|| j d  }|j	}|| j d  }	||j d ksFJ |	|j d ksOJ |dusUJ | |||||	df< |||d|	df S )Ukv: (batch_size, seqlen, 2, nheads, head_dim) or (batch_size, 1, 2, nheads, head_dim)Nr   rh   r   rR   .)
rT   key_value_memory_dictr;   emptymax_batch_sizerH   r:   ri   batch_size_offsetseqlen_offset)
rV   inference_params	layer_idx	num_headshead_dimkv_cachebatch_start	batch_endsequence_startsequence_endr   r   r   _update_kv_cacheX  s,   
	
r   c                       s   e Zd ZdZ																					d	d fd
dZdddZdd Zdd Zdd Z						dddZ	  Z
S )MHA-Multi-head self-attention and cross-attentionNFTr$   r        @r%   returnc                    sR  ||d}t    || _|| _|	| _|
| _|| _|| _|| _|| _	|| _
|r7|s-J dtjt||d}nd}|dkrC|sCJ d|| _|durL|n|| _| j| j dks[J d| j| dksfJ d	| j| | _| j| jd
| j   }d
| j | j }| jdkr|rJ dtdusJ dt| j||||d| _|rtt||dnt}|rtt||dnt}| jstj||fd|i|| _ntj||fd|i|| _tj||fd|i|| _| jr
| j| jkrtj||dd
|d| _ntj||dd
|d| _tj||dd
|d| _ ||	||d| _!||	||d| _"tj||fd|i|| _#dS )aX  
        num_heads_kv: can be used to toggle MQA / GQA. If None, use num_heads.
        return_residual: whether to return the input x along with the output. This is for
            performance reason: for post-norm architecture, returning the input allows us
            to fuse the backward of nn.Linear with the residual connection.
        r   #ALiBi code path requires flash_attnrk   Nr%   >Local (sliding window) attention code path requires flash_attnr   +num_heads must be divisible by num_heads_kv(embed_dim must be divisible by num_headsr   z>MHA with rotary embedding does not support cross-attention yetrotary_emb is not installedbase
scale_baseinterleavedri   r(   r2   biasr   )kernel_sizepaddinggroupsr,   r-   r5   )$r*   r+   	embed_dim
cross_attnr,   r   dwconvrotary_emb_dimuse_flash_attnreturn_residualcheckpointingr;   tensorr    r   num_heads_kvr   r   
rotary_embr   r"   r]   rQ   r   r.   LinearWqkvWqWkvConv1d
dwconv_qkvdwconv_q	dwconv_kv
inner_attninner_cross_attnout_proj)r4   r   r   r   r   qkv_proj_biasout_proj_biasdropoutr-   r,   r   r   r   rotary_emb_baserotary_emb_scale_baserotary_emb_interleaved	use_alibir2   fused_bias_fcr   r   r   ri   r:   factory_kwargsr(   qkv_dimkv_diminner_attn_clsinner_cross_attn_clsr6   r   r   r+   x  s   
 





zMHA.__init__c              	   C   <   |d u r	| j jjn|}| j jj}tj||d| j| j||dS Nr   rh   )r   weightr:   ri   r;   r   r   r   r4   rY   rH   r:   ri   r   r   r   allocate_inference_cache     
zMHA.allocate_inference_cachec                 C   s.   | j rJ d| jdusJ dt||| jS )r   z&Generation does not support dwconv yetN0Generation requires layer_idx in the constructor)r   r   r   r4   rV   r   r   r   r   r     s   zMHA._update_kv_cachec                 C   B  |dur	|j dksJ | jsJ | jdkr5| jjdu sJ d| jj|j|j|jd | jj	| jj
}}nd\}}|jd }|j| j d| }|jdurT|jd| n|j }t| jdd}	t||dddddf |dddddf |dddddf |dddddf |||| jj| jj| jdkr| jjnd|	d	}
|
S 
z
        Fast path that combine 3 steps: apply rotary to Q and K, update kv cache, and apply attention.
        q: (batch_size, seqlen_q, nheads, head_dim)
        kv: (batch_size, seqlen_k, 2, nheads_kv, head_dim)
        Nr   z$This code path does not support xPosr   r   r(   rR   F)
rotary_cos
rotary_sincache_seqlensr-   r,   rotary_interleavedr(   r   r   r   r   scale_update_cos_sin_cacherH   ri   r:   _cos_cached_sin_cachedrT   r   r   lengths_per_samplegetattrr   r
   r-   r,   r   r4   rU   rV   r   r   r   batchr   r   r(   contextr   r   r   &_apply_rotary_update_kvcache_attention  >   



z*MHA._apply_rotary_update_kvcache_attentionc                 C   s   |j dkstdu s| js| ||}| ||S |jd }|j| j d| }|jdur3|jd| n|j }t	| jdd}t||dddddf |dddddf |dddddf |dddddf || jj
| jj|d	S z/Write kv to inference_params, then do attentionr   Nr(   rR   )r   r-   r,   r(   )r   r
   r   r   r   rT   r   r   r   r   r-   r,   )r4   rU   rV   r   r   r   r   r(   r   r   r   _update_kvcache_attention  s0   


zMHA._update_kvcache_attentionc                 K   sd  |dur!|dus
J |du sJ | j sJ | jrJ | jdks!J |dur6|du s+J |du s1J | j r6J |durO|du s@J |du rH|du sJJ | jrOJ | j rY||d|nd|i|}|du redn
|jdurm|jn|j}	|durw|jnd}
|jdd \}}| js<| j| j	kr<|du r|du sJ | 
|}| jrt| t|ddddf d	 }t|d
d| jd}|du s|jdks| jdks| jd dks| j s | jdkr| j||	|
d}|du r| js| j|fi |}n,tjjj| j|fi |}n| |dddddf |ddddddf |}n | |dddddf |ddddddf |}n| jr_| |du rI|n|dd|f }| |dur[|n|}n&| j| j	kshJ | 
|}|dd| j	| j f }|d| j	| j df }t|d| jd}t|dd| jd}| jrt| t|ddddf d	 }t| t|ddddf d	 }|du s|jdks| jdks| jd dks| j s| jdkr| j|||	|
d\}}|du r| js| j||fi |}ntjjj| j||fi |}n| |||}n| |||}| t|d}| js.|S ||fS )a  
        Arguments:
            x: (batch, seqlen, hidden_dim) (where hidden_dim = num heads * head dim) if
                cu_seqlens is None and max_seqlen is None, else (total, hidden_dim) where total
                is the is the sum of the sequence lengths in the batch.
            x_kv: (batch, seqlen, hidden_dim), only applicable for cross-attention. If None, use x.
            cu_seqlens: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
                of the sequences in the batch, used to index into x. Only applicable when using
                FlashAttention.
            max_seqlen: int. Maximum sequence length in the batch.
            key_padding_mask: boolean mask, True means to keep, False means to mask out.
                (batch, seqlen). Only applicable when not using FlashAttention.
            mixer_subset: for cross-attention only. If not None, will take a subset of x
                before applying the query projection. Useful for e.g., ViT where we only care
                about the CLS token in the last layer.
            inference_params: for generation. Adapted from Megatron-LM (and Apex)
            https://github.com/NVIDIA/apex/blob/3ff1a10f72ec07067c4e44759442329804ac5162/apex/transformer/testing/standalone_transformer_lm.py#L470
        Nr   )rG   rH   ru   r   zb s d -> b d s.r   zb d s -> b s dz ... (three h d) -> ... three h dr   threed   r   rH   rR   ... (h d) -> ... h dr    ... (two hkv d) -> ... two hkv dtwor   z... h d -> ... (h d))r   r   r   r   r   rH   rT   r   r   r   r   r   r   
contiguousr   r   r   r   r;   utils
checkpointr   r   r   r   r   r   r   r   r   )r4   xx_kvru   rG   rH   mixer_subsetr   kwargsr   rotary_max_seqlenr   rv   rF   r   rU   rV   outr   r   r   rJ   =  s   





	



..$



zMHA.forward)NFTTr$   NFNFr   r   NFFr%   FFFFNNr   Nr_   )NNNNNNrL   rM   rN   rO   r+   r   r   r   r   rJ   rP   r   r   r6   r   r   u  sJ    
k("r   c                       sv   e Zd ZdZ																		d	d fd
dZdddZdd Zdd Zdd ZdddZ	  Z
S )ParallelMHAr   NTr$   Fr   r   r%   r   c                    s6  ||d}t    || _|	| _|
| _|| _|| _|| _|| _|	 | _
tj|| _|| _| j| j dks:J d|d ur@|n|| _| j| j dksOJ dt| j| j
| j| _t| j| j
| j| _| j| | _| j| jd| j   }|r|s|J dt| j| j
 }tjt|| j| | jd |  |d}nd }|d	kr|sJ d
| jdkrtd usJ dt| j||||d| _td u std u rtdt|||f||| j| j| j d  d|| _|rtt ||dnt!}|rtt"||dnt#}||	||d| _$||	||d| _%t|||f||| jd|| _&d S )Nr   r   r   r   r   r   rR   rk   r%   r   r   r   zfused_dense is not installed)r   sequence_parallelmultiple_ofr   r   )'r*   r+   r   r,   r   r   r   r   process_groupsize
world_sizer;   distributedget_rank
local_rankr   r   r   num_heads_per_ranknum_heads_kv_per_rankr   r   ceilr   r    r   r   r   r   ImportErrorr   r   r"   r]   rQ   r   r   r   r   )r4   r   r   r  r   r   r   r   r-   r,   r   r   r   r   r   r   r2   r   r   r  ri   r:   r   r   num_heads_localr(   r   r   r6   r   r   r+     s   



zParallelMHA.__init__c              	   C   r   r   )r   r   r:   ri   r;   r   r  r   r   r   r   r   r   8  r   z$ParallelMHA.allocate_inference_cachec                 C   s    | j dus	J dt||| j S )r   Nr   )r   r   r   r   r   r   r   E  s   zParallelMHA._update_kv_cachec                 C   r   r   r   r   r   r   r   r   J  r   z2ParallelMHA._apply_rotary_update_kvcache_attentionc           	      C   s   |j dks| js| ||}| ||S |jd }|j| j d| }|jdur/|jd| n|j }t| jdd}t	||dddddf |dddddf |dddddf |dddddf || jj
| jj|d	}|S r   )r   r   r   r   rT   r   r   r   r   r
   r-   r,   )	r4   rU   rV   r   r   r   r   r(   r   r   r   r   r   r  s,   

z%ParallelMHA._update_kvcache_attentionc                 K   s  |  |}|durt|d|d}|du rdn
|jdur|jn|j}|dur(|jnd}| j| jkrt|dd| jd}|du sQ|jdksQ| jdksQ| jd dksQ| j	s| jdkr^| j
|||d	}|du r}| jso| j|fi |}ntjjj| j|fi |}n| |dddddf |ddddd
df |}n| |dddddf |ddddd
df |}nt|dd| j| j f d| jd}	t|d| j| j df dd| jd}
|du s|jdks| jdks| jd dks| j	s/| jdkr| j
|	|
||d	\}	}
|du r'| js| j|	|
fi |}ntjjj| j|	|
fi |}n| |	|
|}n| |	|
|}t|d}|durEt|d}| |}|S )ae  
        Arguments:
            x: (batch, seqlen, hidden_dim) (where hidden_dim = num heads * head dim) if seqlen=None.
                If seqlen is not None, x is (batch * seqlen, hidden_dim). This is so that when we
                split x during sequence parallel, we split the batch * seqlen dimension
                (in case batch is small).
        Nz(b s) ... -> b s ...)sr   z b s (three h d) -> b s three h dr   r   r   r   rR   .r   r   r   r   r   zb s h d -> b s (h d)zb s d -> (b s) d)r   r   r   r   rH   r   r   r   r   r   r   r   r   r;   r   r   r   r   r  r   r   )r4   r   rv   r   r   rF   r   r   r   rU   rV   r   r   r   r   rJ     s   

	

..






zParallelMHA.forward)NTTr$   NFNr   r   NFFr%   FFTNNr   r_   r   r  r   r   r6   r   r    s8    
r(r  ) r   	functoolsr   r;   torch.nnr.   einopsr   r   flash_attn.utils.distributedr   
flash_attnr   r   r   r	   r
   r  flash_attn.ops.fused_denser   r   flash_attn.layers.rotaryr   r    Moduler"   rQ   r]   r   r   r   r  r   r   r   r   <module>   sB    Pa4>  P