o
     i!                     @   s|  d dl Z d dlmZ d dlZd dlmZ d dlmZmZ d dl	m
Z
 zd dlmZmZmZmZmZ W n eyE   d\ZZd\ZZdZY nw zd dlmZmZmZ W n ey`   d\ZZZY nw zd d	lmZ W n eyt   dZY nw d
d ZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZdd Z G dd dejZ!G dd dejZ"dS )    N)partial)	rearrangerepeat)get_dim_for_local_rank)flash_attn_kvpacked_funcflash_attn_qkvpacked_funcflash_attn_varlen_kvpacked_func flash_attn_varlen_qkvpacked_funcflash_attn_with_kvcacheNN)ColumnParallelLinear
FusedDenseRowParallelLinearNNN)RotaryEmbeddingc                 C   s\   dd }t |  r|| S dt t |  }||td| dd d d | |   S )Nc                    s6   ddt | d       fddt| D S )N      c                    s   g | ]} |  qS  r   ).0iratiostartr   T/home/ubuntu/.local/lib/python3.10/site-packages/xformers/_flash_attn/modules/mha.py
<listcomp>)   s    zCget_alibi_slopes.<locals>.get_slopes_power_of_2.<locals>.<listcomp>)mathlog2range)nheadsr   r   r   get_slopes_power_of_2&   s   z/get_alibi_slopes.<locals>.get_slopes_power_of_2r   r   )r   r   
is_integerfloorget_alibi_slopes)r   r   closest_power_of_2r   r   r   r"   %   s    r"   c                       s8   e Zd ZdZ						d
 fdd	Zddd	Z  ZS )FlashSelfAttention|  Implement the scaled dot product attention with softmax.
    Arguments
    ---------
        softmax_scale: The temperature to use for the softmax attention.
                      (default: 1/sqrt(d_keys) where d_keys is computed at
                      runtime)
        attention_dropout: The dropout rate to apply to the attention
                           (default: 0.0)
    FN        r(   c                    sb   t    td usJ dtd usJ d|| _|| _t|| _| j	d|dd || _
|| _d S NzFlashAttention is not installedalibi_slopesF)
persistent)super__init__r	   r   causalsoftmax_scalennDropoutdropregister_bufferwindow_sizedeterministic)selfr.   r/   attention_dropoutr4   r*   r5   	__class__r   r   r-   @      
	
zFlashSelfAttention.__init__c                 C   s   |j tjtjfv sJ |jsJ |du r| jn|}|du}| jdur*| jtj| _|rY|j tj	ks4J |dus:J t
|tsAJ t|||| jrL| jjnd| j|| j| j| jd	S t|| jrb| jjnd| j|| j| j| jdS )ao  Implements the multihead softmax attention.
        Arguments
        ---------
            qkv: The tensor containing the query, key, and value.
                If cu_seqlens is None and max_seqlen is None, then qkv has shape (B, S, 3, H, D).
                If cu_seqlens is not None and max_seqlen is not None, then qkv has shape
                (total, 3, H, D), where total is the sum of the sequence lengths in the batch.
            causal: if passed, will override self.causal
            cu_seqlens: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
                of the sequences in the batch, used to index into qkv.
            max_seqlen: int. Maximum sequence length in the batch.
        Returns:
        --------
            out: (total, H, D) if cu_seqlens is not None and max_seqlen is not None,
                else (B, S, H, D).
        Nr&   r/   r.   r*   r4   r5   )dtypetorchfloat16bfloat16is_cudar.   r*   tofloat32int32
isinstanceintr	   trainingr2   pr/   r4   r5   r   )r6   qkvr.   
cu_seqlens
max_seqlenunpaddedr   r   r   forwardS   s<   

zFlashSelfAttention.forward)FNr&   r'   NFr   __name__
__module____qualname____doc__r-   rL   __classcell__r   r   r8   r   r$   5   s    r$   c                       sB   e Zd ZdZ						d
 fdd	Z					ddd	Z  ZS )FlashCrossAttentionr%   FNr&   r'   c                    sb   t    td usJ dtd usJ d|| _|| _t|| _| j	d|dd || _
|| _d S r)   )r,   r-   r   r   r.   r/   r0   r1   r2   r3   r4   r5   )r6   r.   r/   r7   r*   r4   r5   r8   r   r   r-      r:   zFlashCrossAttention.__init__c                 C   sh  |j tjtjfv sJ |jr|jsJ |du r| jn|}|du}| jdur-| jtj| _|rz|j tj	ks7J |dus=J t
|tsDJ |dusJJ |j tj	ksRJ |dusXJ t
|ts_J t||||||| jrm| jjnd| j|| j| j| jdS |jd |jd }	}
|jd }|jd |	kr|jd |jd ksJ t||| jr| jjnd|| j| j| j| jdS )	a  Implements the multihead softmax attention.
        Arguments
        ---------
            q: The tensor containing the query. (B, Sq, H, D)
            kv: The tensor containing the key and value. (B, Sk, 2, H_k, D)
            causal: if passed, will override self.causal
            cu_seqlens: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
                of the sequences in the batch, used to index into q.
            max_seqlen: int. Maximum sequence length in the batch of q.
            cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
                of the sequences in the batch, used to index into kv.
            max_seqlen_k: int. Maximum sequence length in the batch of k and v.
        Nr&   r;   r         r   )r.   r/   r*   r4   r5   )r<   r=   r>   r?   r@   r.   r*   rA   rB   rC   rD   rE   r   rF   r2   rG   r/   r4   r5   shaper   )r6   qkvr.   rI   rJ   cu_seqlens_kmax_seqlen_krK   
batch_sizeseqlen_qseqlen_kr   r   r   rL      sR   

&zFlashCrossAttention.forward)FNr&   Nr'   F)NNNNNrM   r   r   r8   r   rS      s    rS   c                       ,   e Zd ZdZd	 fdd	Zd
ddZ  ZS )SelfAttentionr%   FNr&   c                    &   t    || _|| _t|| _d S Nr,   r-   r.   r/   r0   r1   r2   r6   r.   r/   r7   r8   r   r   r-         
zSelfAttention.__init__c                 C   s  |j d |j d }}|du r| jn|}|jdd\}}}| jp)dt|j d  }	td|||	 }
|durQtj||fd	|
j	|
j
d
}||d |
t|d }
|rkttj||fd	|
j
dd}|
|j|
j	d }
tj|
d|j	d}| |}td||}|S )au  Implements the multihead softmax attention.
        Arguments
        ---------
            qkv: The tensor containing the query, key, and value. (B, S, 3, H, D)
            causal: if passed, will override self.causal
            key_padding_mask: boolean mask to apply to the attention weights. True means to keep,
                False means to mask out. (B, S)
        r   rT   Nr   dim      ?r(   bthd,bshd->bhts     r<   devicer&   b s -> b 1 1 srk   )r<   rf   r<   bhts,bshd->bthd)rV   r.   unbindr/   r   sqrtr=   einsumfullr<   rk   masked_fill_r   triurA   softmaxr2   )r6   rH   r.   key_padding_maskr[   seqlenrW   kvr/   scorespadding_maskcausal_mask	attentionattention_dropoutputr   r   r   rL      s(   	
zSelfAttention.forwardFNr&   r   rM   r   r   r8   r   r_          
r_   c                       r^   )CrossAttentionr%   FNr&   c                    r`   ra   rb   rc   r8   r   r   r-   %  rd   zCrossAttention.__init__c                 C   s  |j d |j d }}|du r| jn|}|j d }|j d |kr*|j d |j d ks,J |j d |j d krEt|d|j d |j d  d}|jdd	\}}	| jpYd
t|j d  }
td|||
 }|durtj	||fd|j
|jd}||d |t|d }|rttj||jtjdd}tj||jtjd}|du r|nt|dd}||| | k}||d}tj|d|	j
d}| |}td||	}|S )a  Implements the multihead softmax attention.
        Arguments
        ---------
            q: The tensor containing the query. (B, Sq, H, D)
            kv: The tensor containing the key and value. (B, Sk, 2, H_k, D)
            causal: if passed, will override self.causal
            key_padding_mask: boolean mask to apply to the attention weights. True means to keep,
                False means to mask out. (B, Sk)
        r   rT   NrU   r   r   z... hkv d -> ... (hkv g) d)gre   rg   r(   rh   ri   rj   r&   rl   rk   r<   zs -> s 1zb -> b 1 1 1rn   ro   )rV   r.   r   rp   r/   r   rq   r=   rr   rs   r<   rk   rt   r   arangelongsummasked_fillrv   r2   )r6   rW   rX   r.   rw   r[   r\   r]   ry   rz   r/   r{   r|   row_idxcol_idxskr}   r~   r   r   r   r   r   rL   +  s<   

&
zCrossAttention.forwardr   r   rM   r   r   r8   r   r     r   r   c                       s.   e Zd ZdZdejdejf fddZ  ZS )LinearResidualzQWrap nn.Linear to return the residual as well. For compatibility with FusedDense.inputreturnc                    s   t  ||fS ra   )r,   rL   )r6   r   r8   r   r   rL   [  s   zLinearResidual.forward)rN   rO   rP   rQ   r=   TensorrL   rR   r   r   r8   r   r   X  s    "r   c           
   	   C   s   | j dd \}}||jvr$tj|j|jd||| j| jd}||j|< n|j| }|j}|| j d  }|j	}|| j d  }	||j d ksFJ |	|j d ksOJ |dusUJ | |||||	df< |||d|	df S )Ukv: (batch_size, seqlen, 2, nheads, head_dim) or (batch_size, 1, 2, nheads, head_dim)Nr   rj   r   rT   .)
rV   key_value_memory_dictr=   emptymax_batch_sizerJ   r<   rk   batch_size_offsetseqlen_offset)
rX   inference_params	layer_idx	num_headshead_dimkv_cachebatch_start	batch_endsequence_startsequence_endr   r   r   _update_kv_cache_  s,   
	
r   c                       s   e Zd ZdZ																					d	d fd
dZdddZdd Zdd Zdd Z						dddZ	  Z
S )MHA-Multi-head self-attention and cross-attentionNFTr&   r        @r'   r   c           !         s  ||d}t    || _|| _|	| _|
| _|| _|| _|| _|| _	|| _
|r7|s-J dtjt||d}nd}|dkrC|sCJ d|| _|durL|n|| _| j| j dks[J d| j| dksfJ d	| j| | _| j| jd
| j   }d
| j | j }| jdkr|rJ dtdusJ dt| j||||d| _|rtdu rtd|stjnt}|stnttdd}| j	s|n|}|rtt||dnt}|rtt||dnt} | js|||fd|i|| _n|||fd|i|| _|||fd|i|| _| jr*| j| jkrtj ||dd
|d| _!ntj ||dd
|d| _"tj ||dd
|d| _#||	||d| _$| |	||d| _%|||fd|i|| _&dS )aX  
        num_heads_kv: can be used to toggle MQA / GQA. If None, use num_heads.
        return_residual: whether to return the input x along with the output. This is for
            performance reason: for post-norm architecture, returning the input allows us
            to fuse the backward of nn.Linear with the residual connection.
        r   #ALiBi code path requires flash_attnrm   Nr'   >Local (sliding window) attention code path requires flash_attnr   +num_heads must be divisible by num_heads_kv(embed_dim must be divisible by num_headsr   z>MHA with rotary embedding does not support cross-attention yetrotary_emb is not installedbase
scale_baseinterleavedrk   fused_dense is not installedT)return_residualr*   r4   biasr   )kernel_sizepaddinggroupsr.   r/   r7   )'r,   r-   	embed_dim
cross_attnr.   r   dwconvrotary_emb_dimuse_flash_attnr   checkpointingr=   tensorr"   r   num_heads_kvr   r   
rotary_embr   ImportErrorr0   Linearr   r   r$   r_   rS   r   WqkvWqWkvConv1d
dwconv_qkvdwconv_q	dwconv_kv
inner_attninner_cross_attnout_proj)!r6   r   r   r   r   qkv_proj_biasout_proj_biasdropoutr/   r.   r   r   r   rotary_emb_baserotary_emb_scale_baserotary_emb_interleaved	use_alibir4   fused_bias_fcr   r   r   rk   r<   factory_kwargsr*   qkv_dimkv_dim
linear_clslinear_resid_clswqkv_clsinner_attn_clsinner_cross_attn_clsr8   r   r   r-     s   
 




zMHA.__init__c              	   C   <   |d u r	| j jjn|}| j jj}tj||d| j| j||dS Nr   rj   )r   weightr<   rk   r=   r   r   r   r6   r[   rJ   r<   rk   r   r   r   allocate_inference_cache     
zMHA.allocate_inference_cachec                 C   s.   | j rJ d| jdusJ dt||| jS )r   z&Generation does not support dwconv yetN0Generation requires layer_idx in the constructor)r   r   r   r6   rX   r   r   r   r   r     s   zMHA._update_kv_cachec                 C   B  |dur	|j dksJ | jsJ | jdkr5| jjdu sJ d| jj|j|j|jd | jj	| jj
}}nd\}}|jd }|j| j d| }|jdurT|jd| n|j }t| jdd}	t||dddddf |dddddf |dddddf |dddddf |||| jj| jj| jdkr| jjnd|	d	}
|
S 
z
        Fast path that combine 3 steps: apply rotary to Q and K, update kv cache, and apply attention.
        q: (batch_size, seqlen_q, nheads, head_dim)
        kv: (batch_size, seqlen_k, 2, nheads_kv, head_dim)
        Nr   z$This code path does not support xPosr   r   r*   rT   F)
rotary_cos
rotary_sincache_seqlensr/   r.   rotary_interleavedr*   r   r   r   r   scale_update_cos_sin_cacherJ   rk   r<   _cos_cached_sin_cachedrV   r   r   lengths_per_samplegetattrr   r
   r/   r.   r   r6   rW   rX   r   r   r   batchr   r   r*   contextr   r   r   &_apply_rotary_update_kvcache_attention  >   



z*MHA._apply_rotary_update_kvcache_attentionc                 C   s   |j dkstdu s| js| ||}| ||S |jd }|j| j d| }|jdur3|jd| n|j }t	| jdd}t||dddddf |dddddf |dddddf |dddddf || jj
| jj|d	S z/Write kv to inference_params, then do attentionr   Nr*   rT   )r   r/   r.   r*   )r   r
   r   r   r   rV   r   r   r   r   r/   r.   )r6   rW   rX   r   r   r   r   r*   r   r   r   _update_kvcache_attention,  s0   


zMHA._update_kvcache_attentionc                 K   s  |dur!|dus
J |du sJ | j sJ | jrJ | jdks!J |dur6|du s+J |du s1J | j r6J |durO|du s@J |du rH|du sJJ | jrOJ | j rY||d|nd|i|}|du redn
|jdurm|jn|j}	|durw|jnd}
|jdd \}}| jsH| j| j	krH|du r|du sJ | j
s| |}n| |\}}| jrt| t|ddddf d	 }t|d
d| jd}|du s|jdks| jdks| jd dks| j s+| jdkr| j||	|
d}|du r| js| j|fi |}ndtjjj| j|fi |}nU| |dddddf |ddddddf |}n8| |dddddf |ddddddf |}n| jr| j
so| |du rY|n|dd|f }| |durk|n|}nY|dur|| |\}}n| |\}}| |du r|n|dd|f }n2| j| j	ksJ | j
s| |}n| |\}}|dd| j	| j f }|d| j	| j df }t|d| jd}t|dd| jd}| jrt| t|ddddf d	 }t| t|ddddf d	 }|du s |jdks | jdks | jd dks | j s\| jdkr1| j|||	|
d\}}|du rT| jsE| j||fi |}ntjjj| j||fi |}n| |||}n| |||}| t|d}| j
sq|S ||fS )a  
        Arguments:
            x: (batch, seqlen, hidden_dim) (where hidden_dim = num heads * head dim) if
                cu_seqlens is None and max_seqlen is None, else (total, hidden_dim) where total
                is the is the sum of the sequence lengths in the batch.
            x_kv: (batch, seqlen, hidden_dim), only applicable for cross-attention. If None, use x.
            cu_seqlens: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
                of the sequences in the batch, used to index into x. Only applicable when using
                FlashAttention.
            max_seqlen: int. Maximum sequence length in the batch.
            key_padding_mask: boolean mask, True means to keep, False means to mask out.
                (batch, seqlen). Only applicable when not using FlashAttention.
            mixer_subset: for cross-attention only. If not None, will take a subset of x
                before applying the query projection. Useful for e.g., ViT where we only care
                about the CLS token in the last layer.
            inference_params: for generation. Adapted from Megatron-LM (and Apex)
            https://github.com/NVIDIA/apex/blob/3ff1a10f72ec07067c4e44759442329804ac5162/apex/transformer/testing/standalone_transformer_lm.py#L470
        Nr   )rI   rJ   rw   r   zb s d -> b d s.r   zb d s -> b s dz ... (three h d) -> ... three h dr   threed   r   rJ   rT   ... (h d) -> ... h dr    ... (two hkv d) -> ... two hkv dtwor   z... h d -> ... (h d))r   r   r   r   r   rJ   rV   r   r   r   r   r   r   r   
contiguousr   r   r   r   r=   utils
checkpointr   r   r   r   r   r   r   r   )r6   xx_kvrw   rI   rJ   mixer_subsetr   kwargsr   rotary_max_seqlenr   rx   rH   r   rW   rX   outr   r   r   rL   K  s   





	


..$
&


zMHA.forward)NFTTr&   NFNFr   r   NFFr'   FFFFNNr   Nra   )NNNNNNrN   rO   rP   rQ   r-   r   r   r   r   rL   rR   r   r   r8   r   r   |  sJ    
r("r   c                       sv   e Zd ZdZ																		d	d fd
dZdddZdd Zdd Zdd ZdddZ	  Z
S )ParallelMHAr   NTr&   Fr   r   r'   r   c                    s6  ||d}t    || _|	| _|
| _|| _|| _|| _|| _|	 | _
tj|| _|| _| j| j dks:J d|d ur@|n|| _| j| j dksOJ dt| j| j
| j| _t| j| j
| j| _| j| | _| j| jd| j   }|r|s|J dt| j| j
 }tjt|| j| | jd |  |d}nd }|d	kr|sJ d
| jdkrtd usJ dt| j||||d| _td u std u rtdt|||f||| j| j| j d  d|| _|rtt ||dnt!}|rtt"||dnt#}||	||d| _$||	||d| _%t|||f||| jd|| _&d S )Nr   r   r   r   r   r   rT   rm   r'   r   r   r   r   )r   sequence_parallelmultiple_ofr   r   )'r,   r-   r   r.   r   r   r   r   process_groupsize
world_sizer=   distributedget_rank
local_rankr   r   r   num_heads_per_ranknum_heads_kv_per_rankr   r   ceilr   r"   r   r   r   r   r   r   r   r$   r_   rS   r   r   r   r   )r6   r   r   r  r   r   r   r   r/   r.   r   r   r   r   r   r   r4   r   r   r  rk   r<   r   r   num_heads_localr*   r   r   r8   r   r   r-     s   



zParallelMHA.__init__c              	   C   r   r   )r   r   r<   rk   r=   r   r  r   r   r   r   r   r   S  r   z$ParallelMHA.allocate_inference_cachec                 C   s    | j dus	J dt||| j S )r   Nr   )r   r   r   r   r   r   r   `  s   zParallelMHA._update_kv_cachec                 C   r   r   r   r   r   r   r   r   e  r   z2ParallelMHA._apply_rotary_update_kvcache_attentionc           	      C   s   |j dks| js| ||}| ||S |jd }|j| j d| }|jdur/|jd| n|j }t| jdd}t	||dddddf |dddddf |dddddf |dddddf || jj
| jj|d	}|S r   )r   r   r   r   rV   r   r   r   r   r
   r/   r.   )	r6   rW   rX   r   r   r   r   r*   r   r   r   r   r     s,   

z%ParallelMHA._update_kvcache_attentionc                 K   s  |  |}|durt|d|d}|du rdn
|jdur|jn|j}|dur(|jnd}| j| jkrt|dd| jd}|du sQ|jdksQ| jdksQ| jd dksQ| j	s| jdkr^| j
|||d	}|du r}| jso| j|fi |}ntjjj| j|fi |}n| |dddddf |ddddd
df |}n| |dddddf |ddddd
df |}nt|dd| j| j f d| jd}	t|d| j| j df dd| jd}
|du s|jdks| jdks| jd dks| j	s/| jdkr| j
|	|
||d	\}	}
|du r'| js| j|	|
fi |}ntjjj| j|	|
fi |}n| |	|
|}n| |	|
|}t|d}|durEt|d}| |}|S )ae  
        Arguments:
            x: (batch, seqlen, hidden_dim) (where hidden_dim = num heads * head dim) if seqlen=None.
                If seqlen is not None, x is (batch * seqlen, hidden_dim). This is so that when we
                split x during sequence parallel, we split the batch * seqlen dimension
                (in case batch is small).
        Nz(b s) ... -> b s ...)sr   z b s (three h d) -> b s three h dr   r   r   r   rT   .r   r   r   r   r   zb s h d -> b s (h d)zb s d -> (b s) d)r   r   r   r   rJ   r   r   r   r   r   r   r   r   r=   r  r  r   r   r  r   r   )r6   r  rx   r   r  rH   r   r  r   rW   rX   r	  r   r   r   rL     s   

	

..






zParallelMHA.forward)NTTr&   NFNr   r   NFFr'   FFTNNr
  ra   r   r  r   r   r8   r   r    s8    
r(r  )#r   	functoolsr   r=   torch.nnr0   einopsr   r   flash_attn.utils.distributedr   
flash_attnr   r   r   r	   r
   r   flash_attn.ops.fused_denser   r   r   flash_attn.layers.rotaryr   r"   Moduler$   rS   r_   r   r   r   r   r   r  r   r   r   r   <module>   sD    Pa4>  d