o
    iG                     @   st  d dl Z d dlmZ d dlmZmZmZ d dlZd dlmZ d dl	m
Z
mZ d dlmZ d dd	Zd d
dZG dd dejjZ			 		d!deeef dee dee fddZeZ						 	d"deeef dee fddZG dd dejjZ				 	d#deeejf dee fddZG dd dejjZejZ		 d$deeejf fddZG dd dejjZdS )%    N)partial)OptionalTupleUnion)Tensor)	rearrangerepeat)apply_rotaryFc                 C   sn   |s| j ddd\}}tj| |fddS | dd d df | ddd df }}ttj| |fdddddS )N   dim.   z... d two -> ... (d two))two)chunktorchcatr   stack)xinterleavedx1x2 r   N/home/ubuntu/vllm_env/lib/python3.10/site-packages/flash_attn/layers/rotary.pyrotate_half   s
   &r   c                 C   s   |j d d }|| j d ksJ t||sdnd}t||sdnd}tj| dd|f | t| dd|f ||  | d|df gddS )z
    x: (batch_size, seqlen, nheads, headdim)
    cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2)
    r   r
   z... d -> ... 1 (2 d)z... d -> ... 1 (d 2).Nr   )shaper   r   r   r   )r   cossinr   ro_dimr   r   r   apply_rotary_emb_torch   s   <r   c                	   @   sN   e Zd Ze					ddeeef dee dee fddZed	d
 Z	dS )ApplyRotaryEmbFr   Nseqlen_offsets
cu_seqlens
max_seqlenc	           
   
   C   sl   t ||||||||d}	t|tr| ||| || _n| |||| d | _|| _|| _|| _|s4|	S |S )N)r!   r"   r#   r   inplace)r	   
isinstanceintsave_for_backwardr!   r   r$   r#   )
ctxr   r   r   r   r$   r!   r"   r#   outr   r   r   forward'   s&   

zApplyRotaryEmb.forwardc                 C   s^   | j }|d u r| j\}}}}n| j\}}}t|||||| j| j| jdd	}|d d d d d d d fS )NT)r!   r"   r#   r   r$   	conjugate)r!   saved_tensorsr	   r#   r   r$   )r(   dor!   r   r   r"   dxr   r   r   backwardH   s    zApplyRotaryEmb.backwardFFr   NN)
__name__
__module____qualname__staticmethodr   r&   r   r   r*   r/   r   r   r   r   r    &   s     
	 r    r!   r"   r#   c              
   C      t | |||||||S )aM  
    Arguments:
        x: (batch_size, seqlen, nheads, headdim) if cu_seqlens is None
            else (total_seqlen, nheads, headdim)
        cos, sin: (seqlen_rotary, rotary_dim / 2)
        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
            of 1st half and 2nd half (GPT-NeoX style).
        inplace: if True, apply rotary embedding in-place.
        seqlen_offsets: (batch_size,) or int. Each sequence in x is shifted by this amount.
            Most commonly used in inference when we have KV cache.
        cu_seqlens: (batch + 1,) or None
        max_seqlen: int
    Return:
        out: (batch_size, seqlen, nheads, headdim) if cu_seqlens is None
            else (total_seqlen, nheads, headdim)
    rotary_dim must be <= headdim
    Apply rotary embedding to the first rotary_dim of x.
    )r    apply)r   r   r   r   r$   r!   r"   r#   r   r   r   apply_rotary_emb]   s   r7   num_heads_qc
                 C   s  t t||||d}
|d u r|d u r|  r|  dkrC| j\}}}}}|dks)J | d d d d d df ||d|}|
|||}n9|  dksKJ |	d usQJ | jd |	 d }| jd |	d|  ksgJ | d d d d d |	| f }|
|||}|s|  dkrtjt|ddd| d d d d dd f gdd	} | S tj|| d d d d |	| d f gdd	} | S |d u r|n|}|d u r|n|}|  dkr| j\}}}}}|dksJ | d d d d d
f | d d d d df }}nC|  dksJ |	d usJ | jd |	 d }| jd |	d|  ksJ | d d d d d |	f | d d d d |	|	| f }}|
|||}|
|||}|sx|  dkratj	||| d d d d df gdd	} | S tj||| d d d d |	| d f gdd	} | S )N)r   r$   r+   r!         r
   r      zb s (t h) d -> b s t h d)tr   r   r   )
r   r	   is_contiguousr   r   reshaper   r   r   r   )qkvr   r   cos_ksin_kr   r$   r+   r!   r8   apply_rotary_fnbatchseqlenthreenheadsheaddimqknum_heads_kqkr   r   r   _apply_rotary_emb_qkv   sX   &2,0:&.rL   c                   @   sH   e Zd Ze					d
deeejf dee fddZ	edd	 Z
dS )ApplyRotaryEmbQKV_NFr   r!   r8   c	           	      C   sd   t ||||||d||d	}t|tr| |||| || _n| ||||| d | _|| _|| _|S )NT)r   r$   r!   r8   )rL   r%   r&   r'   r!   r   r8   )	r(   r?   r   r   r@   rA   r   r!   r8   r   r   r   r*      s   
zApplyRotaryEmbQKV_.forwardc                 C   sb   | j }|d u r| j\}}}}}n| j\}}}}t|||||| jd|| jdd
}|d d d d d d d fS )NT)r   r$   r!   r8   r+   )r!   r,   rL   r   r8   )r(   dqkvr!   r   r   r@   rA   r   r   r   r/      s   zApplyRotaryEmbQKV_.backwardNNFr   N)r1   r2   r3   r4   r   r&   r   r   r   r*   r/   r   r   r   r   rM      s    	rM   c              
   C   r5   )a  
    Arguments:
        qkv: (batch_size, seqlen, 3, nheads, headdim) or (batch_size, seqlen, num_heads_q + 2 * num_heads_k, headdim).
            If qkv has shape (batch_size, seqlen, num_heads_q + 2 * num_heads_k, headdim) (e.g. MQA / GQA),
            then num_heads_q must be provided.
        cos, sin: (seqlen, rotary_dim / 2)
        cos_k, sin_k: (seqlen, rotary_dim / 2), optional
        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead of
            1st half and 2nd half (GPT-NeoX style).
        seqlen_offsets: (batch_size,) or int. Each sequence in Q and K is shifted by this amount.
            Most commonly used in inference when we have KV cache.
    Return:
        qkv: (batch_size, seqlen, 3, nheads, headdim) or (batch_size, seqlen, num_heads_q + 2 * num_heads_k, headdim)
    rotary_dim must be <= headdim
    Apply rotary embedding *inplace* to the first rotary_dim of Q and K.
    )rM   r6   )r?   r   r   r@   rA   r   r!   r8   r   r   r   apply_rotary_emb_qkv_   s   rP   c                   @   s6   e Zd Zed	deeejf fddZedd Z	dS )
ApplyRotaryEmbKV_Fr   r!   c                 C   s   |j \}}}}	}
|dksJ |d d d d df }t|||||dd t|tr2| || || _n
| ||| d | _|| _|S )Nr
   r   T)r!   r   r$   )r   r	   r%   r&   r'   r!   r   )r(   kvr   r   r   r!   rC   rD   r   rF   rG   rK   r   r   r   r*     s   
zApplyRotaryEmbKV_.forwardc              	   C   s^   | j }|d u r| j\}}}n| j\}}t|d d d d df |||| jddd |d d d d fS )Nr   T)r!   r   r$   r+   )r!   r,   r	   r   )r(   dkvr!   r   r   r   r   r   r/     s   
	zApplyRotaryEmbKV_.backwardNFr   )
r1   r2   r3   r4   r   r&   r   r   r*   r/   r   r   r   r   rQ     s
    rQ   c                 C   s   t | ||||S )aR  
    Arguments:
        kv: (batch_size, seqlen, 2, nheads, headdim)
        cos, sin: (seqlen, rotary_dim / 2)
        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead of
            1st half and 2nd half (GPT-NeoX style).
        seqlen_offsets: (batch_size,) or int. Each sequence in Q and K is shifted by this amount.
            Most commonly used in inference when we have KV cache.
    Return:
        kv: (batch_size, seqlen, 2, nheads, headdim)
    rotary_dim must be <= headdim
    Apply rotary embedding *inplace* to the first rotary_dim of K.
    )rQ   r6   )rR   r   r   r   r!   r   r   r   apply_rotary_emb_kv_4  s   rU   c                       s   e Zd ZdZ				ddef fddZddd	Zdd
dZ				ddej	de
ej	 deeej	f de
e de
e deej	eej	ej	f f fddZ  ZS )RotaryEmbeddinga5  
    The rotary position embeddings from RoFormer_ (Su et. al).
    A crucial insight from the method is that the query and keys are
    transformed by rotation matrices which depend on the relative positions.

    Other implementations are available in the Rotary Transformer repo_ and in
    GPT-NeoX_, GPT-NeoX was an inspiration

    .. _RoFormer: https://arxiv.org/abs/2104.09864
    .. _repo: https://github.com/ZhuiyiTechnology/roformer
    .. _GPT-NeoX: https://github.com/EleutherAI/gpt-neox

    If scale_base is not None, this implements XPos (Sun et al., https://arxiv.org/abs/2212.10554).
    A recommended value for scale_base is 512: https://github.com/HazyResearch/flash-attention/issues/96
    Reference: https://github.com/sunyt32/torchscale/blob/main/torchscale/component/xpos_relative_position.py
         @FNr   c                    s   t    || _t|| _| |}| jd|dd || _|| _|dur7t	j
d|d|t	jdd|  d	|  nd}| jd
|dd d| _d| _d| _d| _d| _dS )z
        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
            of 1st half and 2nd half (GPT-NeoX style).
        inv_freqF)
persistentNr   r
   devicedtypeg?gffffff?scale)super__init__r   floatbase_compute_inv_freqregister_bufferr   
scale_baser   arangefloat32_seq_len_cached_cos_cached_sin_cached_cos_k_cached_sin_k_cached)selfr   ra   r   rd   r[   rX   r]   	__class__r   r   r_   ]  s"   


&
zRotaryEmbedding.__init__c              	   C   s(   d| j tjd| jd|tjd| j   S )Ng      ?r   r
   rZ   )ra   r   re   r   rf   )rl   r[   r   r   r   rb   ~  s
   z!RotaryEmbedding._compute_inv_freqc           	      C   sX  || j ks| jd u s| jj|ks| jj|ks| jr| j r|| _ tj||tjd}| j	jtjkr8| j
|d}n| j	}t||}| jd u rZt||| _t||| _d S tj|| jj| jjd|d  | j }| jj|jdt|d }t|| || _t|| || _t|| || _t|| || _d S d S d S )NrZ   )r[   )r\   r[   r
   zs -> s 1)rg   rh   r[   r\   trainingis_inferencer   re   rf   rX   rb   outerr]   r   tor   ri   rd   r   rj   rk   )	rl   rD   r[   r\   r<   rX   freqspowerr]   r   r   r   _update_cos_sin_cache  s8   


z%RotaryEmbedding._update_cos_sin_cacher   r?   rR   seqlen_offsetr#   r8   returnc              
   C   s   |j d }|dur| j||j|jd nt|tr%| j|| |j|jd |du rHt|| j| j| j	dur7| j
nd| j	dur@| jnd| j||dS |}t|| j| j| jd|d}t|| j	du ra| jn| j
| j	du rk| jn| j| j|d}||fS )a*  
        qkv: (batch, seqlen, 3, nheads, headdim) or (batch, seqlen, num_heads_q + 2 * num_heads_k, headdim)
            if kv is none, else it's just q of shape (batch, seqlen, nheads, headdim).
            If qkv has shape (batch, seqlen, num_heads_q + 2 * num_heads_k, headdim) (e.g. MQA / GQA),
            then num_heads_q must be provided.
        kv: (batch, seqlen, 2, nheads, headdim)
        seqlen_offset: (batch_size,) or int. Each sequence in x is shifted by this amount.
            Most commonly used in inference when we have KV cache.
            If it's a tensor of shape (batch_size,), then to update the cos / sin cache, one
            should pass in max_seqlen, which will update the cos / sin cache up to that length.
        Apply rotary embedding *inplace* to qkv and / or kv.
        r   NrZ   )r   r!   r8   T)r   r$   r!   )r   r!   )r   ru   r[   r\   r%   r&   rP   rh   ri   r]   rj   rk   r   apply_rotary_emb_funcrU   )rl   r?   rR   rv   r#   r8   rD   rJ   r   r   r   r*     sB   

zRotaryEmbedding.forward)rW   FNN)N)NN)Nr   NN)r1   r2   r3   __doc__r&   r_   rb   ru   r   r   r   r   r   r*   __classcell__r   r   rm   r   rV   K  s8    
!
,rV   )Fr0   )NNFFFr   NrO   rT   )math	functoolsr   typingr   r   r   r   r   einopsr   r   flash_attn.ops.triton.rotaryr	   r   r   autogradFunctionr    r&   r7   rx   rL   rM   rP   rQ   r6   rU   nnModulerV   r   r   r   r   <module>   sl   

	;

"
	

@.
&
