o
    پi                     @   s   d dl mZmZmZ d dlZd dlmZmZ dejdejdejde	dejf
d	d
Z
G dd dejjZG dd deZG dd deZG dd dZdedededejdedefddZdS )    )OptionalTupleUnionN)FusedSetKVBufferArg%apply_rope_with_cos_sin_cache_inplacexcossinis_neox_stylereturnc                 C   s   | d| j}| d| j}|rtj| ddd\}}n| ddddf }| ddddf }|| ||  }|| ||  }|rLtj||fddS tj||fdddS )z
    Args:
        x: [num_tokens, num_heads, head_size]
        cos: [num_tokens, head_size // 2]
        sin: [num_tokens, head_size // 2]
        is_neox_style: Whether to use the Neox-style or GPT-J-style rotary
            positional embeddings.
       dim.N   )	unsqueezetodtypetorchchunkcatstackflatten)r   r   r	   r
   x1x2o1o2 r   W/home/ubuntu/.local/lib/python3.10/site-packages/sgl_kernel/testing/rotary_embedding.py_apply_rotary_emb   s   r    c                       s   e Zd Zdedededededejddf fd	d
Zdeee	f dej
fddZdej
fddZ		ddej
dej
dej
deej
 dee deej
ej
f fddZ  ZS )RotaryEmbedding	head_size
rotary_dimmax_position_embeddingsbaser
   r   r   Nc                    sN   t    || _|| _|| _|| _|| _|| _|  }|  | j	d|dd d S )Ncos_sin_cacheF)
persistent)
super__init__r"   r#   r$   r%   r
   r   _compute_cos_sin_cacheregister_buffer)selfr"   r#   r$   r%   r
   r   cache	__class__r   r   r)   '   s   
	zRotaryEmbedding.__init__c                 C   s(   d|t jd| jdt jd| j   }|S )Ng      ?r   r   r   )r   aranger#   float)r,   r%   inv_freqr   r   r   _compute_inv_freq<   s   z!RotaryEmbedding._compute_inv_freqc                 C   sR   |  | j}tj| jtjd}td||}| }| }tj	||fdd}|S )zCompute the cos and sin cache.r0   z	i,j -> ijr   r   )
r4   r%   r   r1   r$   r2   einsumr   r	   r   )r,   r3   tfreqsr   r	   r-   r   r   r   r*   E   s   z&RotaryEmbedding._compute_cos_sin_cache	positionsquerykeyoffsetsfused_set_kv_buffer_argc                 C   s6  |du sJ d|dur|| }|  }|jd }| jd|}|jddd\}}	|j}
||d| j}|dd| jf }|d| jdf }t|||	| j	}t
j||fdd|
}|j}||d| j}|dd| jf }|d| jdf }t|||	| j	}t
j||fdd|}|| j}|| j}||fS )z-A PyTorch-native implementation of forward().NzBfused_set_kv_buffer_arg is not supported for native implementationr   r   r   r   .)r   shaper&   index_selectr   viewr"   r#   r    r
   r   r   reshaper   r   )r,   r8   r9   r:   r;   r<   
num_tokenscos_sinr   r	   query_shape	query_rot
query_pass	key_shapekey_rotkey_passr   r   r   forward_nativeP   s0   


zRotaryEmbedding.forward_nativeNN)__name__
__module____qualname__intboolr   r   r)   r   r2   Tensorr4   r*   r   r   r   rI   __classcell__r   r   r.   r   r!   %   sB    	r!   c                   @   P   e Zd Z		d
dejdejdejdeej dee deejejf fdd	ZdS )FlashInferRotaryEmbeddingNr8   r9   r:   r;   r<   r   c              	   C   s$   t ||||| j| j| jd ||fS )N)r8   r9   r:   r<   r"   r&   is_neox)r   r"   r&   r
   r,   r8   r9   r:   r;   r<   r   r   r   forward_cuda{   s   	
z&FlashInferRotaryEmbedding.forward_cudarJ   	rK   rL   rM   r   rP   r   r   r   rV   r   r   r   r   rS   z        rS   c                   @   rR   )SglKernelRotaryEmbeddingNr8   r9   r:   r;   r<   r   c                 C   sT   |d u sJ d| j j|jkr| j |j| _ tjj|||| j| j | j ||fS )NzFfused_set_kv_buffer_arg is not supported for sgl-kernel implementation)	r&   r   r   r   ops
sgl_kernelrotary_embeddingr"   r
   rU   r   r   r   rV      s   
	z%SglKernelRotaryEmbedding.forward_cudarJ   rW   r   r   r   r   rY      rX   rY   c                   @   sF   e Zd ZdZdedefddZdd Zdejd	ejd
ejfddZ	dS )MHATokenToKVPooli @  head_numhead_dimc                 C   s@   || _ || _tj| _d| _tj| _d| _	d| _
d| _|   d S )Nr   cudar   )r^   r_   r]   KV_POOL_SIZEsize	page_sizer   bfloat16store_dtypedevice	layer_numstart_layer_create_buffers)r,   r^   r_   r   r   r   r)      s   zMHATokenToKVPool.__init__c                    s8    fddt  jD  _ fddt  jD  _d S )Nc                    2   g | ]}t j j j  j jf j jd qS r   rf   r   zerosrb   rc   r^   r_   re   rf   .0_r,   r   r   
<listcomp>       z4MHATokenToKVPool._create_buffers.<locals>.<listcomp>c                    rj   rk   rm   ro   rr   r   r   rs      rt   )rangerg   k_bufferv_bufferrr   r   rr   r   ri      s   

z MHATokenToKVPool._create_buffersloccache_kcache_vc                 C   s0   d}|| j || j  |< || j|| j  |< d S )Nr   )rv   rh   rw   )r,   rx   ry   rz   layer_idr   r   r   set_kv_buffer   s   zMHATokenToKVPool.set_kv_bufferN)
rK   rL   rM   ra   rN   r)   ri   r   rP   r|   r   r   r   r   r]      s    
r]   r"   
batch_sizeseq_lenr   num_q_headsnum_kv_headsc                 C   s   t j||d|}t j|| ||  ||d}t j|| ||  ||d}	t j|| ||  ||d}
t jtjt j|dd ||   }t	|||	|
|dS )N)rf   rl   )pos_idsr9   r:   valueout_cache_loc)
r   r1   repeatrandnrandpermr]   ra   int64clonedict)r"   r}   r~   rf   r   r   r   r   r9   r:   r   r   r   r   r   create_inputs   s(   	


r   )typingr   r   r   r   r[   r   r   rP   rO   r    nnModuler!   rS   rY   r]   rN   r   r   r   r   r   r   <module>   s>    
U/