o
    ۾i!"                     @   s   d dl mZmZ d dlZd dlZd dlmZ ejdej	dej	dej	dej	dej	dej	fd	d
Z
	 					ddejdejdejdeeejf deej dee dejfddZdS )    )OptionalUnionNBLOCK_KIS_SEQLEN_OFFSETS_TENSOR	IS_VARLENINTERLEAVED	CONJUGATEBLOCK_Mc           -      C   s  t jdd}t jdd}t jdd}|d }|s-|||  ||  }| ||	  ||  } n&t || }t || d | }|||  ||  }| ||
  ||  } || |kr[d S || t d| }|sl|| }n	|t ||  }t d|}t d|d }|s||d d d f | |d d d f |   }||d d d f | |d d d f   }||d d d f | |d d d f   }t j||d d d f |k |d d d f |k @ ddt j} t j||d d d f |k |d d d f |k @ ddt j}!t j||d d d f |k |d d d f |k @ ddt j}"t j|||  |d d d f |k |d d d f |k @ ddt j}#|rF|! }!|"|  |#|!  }$|"|! |#|   }%| |d d d f |
 |d d d f |   } t j| |$|d d d f |k |d d d f |k @ d t j| ||  |%|d d d f |k |d d d f |k @ d d S ||d d d  d }&t d|d }'||d d d f | |d d d f |   }(||d d d f | |&d d d f |   })||d d d f | |'d d d f   }||d d d f | |'d d d f   }t j||d d d f |k |'d d d f |k @ ddt j} t j||d d d f |k |'d d d f |k @ ddt j}!t j|(|d d d f |k |d d d f |k @ ddt j}"t j|)|d d d f |k |&d d d f |k @ ddt j}#|r|! }!|"|  }*|#|! }+t |d d d f d dk|*|+ |*|+ },| |d d d f |
 |d d d f |   } t j| |,|d d d f |k |d d d f |k @ d d S )	Nr   )axis      g      ?)maskotherg        )r   )tl
program_idloadarangetofloat32storewhere)-OUTXCOSSIN
CU_SEQLENSSEQLEN_OFFSETSseqlen
rotary_dim	seqlen_rostride_out_batchstride_out_seqlenstride_out_nheadsstride_out_headdimstride_x_batchstride_x_seqlenstride_x_nheadsstride_x_headdimr   r   r   r   r   r	   pid_mpid_head	pid_batchrotary_dim_half	start_idxrmrm_csrkrk_halfcossinx0x1o0o1rk_swap	rk_repeatX0X1x0_cosx1_sinout r>   Z/home/ubuntu/.local/lib/python3.10/site-packages/vllm/vllm_flash_attn/ops/triton/rotary.pyrotary_kernel   s   
,((***
&,4
&
,,((&&4**,8r@   Fxr1   r2   seqlen_offsets
cu_seqlens
max_seqlenreturnc	                    s  |du}	|	s| j \ }
n|dusJ d| j \}}
|j d }|d  ||j \}}|j |j ks4J |d9 }||
ks@J d|
dksHJ d|ksPJ d	|j|jksbJ d
|j d|j | j|jkstJ d| j d|j | | }}t|tjr|j  fksJ |jtjtjfv sJ | }n| |ksJ |st| n| }||
k r|s|d|df 	| d|df  |dkrdn|dkrdn|dkrdnd} fdd}|rdn|dkrdnd}tj
| jjT t| || |||||||	s|dnd|d|d|d|	s| dnd| d| d| d|t|tj|	||||dkr;dndd W d   |S 1 sKw   Y  |S )a  
    Arguments:
        x: (batch, seqlen, nheads, headdim) if cu_seqlens is None
            else (total_seqlen, nheads, headdim).
        cos: (seqlen_ro, rotary_dim / 2)
        sin: (seqlen_ro, rotary_dim / 2)
        seqlen_offsets: integer or integer tensor of size (batch,)
        cu_seqlens: (batch + 1,) or None
        max_seqlen: int
    Returns:
        y: (batch, seqlen, nheads, headdim)
    Nz:If cu_seqlens is passed in, then max_seqlen must be passedr   r   r   zrotary_dim must be <= headdim   zOnly support headdim <= 256zseqlen_ro must be >= seqlenz*cos and sin must have the same dtype, got z and z0Input and cos/sin must have the same dtype, got .    @      c                    s   t | d  fS )Nr	   )tritoncdiv)METAbatchnheadsr   r>   r?   <lambda>   s    zapply_rotary.<locals>.<lambda>      )	num_warps)shapedtype
contiguous
isinstancetorchTensorint32int64
empty_likecopy_cudadeviceindexr@   stride)rA   r1   r2   rB   rC   rD   interleavedinplace	conjugate	is_varlenheaddimtotal_seqlen	batch_p_1r   r   outputr   gridr	   r>   rM   r?   apply_rotary   s   


"

rn   )r   NNFFF)typingr   r   r[   rJ   triton.languagelanguager   jit	constexprr@   r\   intrn   r>   r>   r>   r?   <module>   sN   }
