o
    i                     @   s   d dl mZmZ d dlZd dlZd dlmZ ejdej	dej	dej	dej	dej	dej	d	ej	fd
dZ
	 					ddejdejdejdeeejf deej dee dejfddZdS )    )OptionalUnionN
ROTARY_DIMIS_SEQLEN_OFFSETS_TENSOR	IS_VARLENINTERLEAVED	CONJUGATEBLOCK_HBLOCK_Mc           -   	   C   sd  t |}|d }tjdd}tjdd}tjdd}|s*|||  }| ||	  } nt|| }t|| d | }|||  }| ||
  } || |krPd S || td| }|| td| }|sk|| } n	|t||  } td|d }!|| d d d f | |!d d d f   }|| d d d f | |!d d d f   }| d d d f |k |!d d d f |k @ }"tj||"ddtj}#tj||"ddtj}$|r|$ }$|s||d d d d f | |d d d d f |  |!d d d d f |   }| |d d d d f | |d d d d f |
  |!d d d d f |   } |d d d d f |k |d d d d f |k @ |!d d d d f |k @ }%tj||%ddtj}&tj|||  |%ddtj}'|&|# |'|$  }(|&|$ |'|#  })tj| |(|%d tj| ||  |)|%d d S td|}*||d d d d f | |d d d d f |  |*d d d d f |   }| |d d d d f | |d d d d f |
  |*d d d d f |   } |d d d d f |k |d d d d f |k @ |*d d d d f |k @ }%tj||%ddtj}+t	t
|+|||d dg\}&}'|&|# |'|$  }(|&|$ |'|#  })t
t|(|)|||g},tj| |,|%d d S )	N   r   )axis   g      ?)maskotherg        )r   )tritonnext_power_of_2tl
program_idloadarangetofloat32storesplitreshapejoin)-OUTXCOSSIN
CU_SEQLENSSEQLEN_OFFSETSseqlennheads	seqlen_rostride_out_batchstride_out_seqlenstride_out_nheadsstride_out_headdimstride_x_batchstride_x_seqlenstride_x_nheadsstride_x_headdimr   r   r   r   r   r	   r
   BLOCK_KROTARY_DIM_HALFpid_headpid_m	pid_batch	start_idxrhrmrm_csrk_halfmask_cscossinr   x0x1o0o1rkxo rA   R/home/ubuntu/vllm_env/lib/python3.10/site-packages/flash_attn/ops/triton/rotary.pyrotary_kernel   s^   
 
(((FFB FFB"rC   Fr?   r8   r9   seqlen_offsets
cu_seqlens
max_seqlenreturnc	                    s"  |du}	|	s| j \ }
n|dusJ d| j \}}
|j d }|d  ||j \}}|j |j ks4J |d9 }||
ks@J d|
dksHJ d|ksPJ d	| | }}t|tjrw|j  fksgJ |jtjtjfv srJ | }n| |ksJ |st| n| }||
k r|s|d
|df 	| d
|df   fdd}|dkrdnd}tj
| jjO tjt| || ||||||	s|dnd|d|d|d|	s| dnd| d| d| d|t|tj|	|||dd W d   |S 1 s
w   Y  |S )a  
    Arguments:
        x: (batch, seqlen, nheads, headdim) if cu_seqlens is None
            else (total_seqlen, nheads, headdim).
        cos: (seqlen_ro, rotary_dim / 2)
        sin: (seqlen_ro, rotary_dim / 2)
        seqlen_offsets: integer or integer tensor of size (batch,)
        cu_seqlens: (batch + 1,) or None
        max_seqlen: int
    Returns:
        y: (batch, seqlen, nheads, headdim)
    Nz:If cu_seqlens is passed in, then max_seqlen must be passedr   r   r   zrotary_dim must be <= headdim   zOnly support headdim <= 256zseqlen_ro must be >= seqlen.c                    s"   t | d t | d  fS )Nr	   r
   )r   cdiv)METAbatchr#   r"   rA   rB   <lambda>   s   " zapply_rotary.<locals>.<lambda>         )r
   r	   )shape
contiguous
isinstancetorchTensordtypeint32int64
empty_likecopy_cudadeviceindexlibrarywrap_tritonrC   stride)r?   r8   r9   rD   rE   rF   interleavedinplace	conjugate	is_varlenheaddimtotal_seqlen	batch_p_1r$   
rotary_dimoutputgridr
   rA   rK   rB   apply_rotaryf   sp   


"

rn   )r   NNFFF)typingr   r   rW   r   triton.languagelanguager   jit	constexprrC   rX   intrn   rA   rA   rA   rB   <module>   sR   ]
