o
     i                     @   s   d dl mZmZmZmZmZ d dlZd dlmZm	Z	 d dl
mZ d dlmZmZmZmZ e	G dd deZG dd	 d	eZG d
d deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZdS )    )AnyIterableListOptionalTupleN)get_operatorregister_operator)+BlockDiagonalCausalWithOffsetPaddedKeysMask)AttentionFwOpBaseContextInputscheck_lastdim_alignment_stride1c                       s  e Zd ZU eddZdhZejejej	hZ
dZedefZee ed< dZdZdZd	ZdZee ed
< dZdZdZedededededee f
 fddZede dee f fddZ!ededededefddZ"ede de#de$ej%ee& f fdd Z'  Z(S )!FwOpxformers-efficient_attention_forward_decoder_splitk_ckcuda   NSUPPORTED_ATTN_BIAS_TYPESFT
ck_splitKFSPLIT_K   @      MqMkvKKvreturnc                    s   t  ||||}|S )N)supershape_not_supported_reasons)clsr   r   r   r   reasons	__class__ O/home/ubuntu/.local/lib/python3.10/site-packages/xformers/ops/fmha/ck_splitk.pyr   .   s   z FwOp.shape_not_supported_reasonsdc                    s\  t t| |}t|d|jd |jjtjkr't|d|jd t|d|j	d | j
d u r1|d |jjdkrEtj|jdk rE|d |jjd	 }t|jtrw|jj}||jd
 krj|d|jd
  d|  |j}||jkrw|d |jjdv r|jjd d	kr|jddkr|j	ddkr|d	kr|d |jd ur|d	kr|d |S )Nquery   keyvalueztriton is not availabler   )   r   zCrequires GPU with sm80 minimum compute capacity, e.g., A100/H100/L4r   zExpected total z queries not zCVariable query len is not supported in the presence of causal mask.)      r   z0multiquery is only supported with query seqlen=1zEquery with seqlen > 1 is not supported in the presence of causal mask)r   r   not_supported_reasonsr   r'   r)   dtypetorchint32r*   OPERATORappenddevicetyper   get_device_capabilityshape
isinstance	attn_biasr	   	q_seqinfoseqstart_py
min_seqlen
max_seqlenndimstride)r    r&   r!   q_lenseqinfor"   r$   r%   r0   7   s@   


(
zFwOp.not_supported_reasonsBHMkc                 C   s|   t || d}t |d| }|dkr|dkrdnd}|dkr2|| |k r2|d }|dkr2|| |k s$t|d}t |d}|S )z"Heuristic for the number of splitsr   i   i   r      r      )maxmin)r    rD   rE   rF   bhsplit_kmax_chunk_sizer$   r$   r%   get_split_k^   s   

zFwOp.get_split_kinpneeds_gradientc                 C   s  |j }| \}}}|d ur,t|tsJ |j|j |j|j |jj}|jj	}n|j
d }d }|d ur|ddk}	|	ri|dd d d d d df dd|f}
|dd d d d d df dd|f}n|d dd|f}
|d dd|f}|d d|
j
d df}n|}
|}|}|j
\}}}}}|
j
\}}}}}| jd ur| j}n| |||}|jd ur|j}nttj|j
d tjd }| j||
||||d}|d fS )Nr      r   r,   )r1   )r'   r)   r*   seq_positionsscalerL   )r;   get_qkv_in_bmghkr:   r	   	k_seqinfotor6   r<   paddingseqlenr9   rA   	unflattenr   rN   rS   r2   rsqrttensorfloat32itemr4   )r    rO   rP   r;   qkvrW   seq_positions_gpu
multiqueryr)   r*   r'   rD   _rE   rF   rL   qk_scaleoutr$   r$   r%   applyj   sT   

(*

	z
FwOp.apply))__name__
__module____qualname__r   r4   SUPPORTED_DEVICESr2   halfbfloat16floatSUPPORTED_DTYPESSUPPORTED_MAX_Kr7   r	   r   r   r   __annotations__SUPPORTS_DROPOUTSUPPORTS_CUSTOM_SCALESUPPORTS_BMGHKNAMEr   r   intBLOCK_MBLOCK_N
NUM_GROUPSclassmethodr   strr   r   r0   rN   boolr   Tensorr   rf   __classcell__r$   r$   r"   r%   r      sV   
 
&r   c                   @      e Zd ZdZdZdS )FwOp_S1r   
ck_splitK1Nrg   rh   ri   r   rt   r$   r$   r$   r%   r          r   c                   @   r~   )FwOp_S2rH   
ck_splitK2Nr   r$   r$   r$   r%   r      r   r   c                   @   r~   )FwOp_S4r-   
ck_splitK4Nr   r$   r$   r$   r%   r      r   r   c                   @   r~   )FwOp_S8r(   
ck_splitK8Nr   r$   r$   r$   r%   r      r   r   c                   @   r~   )FwOp_S16r   ck_splitK16Nr   r$   r$   r$   r%   r      r   r   c                   @   r~   )FwOp_S32    ck_splitK32Nr   r$   r$   r$   r%   r      r   r   c                   @   r~   )FwOp_S64r   ck_splitK64Nr   r$   r$   r$   r%   r      r   r   c                   @   r~   )	FwOp_S128rG   ck_splitK128Nr   r$   r$   r$   r%   r      r   r   )typingr   r   r   r   r   r2   xformers.ops.commonr   r   xformers.ops.fmha.attn_biasr	   xformers.ops.fmha.commonr
   r   r   r   r   r   r   r   r   r   r   r   r   r$   r$   r$   r%   <module>   s     