o
     i                     @   sp   d dl mZmZmZmZmZmZ d dlZddlm	Z	m
Z
 ddlmZ ddlmZmZmZ e
G dd	 d	eZdS )
    )AnyIterableListOptionalSetTupleN   )get_operatorregister_operator   )+BlockDiagonalCausalWithOffsetPaddedKeysMask)AttentionFwOpBaseContextInputsc                
       s   e Zd ZU dZeddZdhZee e	d< e
je
je
jhZee
j e	d< dZee	d< ed	efZee e	d
< dZdZdZdZededee f fddZededede e
j!e"e# f fddZ$  Z%S )FwOpzt
    An operator optimized for K=256 (so the contiguous dim fits into registers).
    Tested to work on MI250x.
    xformers&efficient_attention_forward_decoder_ckcudaSUPPORTED_DEVICESSUPPORTED_DTYPES   SUPPORTED_MAX_KNSUPPORTED_ATTN_BIAS_TYPESFTck_decoderFdreturnc                    s  t t| |}|j}t|tr|jjd dkr$|d|jjd   |jjd | j	kr>|d|jjd  d| j	 d d}d}|jjd }d	D ]
}||| krT|}qJ|s`|d| d
 || dkrq|d| d|  |j
ddkr~|d |jddkr|d |jj}|jj}	|j
jd |	 }
|jjd |
 }|ttdd|
 |kr|d |
t|d kr|d |jjdkr|d |S )Nr   r   z'One formal batch element expected; got zGot head_dim=z; only head_dim<=z is supported for now.@   )   r   r   z which is too largez; it needs to be divisible by z'expect keys to have last dim contiguousz)expect values to have last dim contiguousz-expect to have same num_queries in each batchzempty lanes not supported yeti    zkey padding exceeds 8192)superr   not_supported_reasons	attn_bias
isinstancer   queryshapeappendr   keystridevalue	q_seqinfoseqstart_py	k_seqinfopaddinglistrangelen)clsr   reasonsr!   threads_per_warprequired_alignmenthead_dimvec_sizeq_startsr,   bsznum_queries	__class__ P/home/ubuntu/.local/lib/python3.10/site-packages/xformers/ops/fmha/ck_decoder.pyr    #   sN   





zFwOp.not_supported_reasonsinpneeds_gradientc                 C   s|  |rt d|j}| \}}}|d ur2t|tsJ |j|j |j|j |jj	}|jj
}n|jd }d }|d ur|ddk}	|	ro|dd d d d d df dd|f}
|dd d d d d df dd|f}n|d dd|f}
|d dd|f}|d d|
jd df}n|}
|}|}|jd ur|j}nttj|
jd tjd }| j||
|||d}|d fS )Nzbackward pass is not supportedr      r   r   )dtype)r#   r&   r(   seq_positionsscale)NotImplementedErrorr!   get_qkv_in_bmghkr"   r   r+   todevicer)   r,   seqlenr$   r'   	unflattenrB   torchrsqrttensorfloat32itemOPERATOR)r0   r=   r>   r!   qkvr,   seq_positions_gpu
multiqueryr&   r(   r#   qk_scaleoutr;   r;   r<   applyW   sL   

(*
z
FwOp.apply)&__name__
__module____qualname____doc__r	   rN   r   r   str__annotations__rI   halfbfloat16floatr   r@   r   inttyper   r   r   r   SUPPORTS_DROPOUTSUPPORTS_CUSTOM_SCALESUPPORTS_BMGHKNAMEclassmethodr   r   r    boolr   Tensorr   r   rV   __classcell__r;   r;   r9   r<   r      s.   
 
3r   )typingr   r   r   r   r   r   rI   commonr	   r
   r!   r   r   r   r   r   r;   r;   r;   r<   <module>   s    