o
     iD/                  %   @   s   d dl mZmZ d dlZd dlmZ ddlmZ dddd	d
dddddddddejdejdejdejdejdede	de	de
de	de	de	de	deej deej deej de
d ef$d!d"ZdS )#    )OptionalTupleN)+BlockDiagonalCausalWithOffsetPaddedKeysMask   )_is_triton_availableg     @g      ?Fg      @g      0@g      @@T )thetalinear_scaleuse_dynamic_scalingdynamic_old_context_lendynamic_scale_factordynamic_low_freq_factordynamic_high_freq_factorout_qfirst_seqposseqpos	adjacentsinternal_dtypexqxkxvcache_kcache_v	attn_biasr   r	   r
   r   r   r   r   r   r   r   r   r   c          4   
   C   s  t  r| js|js|js|js|js|durtdt s J ddl}ddlm} |jj	d }|j
j	d }| j}|dvrAtd|  }| }| }| }| }|j}|j}|d	 }|}|d	 dkrid}|} |dkry|d	 dkry|d	 } |d
kr| j\}!}"}#}$|"|ksJ |d|||$fkrtd| dd|||$f |jd|||$fkrtd|j dd|||$f |d|| |$fkrtd|jd|| |$fkrtdd}%d|#|$ |$df}&n| j\}!}"}%}#}$|"|ksJ |d||%||$fkrtd| dd||%||$f |jd||%||$fkrtd|j dd||%||$f |d||%| |$fkr7td| dd||%| |$f |jd||%| |$fkrRtd|j dd||%| |$f d|#|$ |% |#|$ |$df}&|!dkrhtd|d dkrstd|d dkr~td|d dkrtd|d dkrtd|d dkrtd|#d|  }'|'| }(|#})|du r| | j}n|j| jkrtd| }&|&d dkrtd|dusJ t|jj	d }*|dur|durtdd}+|dur|j|*fkrt|j},td|, d|* d|d}+n!|dur3|j|fkr.t|j},td|, d| d|d}+d |   }-t|-||$}.t|.d!}.t|.d"}.tt|.d# dd$}/| j}0|jj}1|j
j}2|j
j}3|1j|0ksv|2j|0ksv|3j|0krztd%|d&v sJ t j| j ||jj|*|'|% f g | ||||||1|2|3||||r|	nd|r|
nd|r|nd|r|nd|||)|(|%|$|d |d'kr|d nd|d	 |d |d'kr|d nd|d	 |d |d'kr|d nd|d	 |d |d'kr|d nd|d	 |d |d'kr-|d nd|d	 |1d|2d|3d|&d |d'krO|&d nd|&d	 |+|R d(dd|.||/d) W d   |S 1 spw   Y  |S )*u  
    Performs RoPE (rotary embeddings) and kv-cache emplacement for a heterogeneous
    batch for inference in the style given by
    BlockDiagonalCausalWithOffsetPaddedKeysMask.
    The batch is concatenated along the sequence dimension, so the
    actual dim-0 length of all tensors is 1.

    xq, xk and xv should be (1, slen, n_heads, dim), where
    xq's n_heads can differ from xk and xv.

    This function places the roped xk in the right place in cache_k, and
    xv (unmodified) in the right place in cache_v, and returns out_q
    (the roped xq) such that things are ready to call

    xformers.ops.memory_efficient_attention(
        out_q, cache_k, cache_v, attn_bias=attn_bias
    )

    This functionality is experimental. Its API might be changed without warnings.
    Use it at your own risk.

    Arguments:
        xq: tensor of queries to apply rope to
        xk: tensor of keys to apply rope to
        xv: tensor of values to copy into cache_v
        cache_k: cache of keys, MODIFIED IN PLACE
        cache_v: cache of values, MODIFIED IN PLACE
        attn_bias: details the layout of caches.
                Used to determine frequencies for the
                RoPE calculation as well as the locations in cache_k and cache_v
                to write to. Must be on the device.
        first_seqpos: Optionally a tensor containing the sequence position of the
                    beginning of the cache for each batch element.
                    Providing a tensor of zeros is the same as providing None.
                    This affects the numerical calculation but not which memory
                    locations are read or written.
        seqpos: Optionally a 1D tensor containing the sequence position of each
                    query. This should have length equal to xq.shape[1] .
                    This affects the numerical calculation but not which memory
                    locations are read or written.
        adjacents: If True, the inputs are in adjacent pairs along the final dim axis.
                  This is like the released LLaMA model.
                  If False, the dim axis is split in two equal pieces.
                   I.e. the features are ordered with all the real parts before all
                   the imaginary parts. This matches HuggingFace, e.g.
                   https://github.com/huggingface/transformers/blob/
                   f143037789288ba532dada934a118e648e715738/
                   src/transformers/models/llama/modeling_llama.py#L126-L130
        linear_scale: A scaling factor to apply to the sequence ids when computing
                      the RoPE frequencies.  When set to K, all sequence indices
                      are divided by K.
        use_dynamic_scaling: If true, dynamic scaling in use, using a scaling like
            “YaRN: Efficient Context Window Extension of Large Language Models”
        dynamic_old_context_len: used with use_dynamic_scaling
        dynamic_scale_factor: used with use_dynamic_scaling
        dynamic_low_freq_factor: used with use_dynamic_scaling
        dynamic_high_freq_factor: used with use_dynamic_scaling
        internal_dtype: set to "f32" or "f64" to enforce dtype in the calculation
    NzGradients not supported.r      )_rope_padded_kernel)      zUnexpected xq dimensionr   zunexpected k shape z: expected zunexpected v shape zunexpected cache_k shapezunexpected cache_v shapezunexpected cache_k shape zunexpected cache_v shape zHExpected batch size dimension to be 1 as batches should be concatenated.zEach q head must be contiguouszEach k head must be contiguouszEach v head must be contiguousz$Each cache_k head must be contiguousz$Each cache_v head must be contiguousr   zUnexpected shape of out_qz"Each out_q head must be contiguousz0seqpos and first_seqpos may not both be providedzfirst_seqpos.shape z but (z,) expected.zseqpos.shape i      i         z:`attn_bias` must be on the same device as the other inputs)r   f32f64r   F)const_batch_stridescache_padding_lengthseqlenk_shift
BLOCK_SIZEr   	num_warps)torchis_grad_enabledrequires_grad
ValueErrorr   triton_triton.rope_padded_kernelsr   	q_seqinfoseqstart_py	k_seqinfondimstrideshape	new_emptylentupleelement_sizeminnext_power_of_2maxdeviceseqstartseqlencuda
max_seqlen)4r   r   r   r   r   r   r   r	   r
   r   r   r   r   r   r   r   r   r   r.   r   n_total_queriescache_lengthr3   	xq_stride	xk_stride	xv_stridecache_k_stridecache_v_stridecache_k_shapexk_shape
n_kv_headsexpected_kv_headsexpected_cache_headsbszq_len	n_q_headsdimn_groupsout_q_striden_total_headsv_startk_startlogical_bszstride_seqposr5   MAX_FUSED_SIZEr(   r)   r=   	seqstartq	seqstartkseqlenk r]   L/home/ubuntu/.local/lib/python3.10/site-packages/xformers/ops/rope_padded.pyrope_padded   s  P











	
 !"#$%&'()*+,-./0

88r_   )typingr   r   r*   xformers.ops.fmha.attn_biasr   r   r   Tensorfloatboolstrr_   r]   r]   r]   r^   <module>   sj   	
