o
    iE8                     @   s   d dl Z d dlmZ d dlZd dlmZmZ d dlmZm	Z	 dddZ
		dd	d
Z		 				dddZ				dddZ											 	 					ddeej fddZdS )    N)Optional)	rearrangerepeat)	pad_inputunpad_inputrandomFc                 C   s   |dv sJ |dkrt j|df| |t jd}n.|dkr2t jt|r"dnd| d | d |df|d}n|d	krEt j| d
 | d |df|d}|r\t|D ]}|d dkrWd||< qKd|d< tt j| |dd|d|k }|S )N)fullr   thirdr      devicedtyper   r      )r   r	         zs -> b sb)torchr   int32randintmaxranger   arange)
max_seqlen
batch_sizer   modezero_lengthslengthsipadding_mask r!   N/home/ubuntu/vllm_env/lib/python3.10/site-packages/flash_attn/utils/testing.pygenerate_random_padding_mask   s$   "r#   c
           #         sN  |r|rJ | j \ }
}|j d }|j \}}}|j  ||fks$J |j  ||fks/J |dus7|	dur?|r;J |r?J |durdt| ||\}}}} fdd}|durat|d nd}n*t| d}tjd d  tj|jd	}d}} fd
d}|durt|dnd}|durt|||	\}}}}t|||	^}}nt|d}t|d}tjd d  tj|jd	}d}}|r||k sJ |
|ksJ tj|||gdd}tj| ||gdd}|dur fdd}n fdd}|	 
 |||	 
 ||fS |rTtj||gdd}tj||gdd}|} |dur1 fdd}!n fdd}!|	 
 |	 
 ||||| 	 
 |	 
 || |!fS |} |durd fdd}"n fdd}"|	 
 |	 
 |	 
 |dur|	 nd||||||| 	 
 |	 
 |	 
 |dur|	 nd|| |"fS )a  
    Arguments:
        q: (batch_size, seqlen_q, nheads, d)
        k: (batch_size, seqlen_k, nheads_k, d)
        v: (batch_size, seqlen_k, nheads_k, d_v)
        query_padding_mask: (batch_size, seqlen), bool
        key_padding_mask: (batch_size, seqlen), bool
    r   Nc                       t |  S Nr   output_unpadr   	indices_qseqlen_qr!   r"   <lambda><   s    zgenerate_qkv.<locals>.<lambda>zb s ... -> (b s) ...zb s h d -> (b s) h dr   r
   )stepr   r   c                       t | d dS Nz(b s) h d -> b s h dr   r   r'   r   r!   r"   r,   G       dim   c                    r$   r%   r&   
dqkv_unpadr)   r!   r"   r,   `       c                    r.   Nz(b s) t h d -> b s t h dr   r0   r6   r1   r!   r"   r,   b   r2   c                    r$   r%   r&   	dkv_unpadr   	indices_kseqlen_kr!   r"   r,   r   r8   c                    r.   r9   r0   r:   r1   r!   r"   r,   t   r2   c                    r$   r%   r&   dk_unpadr<   r!   r"   r,      r8   c                    r.   r/   r0   r?   r1   r!   r"   r,      r8   )shaper   r   r   r   r   r   allstackdetachrequires_grad_)#qkvquery_padding_maskkey_padding_maskqvkvpacked	qkvpackedquery_unused_maskkey_unused_masknheadsdd_v_nheads_kq_unpadcu_seqlens_qmax_seqlen_q	seqused_qoutput_pad_fnqv_unpadk_unpadcu_seqlens_kmax_seqlen_k	seqused_kv_unpadrest	qkv_unpadqkvdqkv_pad_fnkv_unpadkv	dq_pad_fn
dkv_pad_fn	dk_pad_fnr!   )r   r=   r*   r>   r+   r"   generate_qkv"   s   

















ri   NNc              	   C   s"  t tj| |tjdd}tj||tjd}	|d ur3t |d}t|	d|jd d}	t|	|k|	| d}	|d u r9|nt |dd}
|d u rG| nt |dd}|d d u ra|	||
 | |d	  kS |d u rkt|	|n|
}
t	|	t
||
 | |d	  |
kt|	||
 | |d  k |	|kS )
Nr   s -> s 1b -> b 1 1 1s -> b 1 1 sr   r           r   r
   )r   r   r   longr   rA   wheresum	full_like
logical_orminimumlogical_and)r+   r>   window_sizesink_token_lengthrI   rJ   key_leftpadr   row_idxcol_idxsksqr!   r!   r"   construct_local_mask   s*   

"r}   c                 C   s   t tj| |tjdd}tj||tjd}|d ur3t |d}t|d|jd d}t||k|| d}|d u r9|nt |dd}	|d u rG| nt |dd}
|d u rYt||n|	}	||	 |
 ||	 |
 |  }t	||k ||| kS )	Nr   rk   rl   rm   r   r   rn   r   )
r   r   r   ro   r   rA   rp   rq   rr   rs   )r+   r>   attention_chunkrI   rJ   rx   r   ry   rz   r{   r|   col_limit_left_chunkr!   r!   r"   construct_chunk_mask   s&   	
r           Tlearnable_sinkc           (   
   C   s  |	r|d df}| j }|r%|  | | } }}|
dur#|
 nd}
|durRt|d| jd |jd  d}|  | | j } |
durP|
 | |
j nd}
|durd| t|d j|j d}|durv| t|d j|j d}| jd |jd }}t|d	| jd |jd  d}t|d	| jd |jd  d}| jd
 }|jd
 }dt|
du r|n||  }|st	d| | |}n	t	d| || }|
dur|t	d|
| | }|dkrt
|| | }|dur|t| dtd d}|d dus	|d durt|||||||| jd}|dkr4t||||||| jd}|dur2t||n|}|durA||td |durJ|| }|du r[tj|d
d|j }n6|tj} tj| d
dd}!t|d}t||!}"t| |" }#|#jd
ddt||"  }$|#|$ |j }|dur|t| dd}|dur|t| dd}|dur|tj|d
ddd}dd|  }%|dur|| d}&n|}&|dur|&||&j }&t	d|&||% }'|dur|'t| dd |'j|d|j|dfS )a  
    Arguments:
        q: (batch_size, seqlen_q, nheads, head_dim)
        k: (batch_size, seqlen_k, nheads, head_dim)
        v: (batch_size, seqlen_k, nheads, head_dim_v)
        qv: (batch_size, seqlen_q, nheads, head_dim_v)
        query_padding_mask: (batch_size, seqlen_q)
        key_padding_mask: (batch_size, seqlen_k)
        attn_bias: broadcastable to (batch_size, nheads, seqlen_q, seqlen_k)
        dropout_p: float
        dropout_mask: (batch_size, nheads, seqlen_q, seqlen_k)
        causal: whether to apply causal masking
        upcast: whether to cast all inputs to fp32, do all computation in fp32, then cast
            output back to fp16/bf16.
        reorder_ops: whether to change the order of operations (scaling k instead of scaling k, etc.)
            without changing the math. This is to estimate the numerical error from operation
            reordering.
    Output:
        output: (batch_size, seqlen_q, nheads, head_dim_v)
        attention: (batch_size, nheads, seqlen_q, seqlen_k), softmax after dropout
    r   Nzb h -> b 1 (h g) 1r5   )gzb h -> b 1 h 1)r   r
   zb s h d -> b s (h g) dr   g      ?zbthd,bshd->bhtszb s -> b 1 1 sz-inf)rx   r   r3   T)r4   keepdimz
h -> h 1 1zb s -> b 1 s 1r   zbhts,bshd->bthdzb s -> b s 1 1)r   floatr   rA   tor   mathsqrtr   einsumtanhmasked_fill_r}   r   r   rs   softmaxfloat32amaxmaximumexprq   masked_fillrB   )(rF   rG   rH   rI   rJ   rx   	attn_bias	dropout_pdropout_maskcausalrK   	q_descale	k_descale	v_descalerv   r~   rw   r   softcapupcastreorder_opsintermediate_dtypedtype_ogr+   r>   rQ   dvsoftmax_scalescores
local_mask
chunk_mask	attentionscores_fp32
logits_maxlogits_or_sinks_maxunnormalized_scores
normalizerdropout_scalingattention_dropoutputr!   r!   r"   attention_ref   s   + 



	









r   )r   F)NNNFFNN)rj   r   NNNN)NNNN)NNNNr   NFNNNNrj   r   r   Nr   TFN)r   typingr   r   einopsr   r   flash_attn.bert_paddingr   r   r#   ri   r}   r   Tensorr   r!   r!   r!   r"   <module>   sT   

 
(
%