o
    
۾i                     @   s   d Z ddlZddlmZ ddlmZmZ ddlmZ ej	dej
dej
dej
d	ej
d
ej
dej
dej
dej
fddZdejdefddZ				d"dejdejdejdejdejdejdedededB dedB dedB fd d!ZdS )#zD
Memory-efficient attention for prefill.
It supports page size = 1.
    N)current_platform)tltriton)RCP_LN2kv_group_numBLOCK_MBLOCK_DMODELBLOCK_N	IS_CAUSALSLIDING_WINDOW_QSLIDING_WINDOW_KLkc           >      C   s  t d}t d}t d}|| }t || }t || }|| }t d|}t d|}|| t d| } || d d d f  | ||  |d d d f  }!|d d d f |	 ||
  |d d d f  }"|d d d f | ||  |d d d f  }#||k }$t j| |! | d d d f |k |$d d d f @ dd}%||" }&||# }'t j|gt jdtd }(t j|gt jd})t j||gt jd}*t ||k dd}+|},|rt |,|d | n|,},d}-|+|, }.t	|-|.|D ]}/| d d d f }0|/|d d d f  }1|1|k }2|r|2|0|1kM }2|dkr|0|1 |knd }3|dkr#|1|0 |knd }4|3d ur.|2|3M }2|4d ur7|2|4M }2t 
|/|}/t j|&||/ |	  |1|k |$d d d f @ dd}5t |%|5}6t |2|6| d}6t |(t |6d}7|6|7d d d f 8 }6t j|6}8t |8d}9t j|(|7 }:|)|: |9 })|*|:d d d f  }*t j|'||/ |  |/|d d d f  |k |$d d d f @ dd};|8|;j}8t |8|;|*}*|7}(q|*|)d d d f  }*|| d d d f  | ||  |d d d f  }<||< }=t j|=|*| d d d f |k |$d d d f @ d	 d S )
Nr         g        )maskotherdtypeinfg    ח)r   )r   
program_idloadarangezerosfloat32floatwhereminimumrangemultiple_ofdotmaximummaxmathexp2sumtor   store)>QKVsm_scaleB_Start_LocB_SeqlenOut
stride_qbs	stride_qh
stride_kbs	stride_kh
stride_vbs	stride_vh
stride_obs	stride_ohr   r   r   r	   r
   r   r   r   	cur_batchcur_headstart_mcur_kv_headcur_batch_seq_lencur_batch_in_all_start_indexblock_start_locoffs_noffs_doffs_moff_qoff_koff_vmask_dqk_ptrsv_ptrsm_il_iacc
block_maskend_nstart_n_limitend_n_limitstart_npos_qpos_kr   sliding_mask_qsliding_mask_kkqkm_ijpl_ijalphavoff_oout_ptrs r\   b/home/ubuntu/.local/lib/python3.10/site-packages/vllm/v1/attention/ops/triton_prefill_attention.py_fwd_kernel$   s   


,,"

&&
r^   r   returnc                 C   s(   | t jkrdS t rtdrdS dS )N    P      @   )torchr   r   is_cuda_alikehas_device_capabilityr   r\   r\   r]   get_block_size   s   
rg   TrD   rS   rY   ob_start_loc	b_seq_lenmax_input_len	is_causalsoftmax_scalesliding_window_qsliding_window_kc                 C   s4  t | j}| jd |jd |jd }}}|du r d|d  n|}|t9 }|jd | jd }}| jd |jd  }||t||f}|dkrJdnd	}|	durR|	nd}	|
durZ|
nd}
t| | ||||||| d| d|d|d|d|d|d|df||t||||	|
|d|d

 dS )zs
    q, k, v: [b * s, head, head_dim]
    b_start_loc: [b]
    b_seq_len: [b]
    out: [b * s, head, head_dim]
    Ng      ?g      ?r   r   rc         )
r   r   r   r	   r
   r   r   	num_warps
num_stagesr   )	rg   r   shaper   r   cdivr^   stridenext_power_of_2)rD   rS   rY   rh   ri   rj   rk   rl   rm   rn   ro   BLOCKLqr   _r*   batchheadr   gridrs   r\   r\   r]   context_attention_fwd   sL   
"r   )TNNN)__doc__rd   vllm.platformsr   vllm.triton_utilsr   r   vllm.utils.math_utilsr   jit	constexprr^   r   intrg   Tensorboolr   r   r\   r\   r\   r]   <module>   sh    	
