o
    ij                  *   @   sh  d dl Z d dlmZ d dlmZmZ edrdndZe r!dndZ	e
 d	kZe e Zejd d ejejfd
ejdejdejdejdejdejdejdejdejdejdejdejdejdejdejdejdejdejdejdejf(ddZejdedejdejdejdejdejdejfd d!Ze  				"			"d)d#ed$e jd%e jd&efd'd(ZdS )*    N)current_platform)tltritonP      @         )      xstride_k_cache_blnum_queries_per_kvIN_PRECISIONBLOCK_MBLOCK_DMODELBLOCK_DMODEL_PADDED
BLOCK_SIZEPHYSICAL_BLOCK_SIZEBLOCK_NSLIDING_WINDOWnum_unroll_cachenum_unroll_requestSKIP_DECODE	USE_SINKSUSE_FP8	MAX_Q_LENMAX_CTX_LENFP8_MINFP8_MAXc8           a   	   C   sn  t d}8t d}9t d}:|9|& };t ||8 }<t ||8 }=t ||8 d }>|>|= }?|<|? }@|1r:|?dkr:d S |(|: }At d|+}Bt d|-}Ct d|*}D|:|( t d|( }E|=|Ed d d f  | |9|  |Dd d d f |  }Ft t d|*|)k ddt j}Gt j| |F |Gd d d f |Ed d d f |?k @ dd}H|2st j|(gtdt j	d}It j
|(gt j	d}Jn&t j|t j|(g|9t jd |E|?k tddjt j	d}It |Itdkdd}Jt j
|(|*gt j	d}Kt jd|@|+|/d	D ]}L|L|B }M|M|, }Nt ||8|  |N|  t j}O|M|, }P|Od d d f | |;|  |Dd d d f | |  |Pd d d f |   |Dd d d f | |!  }Q|Od d d f |" |;|#  |Dd d d f |$  |Pd d d f |%  }R|L|+ |@ksn|)|*krt j||Q |Gd d d f |L|Bd d d f  |@k @ dd}Snt ||Q }S|Sj r|St j	t | |Hj}Tn|S}T|t j|H|T|'d
 }Ut |L|Bd d d f  |@k |Utd}U|.dkrt |@|Ed d d f  |L|Bd d d f   |.k |Utd}Ut |It j|Udd}Vt |U|Vd d d f  }Wt |Vd d d f tdkd|W}Wt j|Wdd}Xt |I|V }Yt |Itdkd|Y}Y|K|Yd d d f  }K|L|+ |@ksA|)|*kr^t j||R |Gd d d f |L|Bd d d f  |@k @ dd}Znt ||R }Z|Zj r{|Zt j	t |	 |Hj}[n|Z}[|W|[j}Wt j|W|[|K|'d}K|J|Y |X }J|V}Iq|Cd d d f | |;|  |Dd d d f |  }Q|Cd d d f | |;|  |Dd d d f |  }R||Q }\||R }]t |A|?k dd}^t jd|^|:d  |( |-|0d	D ]}Lt |L|-}Lt j|\|=|L |  |Gd d d f |L|Cd d d f  |?k @ dd}Tt j
|(|-gt j	d}Ut j|H|T|U|'d}U|U|9 }Ut |Ed d d f |L|Cd d d f  k|Utd}U|.dkr]t |Ed d d f |L|Cd d d f   |.k |Utd}Ut |It j|Udd}Vt |U|Vd d d f  }Wt |Vd d d f tdkd|W}Wt j|Wdd}Xt |I|V }Yt |Itdkd|Y}Y|K|Yd d d f  }Kt j|]|=|L |  |Gd d d f |L|Cd d d f  |?k @ dd}[|W|[j}Wt j|W|[|K|'d}K|J|Y |X }J|V}Iq|K|Jd d d f d  }K|=|Ed d d f  | |9|  |Dd d d f |  }_||_ }`|3r|Kt |
 }Kt |K|6|7}Kt j|`|K|Gd d d f |Ed d d f |?k @ d d S )Nr                 maskother-infdtype      ?)loop_unroll_factor)input_precision)axisaccr+   g|=r$   )r   
program_idloadarangewheretoint1fullfloatfloat32zerosint64ranger(   is_fp8dotmaximummaxexpsummultiple_ofclampstore)aQKVK_cacheV_cachesink_ptrB_Locsm_scalek_scalev_scaleout_scale_invB_Start_LocB_Seqlenr   Outstride_b_loc_bstride_b_loc_s
stride_qbs	stride_qh	stride_qd
stride_kbs	stride_kh	stride_kd
stride_vbs	stride_vh	stride_vd
stride_obs	stride_oh	stride_odstride_k_cache_bsstride_k_cache_hstride_k_cache_dr   stride_k_cache_xstride_v_cache_bsstride_v_cache_hstride_v_cache_dstride_v_cache_blr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   	cur_batchcur_headstart_mcur_kv_headcur_batch_seq_lencur_batch_in_all_start_indexcur_batch_in_all_stop_indexcur_batch_query_lencur_batch_ctx_lenblock_start_loc	offs_bs_noffs_noffs_doffs_moff_qdim_maskqm_il_ir.   start_ntoken_indicesbn_logical_indicesbninternal_offsetsoff_koff_vk_loadkqkm_ijpl_ijalphav_loadvk_ptrsv_ptrs
block_maskoff_oout_ptrs r   Z/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/v1/attention/ops/prefix_prefill.py_fwd_kernel$   s  
;

"

 
&"
 
2
&"&r   c-           Y   	   C   s  t d}-t d}.t d}/|.|& }0t |
|- }1t |	|- }2t |	|- d }3|3|2 }4|1|4 }5|,r:|4dkr:d S |(|/ }6t d|+}7t d|*}8|/|( t d|( }9|2|9d d d f  | |.|  |8d d d f |  }:t t d|*|)k ddt j};t j| |: |;d d d f |9d d d f |1|5 k @ dd}<t j|(gt jdt	d }=t j|(gt jd}>t j|(|*gt jd}?t ||. }@t d|(|6 |5 }Ad}Bt
d|5|+D ]}Ct |C|+}Ct j||-|  |C|7 | |  |C|7 |5k ddt j}D|Dd d d f | |0|  |8d d d f | |  |C|7d d d f  | |   |8d d d f | |!  }E|Dd d d f |" |0|#  |8d d d f |$  |C|7d d d f  | |%  }Ft j||E |;d d d f |C|7d d d f  |5k @ dd}G|Gj r|Gt jt | |<j}Hn|G}Ht j|(|+gt jd}It j|<|H|I|'d}It |C|7d d d f  |5k |It	d	}I|I|9 }It d|+d d d f |B |Ad d d f  |@ }Jt |Jdk|Ad d d f |1k @ |Jt	d	}J|I|J7 }I|B|+7 }Bt |Id}Kt |=|K}Lt j|I|Ld d d f  }Mt |Md}Nt j|=|L }O|O|> |N }P|O}Q|?|Qd d d f  }?t j||F |;d d d f |C|7d d d f  |5k @ dd}R|Rj rR|Rt jt | |<j}Sn|R}S|M|Sj}Mt j|M|S|?d
d}?|P}>|L}=q|7d d d f | |0|  |8d d d f |  }E|7d d d f | |0|  |8d d d f |  }F||E }T||F }Ut |6|1|5 k dd}Vt ||. }@t d|(|6 |5 }A|5}Bt
d|V|/d  |( |+D ]}Ct |C|+}Ct j|T|2|C |  |;d d d f |C|7d d d f  |1|5 k @ dd}Ht j|(|+gt jd}It j|<|H|Id
d}I|I|9 }It |9d d d f |C|7d d d f  k|It	d	}It d|+d d d f |B |Ad d d f  |@ }Jt |Jdk|Ad d d f |1k @ |Jt	d	}J|I|J7 }I|B|+7 }Bt |Id}Kt |=|K}Lt j|I|Ld d d f  }Mt |Md}Nt j|=|L }O|O|> |N }P|O}Q|?|Qd d d f  }?t j|U|2|C |  |;d d d f |C|7d d d f  |1|5 k @ dd}S|M|Sj}Mt j|M|S|?d
d}?|P}>|L}=q|?|>d d d f  }?|2|9d d d f  | |.|  |8d d d f |  }W||W }Xt j|X|?|;d d d f |9d d d f |1|5 k @ d d S )Nr   r    r!   r"   r#   r'   infr-   r&   ieeer/   )r   r0   r1   r2   r3   r4   r5   r9   r8   r7   r;   rB   r:   r(   r<   r=   r?   r>   mathr@   rA   rD   )YrE   rF   rG   rH   rI   rK   rL   rM   rN   rP   rQ   Alibi_slopes
block_sizer   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   r   rd   re   rf   rg   rh   r   r   r   r   r   r   r   ri   rj   rk   rl   rm   rn   ro   rp   rq   rr   rt   ru   rv   rw   rx   ry   rz   r{   r.   alibi_slopealibi_start_qalibi_start_kr|   r   r   r   r   r   r   alibir   m_i_newr   r   r   l_i_new	acc_scaler   r   r   r   r   r   r   r   r   r   _fwd_kernel_alibib  sx  
1


& *& 2*r   Fkv_cache_dtyperM   rN   is_block_table_ptrc           +          s^  | j tju }tr|rdnd }d|v rJ|j tjt fv sJ |j tjt fv s*J |dv r3t }n|dkr;tj}ntd||	|}|	|}|j tjksZ|j tjkr^|dkr^td| j
d |j
d |j
d }}}||krw||ksyJ t|}|d u rd	|d
  }|	j
d | j
d  | j
d |j
d  } d t|ksJ |d u s|dkrd}|r| }|d| }| }|dk} t| || | |tj}!n|tj}!|d ur|d u sJ d|d u sJ d|rtd nt}" t|"f}#t|# g | ||||||||||	||j
d |j
d ||d|d| d| d| d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|dR |||"|||"|tdd	 d S |
d u rdn|
}
i }$t ri }$|j
d }%|%dko|%|%d @ dk}&|&rd}'d}(nd}'d}(d}) fdd}*t|* | ||||||!||||d urd	| nd	||	|j
d ||!d|!d| d| d| d|d|d|d|d|d|d|d|d|dfi d|dd|dd|dd|dd|dd|dd|dd|dd |dd!|)d"|%d#|d$|d%|d&|d'|d(|d)|d ud*|'d+|(d,dd-dd.dd/dd0|d u|$ d S )1Nr   fp8)r   fp8_e4m3fp8_e5m2zUnsupported FP8 dtype:autozLkv_cache_dtype='auto' unsupported for            FP8 KV Cache prefill kernelr)   g      ?r   r    z%Sinks arg is not supported with alibiz#FP8 output not supported with alibir!      r   )	r   r   r   r   r   r   r   	num_warps
num_stagesr   r       c                    s    t | d fS )Nr   )r   cdiv)METAbatchheadmax_input_lenr   r   <lambda>$  s    z'context_attention_fwd.<locals>.<lambda>ra   rb   rc   r   rd   re   rf   rg   rh   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r(   torchr8   	IS_TURINGuint8r   	fp8_dtypefloat8_e5m2
ValueErrorviewshaper   next_power_of_2lenelement_sizestridedata_ptrr3   r4   int32
BASE_BLOCKr   r   	NUM_WARPSis_rocmr   )+ry   r   r   or   k_cachev_cacheb_locb_start_loc	b_seq_lenmax_seq_lenr   rM   rN   alibi_slopessliding_windowrL   skip_decodefp8_out_scalesinksr   q_dtype_is_f32r   target_dtypeLqLkLv	Lk_paddedr   kv_element_sizeblock_byte_stride	base_addrr$   processed_b_locBLOCKgridextra_kargsreal_block_sizeis_pow2r   r   TRITON_BLOCK_SIZEgrid_fnr   r   r   context_attention_fwdz  s  



"

	
 !"#$%&'1




 
!
"
#
$
%
&'()*+,-./01234569r   )NNNFNNF)r   vllm.platformsr   vllm.triton_utilsr   r   has_device_capabilityr   r   r   get_device_capabilityr   finfor   float8_infojitminr?   	constexprr   intr   inference_modestrTensorboolr   r   r   r   r   <module>   s   6!'()*+,-./012345678  ?'()*+,-  