o
    
۾i                  F   @   s  d dl Z d dlmZ d dlmZ d dlmZ d dlmZm	Z	 ee
Ze Ze e Ze	jdd Ze	jdd	 Ze	jd
ejdejfddZe	jejejfdejdejdejdejdejdejdejdejdejdejdejdejdejdejdejdejdejdejd ejd!ejd"ejd#ejd$ejd%ejd&ejd'ejd(ejd)ejd
ejd*ejd+ejd,ejd-ejd.ejfDd/d0Ze	jdejdejdejdejdejdejdejdejdejdejdejdejdejdejdejdejd"ejd#ejd$ejd%ejd&ejd'ejd(ejd)ejd
ejd*ejd+ejd1ejd ejd!ejf<d2d3Ze	jejejfdejdejdejdejdejdejdejd
ejd1ejd,ejd-ejd.ejfd4d5Zd6ed7ed8efd9d:Zd6ed7ed;ed<ed8ef
d=d>Z											?dBd@dAZ dS )C    N)init_logger)vllm_is_batch_invariant)current_platform)tltritonc                 C   s   | | d | S )N    )xyr   r   b/home/ubuntu/.local/lib/python3.10/site-packages/vllm/v1/attention/ops/triton_unified_attention.pycdiv_fn   s   r   c                 C   s2   | | }t |}t | }|||  ||  S )N)r   exp)Sr	   Sdivp1p2r   r   r   apply_softcap   s   
r   BLOCK_Quse_q_block_modec           
      C   sd   d}|}||k r.|| d }t | | }|r|| | n|}	|	|kr(|d }n|}||k s|d S )Nr      r   )r   load)
query_start_len_ptr
target_idxnum_seqsr   r   leftrightmidvalmid_valr   r   r   find_seq_idx#   s   

r   num_query_headsnum_queries_per_kvblock_table_stridequery_stride_0query_stride_1output_stride_0output_stride_1qq_bias_stride_0
BLOCK_SIZE	TILE_SIZE	HEAD_SIZEHEAD_SIZE_PADDEDUSE_ALIBI_SLOPESUSE_ALIBI_SQRTUSE_QQ_BIASUSE_SOFTCAP	USE_SINKSSLIDING_WINDOWUSE_MM_PREFIXMAX_MM_RANGESstride_k_cache_0stride_k_cache_1stride_k_cache_2stride_k_cache_3stride_v_cache_0stride_v_cache_1stride_v_cache_2stride_v_cache_3r   BLOCK_MUSE_FP8FP8_MINFP8_MAXc2           r   	   C   sZ  t d}2t d}3t|+|2|-|,d}4t |+|4 |, |4 }5|2|5 }6t |+|4 }7t |+|4 d }8|8|7 }9|6|, |9kr=d S t d|.}:t d|};t d|}<|6|, |:|  }=|7|= }>|3| |:|  }?|>d d d f | |?d d d f |  |;d d d f  }@t |;|k ddt j}At |=|9k ddt j}Bt |?|k ddt j}Ct j||@ |Ad d d f |Bd d d f @ |Cd d d f @ dd}D|4| }E|st j|.gt	dt j
d}Fnt j||? |Ct	ddjt j
d}Ft j|.gdt j
d}Gt j|.|gt j
d}Ht ||4 }I|I|9 }J|rt j||? |Cdd}K|r%||=d d d f |  }L|J|6|,  |.d |  d }M| r=t |M|I}Mnt |M|I}Mt|M|}Nd}O|N}P|dkr| s|6|, }Qt |Q|.d |  |9d }R|J|Q | d }S|J|R }Tt d|S| }Ot |T| d |N}Pt|O|PD ]I}U|U| |< }V|V|Mk }Wt ||E |V|  t j}X|Xd d d f |' |3|)  |;d d d f |*  |V| d d d f |(  }Y|Xd d d f |# |3|%  |;d d d f |&  |V| d d d f |$  }Zt j||Z |Ad d d f |Wd d d f @ dd}[|[j r$|Dj r|[}\n|[t j
t |
 |Dj}\n|[}\t j||Y |Ad d d f |Wd d d f @ dd}]|]j r]|Dj rM|]}^n|]t j
t | |Dj}^n|]}^|J|=d d d f  }_|Vd d d f |_k}`|dkr|`|_|V |k @ }`| rt|!D ]M}at |"|4|! d	  |ad	  }bt |"|4|! d	  |ad	  d }c|b|ck }d|_|bk|_|ck@ |d@ }e|Vd d d f |bk|Vd d d f |ck@ |d@ }f|`|e|f@ O }`qt j|.|ft j
d
}g|g|	t |D|\ 7 }g|rt|g|}gt |Cd d d f |Bd d d f @ |`@ |gt	d}g|r?|r/|V|J|=d d d f   }ht |hdkt |h t j
 d}in|V|J }i|g|Kd d d f |i 7 }g|ri|V|J }j|jdkoN|j|k }kt j|L|jd d d f  |kd d d f dd}l|g|l7 }gt |Ft j|gdd}mt |mt	dk|md}mt |g|md d d f  }nt j|ndd}ot |F|m }p|H|pd d d f  }H|G|p |o }G|m}F|r|6|, }Qt |J|Q |Vd d d f  |k |^d}^|Ht |n|^j|^7 }Hq|H|Gd d d f  }H|/r|Ht | }Ht |H|0|1}H|>d d d f | |?d d d f |  |;d d d f  }qt j| |q |H|Ad d d f |Bd d d f @ |Cd d d f @ d d S )Nr   r   T        maskother-infdtype      ?r   shaperF   axisrB   )r   
program_idr   r   arangewheretoint1fullfloatfloat32zerosmaximumminimumr   rangeint64rF   is_fp8dotr   sqrtmaxr   sumclampstore)r
output_ptr	query_ptrkey_cache_ptrvalue_cache_ptrsink_ptrblock_tables_ptrseq_lens_ptralibi_slopes_ptrqq_bias_ptrscalek_scalev_scale	out_scalesoftcapr    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   mm_prefix_range_ptrr4   r5   r6   r7   r8   r9   r:   r;   r   r   r   r<   r=   r>   r?   q_block_global_idxkv_head_idxseq_idxq_block_start_idxq_block_local_idxcur_batch_in_all_start_indexcur_batch_in_all_stop_indexcur_batch_query_lenoffs_moffs_doffs_t	query_posquery_offset_0query_offset_1query_offsetdim_maskquery_mask_0query_mask_1Qblock_table_offsetMLaccseq_lencontext_lenalibi_slopeqq_bias_row_ptrsmax_seq_prefix_len	num_tiles
tile_starttile_endqpos_loqpos_hifirst_allowed_keylast_allowed_keyj
seq_offset	tile_maskphysical_block_idxv_offsetk_offsetK_loadKV_loadVquery_abs_posseq_maskirange_start	range_endis_valid
q_in_range
k_in_ranger   relative_posalibi_offsetkey_rel_posis_query_keyqq_biasm_jPl_jalphaoutput_offsetr   r   r   kernel_unified_attention_2d:   s  
5

.


	  

*.
r   NUM_SEGMENTS_PER_SEQc/           s   	   C   s  t d}/t d}0t d}1t|'|/|)|(d}2t |'|2 |( |2 }3|/|3 }4t |'|2 }5t |'|2 d }6|6|5 }7|4|( |7krBd S t ||2 }8|+}9t|8|9| }:|1|: | |8kr\d S t d|*};t d|}<t d|}=|4|( |;|  }>|5|> }?|0| |;|  }@|?d d d f | |@d d d f |  |<d d d f  }At |<|k ddt j}Bt |>|7k ddt j}Ct |@|k ddt j}Dt j||A |Bd d d f |Cd d d f @ |Dd d d f @ dd}E|2| }F|r|1dkrt j||@ |Dt	ddjt j
d}Gnt j|*gt	dt j
d}Gnt j|*gt	dt j
d}Gt j|*gd	t j
d}Ht j|*|gt j
d}I|8|7 }J|rAt j|	|@ |Ddd}K|rP|
|>d d d f |  }L|J|4|(  |*d |  d }Mt |M|8}Mt|M|}Nd}O|N}P|dkr|,s|4|( }Qt |Q|*d |  |7d }R|J|Q | d }S|J|R }Tt d|S| }Ot |T| d |N}Ptt|1|: |Ot|1d |: |PD ]I}U|U| |= }V|V|Mk }Wt ||F |V|  t j}X|Xd d d f |# |0|%  |<d d d f |&  |V| d d d f |$  }Y|Xd d d f | |0|!  |<d d d f |"  |V| d d d f |   }Zt j||Z |Bd d d f |Wd d d f @ dd}[|[j rQ|Ej rA|[}\n|[t j
t | |Ej}\n|[}\t j||Y |Bd d d f |Wd d d f @ dd}]|]j r|Ej rz|]}^n|]t j
t | |Ej}^n|]}^|J|>d d d f  }_|Vd d d f |_k}`|dkr|`|_|V |k @ }`|,rt|-D ]M}at |.|2|- d  |ad  }bt |.|2|- d  |ad  d }c|b|ck }d|_|bk|_|ck@ |d@ }e|Vd d d f |bk|Vd d d f |ck@ |d@ }f|`|e|f@ O }`qt j|*|ft j
d
}g|g|t |E|\ 7 }g|rt|g|}gt |Dd d d f |Cd d d f @ |`@ |gt	d}g|rl|r\|V|J|>d d d f   }ht |hdkt |h t j
 d}in|V|J }i|g|Kd d d f |i 7 }g|r|V|J }j|jdko{|j|k }kt j|L|jd d d f  |kd d d f dd}l|g|l7 }gt |Gt j|gdd}mt |mt	dk|md}mt |g|md d d f  }nt j|ndd}ot |G|m }p|I|pd d d f  }I|H|p |o }H|m}G|r|4|( }Qt |J|Q |Vd d d f  |k |^d}^|It |n|^j|^7 }Iq|?d d d f t j||+ |  |@d d d f |+|   |1|  t d|d d d f  }qt j| |q |I|Bd d d f |Cd d d f @ |Dd d d f @ d |?t j||+  |@|+  |1 }rt j||r |G|C|D@ d t j||r |H|C|D@ d d S )Nr   r   r   Tr@   rA   rD   rE   rG   rH   rJ   rL   )r   rM   r   r   r   rN   rO   rP   rQ   rS   rT   rR   rU   rW   rV   rX   r]   minrY   rF   rZ   r[   r   r\   r   r^   r`   )ssegm_output_ptrsegm_max_ptrsegm_expsum_ptrrb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rn   r    r!   r"   r#   r$   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r4   r5   r6   r7   r8   r9   r:   r;   r   r   r   r<   r   r2   r3   ro   rp   rq   segm_idxrr   rs   rt   ru   rv   rw   r   num_segmentstiles_per_segmentrx   ry   rz   r{   r|   r}   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   segm_output_offsetsegm_offsetr   r   r   kernel_unified_attention_3d  s  
3


.


	
	
  

*
.r   c           '      C   s,  t d}t d}t||||d}t || }|}t||| }t||| }t d|t j|g|t jdk }t t d||k dd	t j
}|	t j||  ||  t d| }t j|| |tdd}t |}t j|| |dd} | t ||  } t | }!|	t j|| |  |||   t d|d d d f |  t d|d d d f  }"t j||" |d d d f |d d d f @ dd}#|#t || d d d f 9 }#t j|#dd}$t |!dkd|$|! }%|r|%t | }%t |%||}%|| ||	  t d| }&t j| |& |%|d	 d S )
Nr   r   FrE   rD   rA   r@   rJ   rL   )r   rM   r   r   r   rN   rR   int32rO   rP   rQ   rY   rS   r]   r   r^   r_   r`   )'ra   r   r   r   rg   r   r    out_scale_invr%   r&   r"   r)   r*   r+   r   r   r   r=   r>   r?   query_token_idxquery_head_idxrr   r   r   r   act_num_segments	segm_maskr   r   segm_maxoverall_maxsegm_expsumoverall_expsumr   segm_outputacc_sumr   r   r   r   r   reduce_segments  sh   



"






r   	head_sizesliding_windowreturnc                 C   s   |dko| dv S )a	  Detect Gemma3 models via unique (head_size, sliding_window) signature.

    Gemma3 models are the only ones using sliding_window=1024 with
    head_size 128 (27B) or 256 (1B, 4B, 12B). Other SWA models use
    different window sizes (Mistral=4096, Phi-3=2047).
    i   )      r   )r   r   r   r   r   _is_gemma3_attentionT  s   r   element_size
is_prefillc                 C   s&   t | |rdS |rdS |dkrdS dS )zSelect tile size with Gemma3-specific optimization.

    For Gemma3, use 32 for both prefill and decode to better utilize
    the larger head dimension (128/256). For other models, use
    the default vLLM behavior.
        r      )r   )r   r   r   r   r   r   r   _get_tile_size^  s
   
r   Fc           +      C   sn  |	sJ d|d u sJ d|d ur |j d | j d ks J dd}d}|d ur=|jdkr5d}|j d }ntd	|j  |d u}|d u}|j d }t|} | j d }!|j d
 }"|!|" }#| j d
 }$|#dkrgdnt|#}%|%|# }&| j d |& |  }'|
d dkrd|
d  nd}(t|$|(|  dd})t|$|(|  dd}*|d u s|d u s|d u s|d u s|d u s|dks| |kstrt	|'|"f dCi d|d| d|d|d|d|d|d|d|d|d|d|d|d urd| ndd|d|!d|#d|
dd| 
dd | 
dd!|
dd"|
dd#|r |
dnydd$|d%|)d&|$d't|$d(|d)|d*|d+|dkd,|d ud-|d.|d/|d0d|
d  d1|
dd2|
dd3|
d
d4|
dd5|
dd6|
dd7|
d
d8|
dd9|d:|&d;| d<|%d=|d u d S d$|d%|)d&|$d't|$d(|d)|d*|d+|dkd,|d ud-|d.|d/|d0d|
d  d1|
dd2|
dd3|
d
d4|
dd5|
dd6|
dd7|
d
d8|
dd9|d:|&d;| d<|%d=|d u d S t|'|"|f dCi d>|d?|d@|d| d|d|d|d|d|d|d|d|d|d|d|d|!d|#d|
dd| 
dd | 
dd#|rg|
dndd$|d%|*d&|$d't|$d(|d)|d*|d+|dkd,|d ud-|d.|d/|d0d|
d  d1|
dd2|
dd3|
d
d4|
dd5|
dd6|
dd7|
d
d8|
dd9|d:|&d;| d<|%dA| t| j d |!f dCi d|d>|d?|d@|d|d;| d|!dB|d urd| ndd!|
dd"|
dd|
dd%|*d&|$d't|$d9|d:|&dA|d=|d u d S )DNz"Only causal attention is supportedzQ scales not supportedr   r   z"Sinks must be num_query_heads sizeF   Tz#Unsupported mm_prefix_range shape: r   r   )r   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   rG   rn   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r2   r3   ro   r1   r4   r5   r6   r7   r8   r9   r:   r;   r   r   r   r<   r=   r   r   r   r   r   r   )rI   ndim
ValueErrorlenr   next_power_of_2r   r   is_batch_invariantr   strider   r   )+qkvoutcu_seqlens_qmax_seqlen_q	seqused_kmax_seqlen_ksoftmax_scalecausalwindow_sizeblock_tablern   	q_descale	k_descale	v_descaleseq_threshold_3Dnum_par_softmax_segmentssoftmax_segm_outputsoftmax_segm_maxsoftmax_segm_expsumalibi_slopesoutput_scaler   sinksmm_prefix_rangeuse_alibi_sqrtuse_mm_prefixmax_mm_rangesuse_alibi_slopesuse_qq_bias
block_sizer   r    num_kv_headsr!   r   r<   r   total_num_q_blockssliding_window_valTILE_SIZE_PREFILLTILE_SIZE_DECODEr   r   r   unified_attentiont  s  





	






 !"#$%&'(
)
*
+
,
-
.
/
012345
 !"#$%&'(
)
*
+
,
-
.
/
0123458	




 !"#$
%
&
'
(
)
*
+
,-./013
	



r   )NNNNNNNNNNF)!torchvllm.loggerr   *vllm.model_executor.layers.batch_invariantr   vllm.platformsr   vllm.triton_utilsr   r   __name__loggerr   finfo	fp8_dtypefloat8_infojitr   r   	constexprr   r   r]   rY   r   r   r   r   intboolr   r   r   r   r   r   r   <module>   s  	

2 !"$%&'()*+-./012  ] !"#$%&'(*+,-./  e
Y

'