o
    
۾iL                     @   sv  d Z ddlZddlmZ ddlmZ ddlmZmZ e	 Z
eeZeejedk r3ed ejdd	 Zejd
ejdejdejdejdejdejdejdejdejfddZdd Zejd
ejdejdejdejdejdejdejdejdejdejdejdejfddZdd ZejdejdejdejfddZd d! Z	"d*d#d$Z	"d*d%d&Z	'	"d+d(d)ZdS ),zF
Memory-efficient attention for decoding.
It supports page size >= 1.
    N)version)current_platform)tltritonz3.2.0zUThe following error message 'operation scheduled before its operands' can be ignored.c                 C   s   dt d|   d S )N      )r   sigmoid)x r
   a/home/ubuntu/.local/lib/python3.10/site-packages/vllm/v1/attention/ops/triton_decode_attention.pytanh4   s   r   kv_group_numBLOCK_DMODELBLOCK_DVBLOCK_NNUM_KV_SPLITS	PAGE_SIZE	logit_capLkLvc           :      C   s   t d}t d}t d}|| }t d|}t d|}||k } ||k }!t || }"|}#|| ||	  | }$t j| |$ | dd}%t |"|}&|&| }'t |'|& |"}(td })d}*t j|gt jd}+|(|'krt	|'|(|D ]},|,t d| }-t j|||#  |-|  |-|(k dd}.|.| |-|  }/|/d d d f |
 ||  |d d d f  }0t j||0 |-d d d f |(k | d d d f @ dd}1t 
|%d d d f |1 d}2|2|9 }2|dkr|t|2|  }2t |-|(k |2td}2|/d d d f | ||  |d d d f  }3t j||3 |-d d d f |(k |!d d d f @ dd}4t t |2d|)}5t |)|5 }6t |2|5 }7|+|69 }+|+t 
|7d d d f |4 d7 }+|*|6 t 
|7d }*|5})qq|| ||  ||  | }8t j||8 |+|* |!d	 || ||  ||  | }9t ||9 |)t |*  d S d S )
Nr   r   r           maskotherinfdtype-infr   )r   
program_idarangeloadcdivminimumfloatzerosfloat32rangesumr   wheremaximummaxexpstorelog):QK_BufferV_Buffersm_scaleReq_to_tokensB_SeqlenAtt_Outstride_req_to_tokens_b
stride_qbs	stride_qhstride_buf_kbsstride_buf_khstride_buf_vbsstride_buf_vhstride_mid_obstride_mid_ohstride_mid_osr   r   r   r   r   r   r   r   r   	cur_batchcur_headsplit_kv_idcur_kv_headoffs_doffs_dvmask_dmask_dvcur_batch_seq_lencur_batch_req_idxoff_qqkv_len_per_splitsplit_kv_startsplit_kv_ende_maxe_sumaccstart_noffs_nkv_page_numberkv_loc
offs_buf_kkqk
offs_buf_vvn_e_maxre_scalep
offs_mid_ooffs_mid_o_1r
   r
   r   _fwd_kernel_stage1:   s   




"" r`   c
                 C   s  t sdnd}
|}|jd }|jd }| jd | jd }}|||f}| jd |jd  }d}|dkr8t r6dnd}t|}t|}t| | |||||||d| d| d|d	|d|d	|d|d|d|df||||
|||	|d||d
 d S )N@      r   r      r   )r   r   r   r   r   r   r   	num_warps
num_stagesr   r   )is_hip_shaper   next_power_of_2r`   stride)rK   k_bufferv_bufferatt_outr3   r4   num_kv_splitsr2   	page_sizer   BLOCKr   r   r   batchhead_numgridr   rg   r   r   r
   r
   r   _decode_att_m_fwd   sV   




rv   
q_head_num	BLOCK_DPEBLOCK_Hc           F      C   s&  t d}t d}|t || }t d} ||kr|n|}!||! t d| }"|"|d |! k }#|#|"|k @ }#t d|}$t d|}%|$|k }&|%|k }'t || }(|})|| |"d d d f |	  |$d d d f  }*t j| |* |#d d d f |&d d d f @ dd}+|dkr|t d| },|,|k }-|| |"d d d f |	  |,d d d f  }.t j| |. |#d d d f |-d d d f @ dd}/t |(|}0|0|  }1t |1|0 |(}2t j|gt jdtd }3t j|gt jd}4t j||gt jd}5|2|1krt	|1|2|D ]9}6|6t d| }7t j|||)  |7|  |7|2k dd}8|8| |7|  }9|9d d d f |
 ||  |$d d d f  }:t j||: |7d d d f |2k |&d d d f @ dd};t 
|+|;|+j}<|dkr|9d d d f |
 ||  |,d d d f  }=t j||= |7d d d f |2k |-d d d f @ dd}>|<t 
|/|>|/j7 }<|<|9 }<|dkr|t|<|  }<t |#d d d f |7d d d f |2k @ |<td}<|9d d d f | ||  |%d d d f  }?t j||? |7d d d f |2k |'d d d f @ dd}@t t |<d|3}At |3|A }Bt |<|Ad d d f  }C|5|Bd d d f 9 }5|5t 
|C|@j|@7 }5|4|B t |Cd }4|A}3q|| |"d d d f |  | |  |%d d d f  }Dt j||D |5|4d d d f  |#d d d f |'d d d f @ d	 || |"|  | |  | }Et j||E |3t |4 |#d	 d S d S )
Nr   r   r   r   r   r   r   r   r   )r   r   r"   r    r!   r#   r%   r&   r$   r'   dottor   r   r)   r*   r+   r,   r(   r-   r.   )Fr/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r   rw   r   rx   r   r   ry   r   r   r   r   r   r@   cur_head_idrC   rB   VALID_BLOCK_HrA   mask_hrD   rE   rF   rG   rH   rI   offs_qrK   offs_dpemask_dpeoff_qpeqperL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   offs_buf_kpekperY   rZ   r[   r\   r]   r^   r_   r
   r
   r   _fwd_grouped_kernel_stage1   s   
 

,0*&
"
"
*"
r   c
                 C   s\  d}
|j d }|j d }tr|dkrd}
|dkrd}d}n|dkr&d}d}nt|}d	}t|}| j d	 | j d
 }}| j d
 |j d  }d}|}|t|t|||f}i }d}tred
ddd}d
}t| | |||||||d	| d	| d
|d|d|d|d|d	|d
|df||||||
||||	d|||d| d S )N    rc   i@     i   ra   i      r   r   rd   r   waves_per_eumatrix_instr_nonkdimkpackrf   re   )r   rw   r   rx   r   r   ry   r   r   r   rg   rh   r   r   )rj   ri   r   rk   r"   minr   rl   )rK   rm   rn   ro   r3   r4   rp   r2   rq   r   rr   r   r   r   rx   r   rs   rt   r   ry   r   ru   extra_kargsrh   r
   r
   r   _decode_grouped_att_m_fwd  s~   



 
r   c           !      C   s  t d}t d}t || }t d|}||k }d}td }t j|gt jd}|| ||  | }|| ||  | }td|
D ]W}t ||
}|| }t 	|| |}||krt j| | ||  |dd}t | | ||  }t 
||}t || }||9 }t || }||| 7 }|| | }|}qDt j|||  ||  | || |d |t | } t |||	  | |  d S )Nr   r   r   r   r   r   r   )r   r   r!   r    r$   r%   r&   r'   r"   r#   r*   r,   r-   r.   )!Mid_Oolser4   r=   r>   r?   
stride_obs	stride_ohstride_lse_bsr   r   r   r@   rA   rH   rD   rF   rP   rO   rQ   offs_v
offs_logicrB   rL   rM   rN   tvtlogicr[   	old_scale	exp_logiclse_valr
   r
   r   _fwd_kernel_stage2  sJ   


r   c                 C   s   |j d |j d }}|j d }	t|	}
|}i }tr!dddd}||f}t| | |||| d| d| d|d|d|df
||
|	ddd| d S )	Nr   r   rc   re   r   r   r   )r   r   r   rg   rh   )rj   r   rk   ri   r   rl   )logitsrK   r   r   rn   	b_seq_lenrp   rs   rt   r   r   r   r   ru   r
   r
   r   _decode_softmax_reducev_fwd-  s8   	


r   r   c                 C   2   t | |||||||	|
|
 t|| ||||| d S N)rv   r   rK   rm   rn   r   r   req_to_tokenr   attn_logitsrp   r2   rq   r   r
   r
   r   decode_attention_fwd_normalW     r   c                 C   r   r   )r   r   r   r
   r
   r   decode_attention_fwd_groupedv  r   r   r   c                 C   sr   ||j d ks	J | j d |j d  }|dkr(t| |||||||||	|
| d S t| |||||||||	|
| d S )Nr   r   rd   )rj   r   r   )rK   rm   rn   r   r   r   r   r   rp   r2   rq   r   r   r
   r
   r   decode_attention_fwd  s>   r   )r   )r   r   )__doc__logging	packagingr   vllm.platformsr   vllm.triton_utilsr   r   is_rocmri   	getLogger__name__loggerparse__version__warningjitr   	constexprr`   rv   r   r   r   r   r   r   r   r
   r
   r
   r   <module>   s   

>  V>6
+
*