o
    پiM)                  (   @   sd  d dl Z d dlZd dlmZ ejdejfddZ	d)ddZejdejdejd	ejfd
dZ	de j
de j
de j
de j
de j
de j
de j
de j
de j
de j
de j
de j
de j
de j
de j
dedede j
de j
de j
f(d d!Zejdejd"ejd	ejfd#d$Zde j
de j
de j
d"edede j
de j
fd%d&Zde j
de j
de j
d"edede j
de j
fd'd(ZdS )*    N
BLOCK_SIZEc                 C   s   t d}t || }t || }t || }	|d }
td|
|D ]3}|t d| }||
k }| | | d }t j||d}t   | | | }t j|||d t   q$|dkrz|d urj| t ||  }n| | | d }t ||	 d S d S Nr      mask)tl
program_idloadrangearangedebug_barrierstore)input_ids_ptrextend_start_loc_ptrextend_seq_lens_ptrtopk_index_ptrselect_index_ptrr   pid	start_locseq_len	new_tokennum_elements_to_shiftoffoffsetsr   read_ptrval	write_ptrlast_pos_ptr r   b/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/speculative/multi_layer_eagle_utils.pyrotate_input_ids_kernel   s(   
	
r    c                 C   s<   |j d }|d urdnd}|f}t| | |||||d | S )Nr   i      )r   )shaper    )	input_idsextend_start_locextend_seq_lens
topk_indexselect_index
batch_sizer   gridr   r   r   rotate_input_ids_triton9   s   
r*   
HIDDEN_DIM	BLOCK_SEQ	BLOCK_HIDc           8      C   s  t d}t || }t || }t || } |d }!| | }"t |
| |! t || |" t d|}#|#|k }$t j| |  |# |$d}%t j||" |# |%|$d t || }&t ||" | |& t ||  t j||  |# |$d}'t j||" d |# |'|$d t ||" tt ||  d d t j||  |# |$d}(t j|	|" d |# |(|$d t || })|| d }*|*dkr||)|  |*|  }+t |+},t |	|" |, |)d | |d  |  }-td||D ]d}.|.t d| }/|/|k }0t|D ]0}1|1|k r,|| |1 |  |/|  }2||"d |1 |  |/|  }3t j|2|0d}4t j|3|4|0d q||- |/|  }5t j|5|0d}6||"|  |/|  }7t j|7|6|0d qd S r   )r   r   r	   r   r   maxr
   )8old_input_ids_ptrold_positions_ptrold_hidden_states_ptrold_out_cache_loc_ptrold_extend_seq_lens_ptrold_extend_start_loc_ptrr   positions_ptrhidden_states_ptrout_cache_loc_ptrr   r   next_token_ids_ptrseq_lens_ptrpadding_lens_ptrreq_pool_indices_ptrreq_to_token_ptrreq_to_hidden_states_pool_ptrstepstride_hidden_seqstride_hidden_dimstride_pool_reqstride_pool_stepstride_pool_dimstride_req_token_0stride_req_token_1r+   r,   r-   r   r   old_extend_len	old_startnew_extend_len	new_startoffs_seqmask_seqold_idspadding_lenold_pos	old_cachereq_idxtoken_idx_colreq_token_ptr_loclast_cache_locpool_vec_offset_baseoff_hoffs_hmask_hi	old_h_ptr	new_h_ptr	chunk_old	pool_ptrspool_valnew_h_start_ptrsr   r   r   assign_new_state_kernelK   s   
%



r_   next_token_idsold_input_idsold_positionsold_hidden_statesold_out_cache_locold_extend_seq_lensold_extend_start_locr#   	positionshidden_statesout_cache_locr%   r$   seq_lenspadding_lensnum_seqsr>   req_pool_indicesreq_to_tokenreq_to_hidden_states_poolc                 C   s   |	j d }d}d}|f}t| |||||||||	|
||| |||||||d|d|d|d|d|d|df|||d dS )zM
    Wrapper function to calculate offsets and launch the Triton kernel.
    r   r!   @   r      )r+   r,   r-   N)r"   r_   stride)r`   ra   rb   rc   rd   re   rf   r#   rg   rh   ri   r%   r$   rj   rk   rl   r>   rm   rn   ro   
hidden_dimr,   r-   r)   r   r   r   assign_new_state_triton   sH   
rt   	pool_sizec                 C   s   t d}t || }t || }|| }t || }|| }t|D ]@}td|
|D ]7}|t d| }||
k }| || | |  ||  }t j||d}|| ||  ||	  }t j|||d q.q&d S )Nr   r   )r   r   r	   r
   r   r   )r6   r;   r=   r   r   r?   r@   rA   rB   rC   r+   ru   r-   r   
extend_lenr   end_locrP   rT   rX   rU   rV   rW   hid_ptrhid_valpool_ptrr   r   r    assign_hidden_states_pool_kernel  s8   
r{   c                 C   sT   |f}t | | ||||| d| d|d|d|d| jd |dd d S )Nr   r   rq   rp   )r+   ru   r-   )r{   rr   r"   )rh   rm   ro   ru   rl   r%   r$   r)   r   r   r    assign_hidden_states_pool_triton3  s    	
r|   c                 C   sb   t |D ]*}|| }|| }	|| }
|
|	 }||d |d d f | || |d d f  qd S N)r
   copy_)rh   rm   ro   ru   rl   r%   r$   reqpool_idxrv   r   rw   r   r   r   assign_hidden_states_pool_torchN  s   	r   r}   )torchtritontriton.languagelanguager   jit	constexprr    r*   r_   Tensorintrt   r{   r|   r   r   r   r   r   <module>   s   %
 !"u	

D-
