o
    پic                     @  s~  d dl mZ d dlZd dlZd dlZd dlmZ d dlmZm	Z	m
Z
 d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlm Z m!Z! d dl"m#Z#m$Z$m%Z%m&Z& e# Z'e$ Z(e% Z)erd dl*m+Z+ e'rd dl,m-Z- ne(rd dl,m-Z- nd dl.m-Z- e/e0Z1ej23 Z4ej53 Z6dZ7e'Z8dwdxddZ9ej:dyddZ;ej:dzddZ<d{d&d'Z=ej:d|d-d.Z>ej:d}d2d3Z?ej:d~d6d7Z@ej:dd:d;ZAejBd<e)d=ddBdCZCej:ddDdEZDejBd<e)d=ddGdHZEejBd<e)d=ddNdOZFe4e6fddTdUZG	dwdd^d_ZHddgdhZIddkdlZJeddodpZKddsdtZLddudvZMdS )    )annotationsN)contextmanager)TYPE_CHECKINGListOptional)snapshot_download)BaseGrammarObject)GroupCoordinatorpatch_tensor_parallel_group)envs)LogitsProcessorOutput)Reqget_last_loc)
ServerArgsget_global_server_args)is_cudais_hipis_npunext_power_of_2)EagleVerifyInput)	fast_topk   server_argsOptional[ServerArgs]returnboolc                 C  s   | d u rt  } | j S N)r   enable_multi_layer_eagle)r    r   U/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/speculative/spec_utils.pyspec_need_hidden_states5   s   r!   bs_uppertl.constexprc                 C  s   t jdd}t d|}t || }t || }	t t j|| ||k dd}
||
 }||	k }t || ||	 | | |
|	d 7 }
t | |
 }t || | d S )Nr   axismaskotherr   tl
program_idarangeloadsumstore)verified_idseq_lensaccept_lens	positionsnew_verified_idr"   pidoffsets
seq_lengthaccept_lengthaccept_len_cumsumpositions_ptrr'   verified_id_datar   r   r    $create_extend_after_decode_spec_info=   s   	r<   pool_lenc                 C  s  d}t jdd}t || }	t || }
|t | | |  }t d|}t j|| ||k dd}t j|| ||k dd}t j|| dd}|| }t d||	 }t d|}t |
|	 |}t|D ]!}||
k }t j|| |d}t j|| ||d ||7 }||7 }qfd S )N    r   r$   r&   r'   )r*   r+   r-   r,   r.   cdivranger/   )req_pool_indicesreq_to_tokenstart_offset
end_offsetout_cache_locr=   r"   
BLOCK_SIZEr5   kv_startkv_end
token_poollength_offsetstartend
out_offsetout_cache_ptrsave_offsetload_offsetnum_loop_r'   datar   r   r    assign_req_to_token_poolW   s(   

rU   rB   torch.TensorrC   rD   rE   rF   
batch_sizeintc              	   C  s(   t |f | |||||jd t| d S Nr   )rU   shaper   )rB   rC   rD   rE   rF   rW   r   r   r    assign_req_to_token_pool_funcz   s   r[   duplicate_cache_lentopkspeculative_num_steps	page_size
iter_upperc           ,      C  s  d}t jdd}|dks|dkr|| }||| |  }nt d|}t || }t t j|| ||k d}|| }t || }|t | | |
  }t ||}t|D ]%}t d|||  }||k }t j|| |d}t j|| | ||d qX|dkrJ|dkrL|	dkrNt || }|| }t d|}||k }t || } || | }!t j|!| |d}"t || }#td|D ]A}$t j||d |#|   |$d |  | |"|d t j|!|$|  |  | |d}%t j||d |#|   |$d |  | |%|d qt d|}&t|D ]?}$|&|| k }'|&|k}(|'|(@ })t j|!|$|  |  |& |)dd}*|| | }+t j||+ |$|  | |& |*|)d qd S d S d S d S )N   r   r$   r   r?   r&   )r*   r+   r,   r-   r.   r@   rA   r/   ),rB   rC   r1   extend_lensnum_new_pages_per_topkrF   source_cache_loctarget_cache_loclast_page_lens_cumsumr\   r=   r]   r^   r_   r"   r`   rG   r5   copy_lenrO   	bs_offsetcum_copy_lenrH   rJ   rR   icopy_offsetr'   rT   
prefix_lenlast_page_lenr6   num_new_pages_per_topk_prefix_basesrc_indiceslast_page_lens_cumsum_topk_idtgt_indicesiter_offset
mask_upper
mask_lowercombined_maskindices
ptr_offsetr   r   r    assign_draft_cache_locs   s   



*rz   kv_indices_stridekv_indptr_stridenum_tokens_upperc           )      C  sh  d}t jdd}t jdd}t jdd}t jdd}t jdd}t jdd}||| 7 }||| 7 }|d7 }t d|	}t j|| ||k dd}t || }t |}|| || |  |||   }|| }|t | | |  }t d|}t ||}t|D ]}||k }t j|| |d}t j|| ||d ||7 }qt d|
}|dks|dkrt j|| ||  t d|
 ||k d} n-|}!|!| }"|"| | d | }#|| | }$|$||# |  |" }%t j||% | ||k d} t j|| | | ||k d t d|}&|| | }'|'dkr|| }'t j||& |&|'k dd}t |}(t ||' |(|'|   d S )Nra   r   r$   r      r&   r?   )	r*   r+   num_programsr,   r-   r.   r@   rA   r/   ))rB   rC   paged_kernel_lens
kv_indices	kv_indptrr3   r=   r{   r|   r"   r`   r}   r_   rG   itersbidrr   	num_stepsnum_seqsr]   rQ   r1   seq_lencum_seq_len	kv_offsetkv_ptrtoken_pool_ptrrR   rS   r'   rT   extend_offsetextend_datarl   rm   rc   ro   rL   rh   zidbaser   r   r     generate_draft_decode_kv_indices   sd   
 



r   num_draft_tokensrG   c                 C  s   t d|}t jdd}t | | }||k }t j|||  | |dd}	t |	}
||
 }|| d | | | }tt|dt|| |D ]}t |||  | d qGd S )Nr   r$   r&   r   F)	r*   r,   r+   r-   r.   rA   maxminr/   )r1   
evict_maskr_   r   rG   t_ranger   r   io_maskmask_row	num_trues	num_falserL   rj   r   r   r    align_evict_mask_to_page_sizeK  s   
r   num_verify_tokensnum_verify_tokens_upperc                 C  s  t jdd}t d|}	t d|}
t j||
 |
|k d}t || }t || d }t j|||  |	 |	|k d}t j| | |	 ||	|k d t j||
 |
|k d}t || }|| }t |}|}t j|||  | |	 |	|k d}t j|| |	 ||	|k d d S )Nr   r$   r?   r   r)   )tgt_cache_locto_free_slotsr8   to_free_num_slotsrF   r   r   r"   r   offsetrh   accept_len_alltgt_cache_loc_startrg   out_cache_loc_rowto_free_num_slots_allto_free_num_slots_curout_cache_loc_startto_free_slots_startr   r   r    get_target_cache_locd  s8   



r   T)dynamicdisabler1   accept_indexr8   draft_token_numc                 C  sP   || }t |}| | }t | | d | d | | |}	||	 }
|||
fS rY   )torch
empty_likeminimum)r1   rF   r   r8   r   r_   src_cache_locr   extended_lenkeep_lenr   r   r   r    get_src_tgt_cache_loc  s   	

r   c                 C  s   t d}t d|}t j|| ||k d}t || }	t j|| ||k d}
t |
}t || }t d|}t j||	 | ||k d}t j| | | |||k d d S )Nr   r?   r)   )rF   r   r8   accept_length_filterr"   r   r   rh   accept_length_all	old_startaccept_length_filter_all	new_startrg   rk   valuer   r   r     filter_finished_cache_loc_kernel  s    
	

r   unfinished_index_devicec                 C  s,   t | }| | d ||< || d  |S rY   )r   
zeros_likeadd_)r8   r   r1   r   r   r   r    create_accept_length_filter  s
   

r   rj   topk_p
topk_indexhidden_statesscoresc                 C  s4  | dkr1|  }|d ur|j|dd}|}|d|tjd|tj|jdd|jd df}nct	|d|
d||}t|j dd|dd\}	}
|	}|
d|d }tj||
dd  }|jd dkr|
  | tjd|jd ||jd	| }||d d f }|||
|d | d  |  f}||||fS )
Nr   )dimr   dtypedevicer~   )	start_dim)indexr   )stepr   )flattenrepeat_interleave	unsqueezer   r,   longr   repeatrZ   mulreshaper   gather)rj   r   r   r   r   r]   	input_ids	tree_infoexpand_scores	topk_cs_ptopk_cs_indexselected_input_indexr   r   r    select_top_k_tokens  s@   		
r   simulate_acc_lenfloatsimulate_acc_methodstrc                 C  s^  |dksJ |dkr&t j|dddd}t j|d|d d}t|  }nO|d	krntdt|d |}t|d }||d k rD|d n|}	||	krM|}n(|| }
d|
 }t j||
gdd
}t j	|dd}|dkrk|n|	}nt
dt | d d df dd}t j||d fdt jdd}|t j|| jd
 |d d d |f< ||d  |d |S )Ng        multinomialg      ?)r   cpu)meanstdsizer   r   )r   r   zmatch-expected)r   )num_samplesr   zInvalid simulate_acc_method: r   cudar   d   )r   normalclamprX   rounditemr   r   tensorr   
ValueErrorSIMULATE_ACC_METHODviewfullint32r,   r   fill_)r   predictr8   bs
spec_stepsr   r   simulated_valueslowerupperweight_upperweight_lowerprobssampled_indexaccept_indx_first_colsim_accept_indexr   r   r    generate_simulated_accept_index  s@   	
r   retrieve_next_tokenretrieve_next_siblingdraft_tokensgrammarr   allocate_token_bitmask
vocab_sizeOptional[int]c                   sJ   | j |j   krj ksJ  J d fddd	| |d
 dS )zW
    Traverse the tree constructed by the draft model to generate the logits mask.
    currrX   r   rV   r   
parent_posc                   s   | dkrd}n | }|  }r|krd}n||d  d|d > @ dk}|rU| dkr3 |    sL |  ||  dkrL||  |||  | dkrUd ||  dkrf||  ||| d S d S )Nr   TFr>   r   r   )accept_tokenis_terminatedfill_vocab_maskrollback)r  r   r   r  acceptedparent_bitmaskcurr_token_idr  dfsr   r  r  r   r    r  I  s>   
ztraverse_tree.<locals>.dfsr   r   N)r  rX   r   rV   r   rV   r  rX   )rZ   )r   r   r   r  r  r  r   r  r    traverse_tree:  s   "2r  reqs	List[Req]verify_inputr   retrieve_next_token_cpuretrieve_next_sibling_cpudraft_tokens_cpuc              
   C  s   |j d }d}t| |j d ksJ d}t| D ]P\}	}
|
jdurh|du r0|
jj|| dd}|
j}t }t||	 ||	 ||	 |
j||	| |	d |  |d t | }|t	krht
d| d	|
j  q||_|S )
a  
    Generate the logit mask for structured output.
    Draft model's token can be either valid or invalid with respect to the grammar.
    We need to perform DFS to
    1. figure out which tokens are accepted by the grammar.
    2. if so, what is the corresponding logit mask.
    r   Nr   r   )r  rW   r   r   )r  zBit mask generation took z seconds with grammar: )rZ   len	enumerater  allocate_vocab_masknumeltimeperf_counterr  TREE_TRAVERSE_TIME_THRESHOLDloggerwarning)r  r  r  r  r  r  r   r  r  rj   reqstree_traverse_timer   r   r    generate_token_bitmask~  sD   


r#  token_map_path	List[int]c                 C  sX   t j| stt j| ddgd}t j|t j| } tj| dd}tj	|tj
dS )Nz*.binz*.safetensors)ignore_patternsT)weights_only)r   )ospathexistsr   dirnamejoinbasenamer   r-   r   int64)r$  	cache_dirhot_token_idr   r   r    load_token_map  s   
r1  tp_groupr	   c                 c  s6    t |  d V  W d    d S 1 sw   Y  d S r   )r
   )r2  r   r   r    draft_tp_context  s   
"r3  logits_outputr   c                 C  s,   | j }tt|rtd tdd S )Nz3Detected errors during sampling! NaN in the logits.)next_token_logitsr   anyisnanr  errorr   )r4  logitsr   r   r    
detect_nan  s
   
r:  c                 C  s\   |}|| }|| | d | }|| | |||   }|| }	t | ||}
|||
||	|fS rY   r   )rC   rB   r1   r^   r]   r_   prefix_lenslast_page_lensrc   rb   last_locr   r   r    (get_last_loc_large_page_size_large_top_k  s*   r>  r   )r   r   r   r   )r"   r#   )r=   r#   r"   r#   )rB   rV   rC   rV   rD   rV   rE   rV   rF   rV   rW   rX   )r\   r#   r=   r#   r]   r#   r^   r#   r_   r#   r"   r#   r`   r#   )r=   r#   r{   r#   r|   r#   r"   r#   r`   r#   r}   r#   r_   r#   )r_   r#   r   r#   rG   r#   )r   r#   r   r#   r"   r#   )r1   rV   rF   rV   r   rV   r8   rV   r   rX   r_   rX   )r"   r#   r   r#   )r8   rV   r   rV   r1   rV   )rj   rX   r   rV   r   rV   r   rV   r   rV   r]   rX   )r   r   r   r   )r   rV   r   rV   r   rV   r  r   r  rV   r  r  )r  r  r  r   r  rV   r  rV   r  rV   r  rX   )r$  r   r   r%  )r2  r	   )r4  r   )rC   rV   rB   rV   r1   rV   r^   rX   r]   rX   r_   rX   )N
__future__r   loggingr(  r  
contextlibr   typingr   r   r   r   tritontriton.languagelanguager*   huggingface_hubr   +sglang.srt.constrained.base_grammar_backendr   %sglang.srt.distributed.parallel_stater	   r
   sglang.srt.environr   "sglang.srt.layers.logits_processorr   "sglang.srt.managers.schedule_batchr   sglang.srt.mem_cache.commonr   sglang.srt.server_argsr   r   sglang.srt.utilsr   r   r   r   _is_cuda_is_hip_is_npu!sglang.srt.speculative.eagle_infor   
sgl_kernelr   sglang.srt.utils.common	getLogger__name__r  SGLANG_SIMULATE_ACC_LENgetSIMULATE_ACC_LENSGLANG_SIMULATE_ACC_METHODr   r  TREE_SPEC_KERNEL_AVAILABLEr!   jitr<   rU   r[   rz   r   r   r   compiler   r   r   r   r   r  r#  r1  r3  r:  r>  r   r   r   r    <module>   s    



"lP-8;
D
4

