o
    پi2!                     @  s   d dl mZ d dlZd dlZd dlmZmZmZmZ d dl	Z	d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ erPd d
lmZ d dlmZ eeZejG dd dZd%ddZd&ddZd'dd Z d(d)d#d$Z!dS )*    )annotationsN)TYPE_CHECKINGListOptionalUnion)ExpertDistributionMetricsLogitsProcessorOutput)FutureIndices)Req)PPProxyTensors)
ServerArgs)GenerationBatchResult)EagleDraftInputc                   @  s   e Zd ZU dZded< dZded< dZded< dZd	ed
< dZded< dZ	ded< dZ
ded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< d%dd Zed&d#d$ZdS )'r   NzOptional[LogitsProcessorOutput]logits_outputzOptional[PPProxyTensors]pp_hidden_states_proxy_tensorsz1Optional[Union[torch.Tensor, List[torch.Tensor]]]next_token_idsr   intnum_accepted_tokenszOptional[List[int]]accept_length_per_req_cpuFboolcan_run_cuda_graphextend_input_len_per_req extend_logprob_start_len_per_reqzOptional[torch.cuda.Event]	copy_donezOptional[callable]delay_sample_funczOptional[FutureIndices]future_indiceszOptional[torch.Tensor]accept_lenszOptional[EagleDraftInput]next_draft_inputz#Optional[ExpertDistributionMetrics]expert_distribution_metricsreturn_logprobc                 C  s   |r$| j jdur| j jjddd| j _| j jdur$| j jjddd| j _| j jdur5| j jjddd| j _| jjddd| _| jdurL| jjddd| _| j }durW|  | j	
  dS )zCopy tensors to CPU in overlap scheduling.
        Only the tensors which are needed for processing results are copied,
        e.g., next_token_ids, logits outputs
        NcpuT)non_blocking)r   next_token_logprobstoinput_token_logprobshidden_statesr   r   r   copy_to_cpur   record)selfr    x r+   M/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/managers/utils.pyr'   4   s"   

z!GenerationBatchResult.copy_to_cpunext_pp_outputsr   c              	   C  s.   |j }| |d |d |dd |dd |dS )Nr   r   r   )r   r   r   r   r   r   )tensorsget)clsr   r-   r   
proxy_dictr+   r+   r,   from_pp_proxyP   s   
z#GenerationBatchResult.from_pp_proxy)r    r   )r-   r   )__name__
__module____qualname__r   __annotations__r   r   r   r   r   r   r   r   r   r   r   r   r   r'   classmethodr2   r+   r+   r+   r,   r      s$   
 
r   reqr   max_req_input_lenr   allow_auto_truncater   returnOptional[str]c                 C  sd   t | j|kr0|r"tdt | jd|d | jd| | _dS dt | j d| d}|S dS )a:  Validate and potentially truncate input length.

    Args:
        req: The request containing input_ids to validate
        max_req_input_len: Maximum allowed input length
        allow_auto_truncate: Whether to truncate long inputs

    Returns:
        Error message if validation fails, None if successful
    zuRequest length is longer than the KV cache pool size or the max context length. Truncated. len(req.origin_input_ids)=z, max_req_input_len=.NzInput length (z- tokens) exceeds the maximum allowed length (z> tokens). Use a shorter input or enable --allow-auto-truncate.)lenorigin_input_idsloggerwarning)r8   r9   r:   	error_msgr+   r+   r,   validate_input_lengthb   s"   rC   resultdictc                 C  s\   | j }|d us	J | j| j| j j| j j| j j| j j| j j| j j| j j	| j j
| j j| j jdS )N)r   r   r#   next_token_top_logprobs_valnext_token_top_logprobs_idx!next_token_token_ids_logprobs_val!next_token_token_ids_logprobs_idxr%   input_top_logprobs_valinput_top_logprobs_idxinput_token_ids_logprobs_valinput_token_ids_logprobs_idx)r   r   r   r#   rF   rG   rH   rI   r%   rJ   rK   rL   rM   )rD   r   r+   r+   r,   get_logprob_dict_from_result   s   rN   r-   r   2tuple[LogitsProcessorOutput, list[int], list[int]]c                 C  sb   t d d | d | d | d | d | d | d | d | d | d	 | d
 d}| d }| d }|||fS )Nr#   rF   rG   rH   rI   r%   rJ   rK   rL   rM   )next_token_logitsr&   r#   rF   rG   rH   rI   r%   rJ   rK   rL   rM   r   r   r   )r-   r   r   r   r+   r+   r,   get_logprob_from_pp_outputs   s.   
rQ   server_argsOptional[ServerArgs]c                 C  sn   | d u rddl m} | } | jd u rdS | jpd}| jpd}| j}| j}|dks,|dkr3t|| |S td)Nr   )get_global_server_args   zLget_alloc_len_per_decode not implemented for page_size > 1 and spec_topk > 1)	sglang.srt.server_argsrT   speculative_algorithmspeculative_num_stepsspeculative_eagle_topkspeculative_num_draft_tokens	page_sizemaxNotImplementedError)rR   rT   
spec_steps	spec_topkspec_tokensr[   r+   r+   r,   get_alloc_len_per_decode   s   


ra   )r8   r   r9   r   r:   r   r;   r<   )rD   r   r;   rE   )r-   r   r;   rO   )N)rR   rS   r;   r   )"
__future__r   dataclassesloggingtypingr   r   r   r   torch#sglang.srt.eplb.expert_distributionr   "sglang.srt.layers.logits_processorr	   !sglang.srt.managers.overlap_utilsr
   "sglang.srt.managers.schedule_batchr   ,sglang.srt.model_executor.forward_batch_infor   rV   r   sglang.srt.managers.schedulerr   !sglang.srt.speculative.eagle_infor   	getLoggerr3   r@   	dataclassrC   rN   rQ   ra   r+   r+   r+   r,   <module>   s*    

I
!
