o
    پi(                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlmZmZmZm	Z	 ej
ddZ	d#ddZd$dd	Zd
d ZdedefddZ						 				d%dededededededededededed edefd!d"ZdS )&    N)RequestFuncOutputget_tokenizerremove_prefixsample_random_requestsi@ )totalc                    s6  t jtd4 I dH }i }d}g }d}t }|}	t }
z|j|| |d4 I dH }|jdkrd}d}|j2 zf3 dH W }|	 }|sDq7t
|dd	}t | }|d
krWq7t|}|dr|d }|dd}t }|dkrt | }||
_|dpi dd}|dpi dd}n|
j||	  |}	q76 ||
_||
_d|
_||
_||
_||
_t|
jd |
_n	|jpd|
_d|
_W d  I dH  n1 I dH sw   Y  W n! ty } zd|
_t||
_td|  W Y d}~nd}~ww W d  I dH  n1 I dH sw   Y  |r|d |
S )zSend a streaming request to the server and collect cache metrics.

    Returns a RequestFuncOutput with additional cached_tokens and output_ids attributes.
    timeoutN         )urljsonheaders   r   zutf-8zdata: z[DONE]
output_idstext	meta_infoprompt_tokenscached_tokensT   FzRequest failed: ) aiohttpClientSessionAIOHTTP_TIMEOUTtimeperf_counterr   poststatuscontentstripr   decoder   loadsgetttftitlappendgenerated_textr   successlatency
prompt_lenr   lengenerated_lenreasonerror	Exceptionstrprintupdate)payloadr   pbarsessionr   r$   all_output_idsr!   stmost_recent_timestampoutputresponser   r   chunk_byteschunkr&   data	timestampe r=   R/home/ubuntu/.local/lib/python3.10/site-packages/sglang/test/kits/cache_hit_kit.pyasync_request_sglang_generate   s|   	


!(0
*?
r?   r	   c                 C   s    | d|dddddi|dddS )Nr
   T)temperaturemax_new_tokens
ignore_eosinclude_usageF)	input_idssampling_paramsstreamstream_options	lora_pathreturn_logproblogprob_start_lenr=   )rE   
output_lenrI   r=   r=   r>   gen_payload_   s   rM   c                    s<   t |fdd  fdd| D }t j| I dH S )z=Send a batch of payloads concurrently with concurrency limit.c              	      sN    4 I d H  t | I d H W  d   I d H  S 1 I d H s w   Y  d S N)r?   )r0   )	semaphorer   r=   r>   	_send_onew   s   0z_send_round.<locals>._send_onec                    s   g | ]	}t  |qS r=   )asynciocreate_task).0p)rP   r=   r>   
<listcomp>{   s    z_send_round.<locals>.<listcomp>N)rQ   	Semaphoregather)payloadsr   max_paralleltasksr=   )rP   rO   r   r>   _send_roundo   s
   
r[   base_urlreturnc                 C   sH   zt j|  ddd}|  | }|ddW S  ty#   Y dS w )z/Query server for page_size used by radix cache.z/get_server_info
   r   	page_sizer   )requestsr    raise_for_statusr   r,   )r\   respinfor=   r=   r>   _get_page_size   s   rd                r   @   
model_pathnum_clients
num_roundsrequest_lengthoutput_lengthmiss_tolerancesub_question_input_lengthrI   dataset_pathrY   seedc           ,         sf  ddl }ddl}|| |j | |  d}t| }t|  d td |dkr/|n|}t|}t	||d||	dd}d	d
 |D }t	||t
|d d d||	dd}dd
 |D }dd t|D }dg| }dd
 |D }d}t|D ]} fdd
|D }tt|||
}t|D ]\}}|jsJ d| d| d|j || d |j || d |j || d |j |dkrd}n||  | } | | | }d| d| d|j d| d||  d d| d}!t|! |j|ksJ |j||< || |j ||d k r$|| ||  |d7 }qq{d}"d}#i i d}$t|D ]b}%||% }&t|&d }'t|&d }(|'dkrN|(|' nd})|&d rat|&d t|&d  nd}*|)|*|'|(t|&d d|$d d |% < |"|'7 }"|#|(7 }#td!|% d"|)d#d$|*d#d%|( d&|' d' q3|"dkr|#|" nd}+|+|"|#d(|$d)< td*|+d# |$S )+aJ  Run a multi-turn workload and verify cache hit rate.

    Sends requests in round-barrier mode: all clients complete round i
    before round i+1 starts, ensuring deterministic cache state.

    The expected cache hit rate is self-computed from the workload structure:
    - Round 0: expected cached_tokens = 0 (cold start after flush)
    - Round r (r >= 1): each client's prefix from round r-1 should be cached,
      minus up to previous round's (prompt_len + decoding output - miss_tolerance) // page * page.

    Returns metrics dict with per-round and overall cache_hit_rate.
    r   Nz	/generatez/flush_cacher   g      ?F)	input_lenrL   num_promptsrange_ratio	tokenizerrq   return_textc                 S      g | ]}t |jqS r=   listpromptrS   rr=   r=   r>   rU          z0run_multiturn_cache_hit_test.<locals>.<listcomp>c                 S   rx   r=   ry   r|   r=   r=   r>   rU      r~   c                 S   s   i | ]	}|g g g d qS ))r'   r   r!   r=   )rS   ir=   r=   r>   
<dictcomp>   s    z0run_multiturn_cache_hit_test.<locals>.<dictcomp>c                 S   s   g | ]}t |qS r=   )rz   )rS   idsr=   r=   r>   rU      s    c                    s   g | ]}t | qS r=   )rM   )rS   hrI   rn   r=   r>   rU      s    zRound z	, client z	 failed: r'   r   r!   z: cached_tokens=z, expected>=z (prev_prompt=z	, output=z, page_size=))roundsoverallr
   )cache_hit_rateaverage_ttfttotal_prompt_tokenstotal_cached_tokensrequest_countr   round_z  Round z: cache_hit_rate=z.4fz, avg_ttft=z
s, cached=/z tokens)r   r   r   r   z  Overall cache_hit_rate=)randomnumpyrr   rd   r`   r   r   sleepr   r   maxrangerQ   runr[   	enumerater%   r+   r#   r'   r   r!   r.   extendr   sumr(   ),r\   rj   rk   rl   rm   rn   ro   rp   rI   rq   rY   rr   r   npgenerate_urlr_   effective_sub_lenrv   initial_inputsinitial_token_idssub_question_inputssub_question_token_idsround_metricsprev_prompt_lens	historiessub_idx	round_numrX   	responsesr   rb   expected_cached	cacheablemsgtotal_prompttotal_cachedresultr}   rmr_promptr_cached
r_hit_rate
r_avg_ttftoverall_hit_rater=   r   r>   run_multiturn_cache_hit_test   s   



	
"
(
&

r   rN   )r	   )
re   rf   rg   rh   r   r   r	   r	   ri   r   )rQ   r   r   r   r`   sglang.bench_servingr   r   r   r   ClientTimeoutr   r?   rM   r[   r-   intrd   dictr   r=   r=   r=   r>   <module>   sb    

M	
