o
    پik                     @   s  d Z ddlZddlZddlZddlZddlZddlmZmZm	Z	m
Z
 ddlZddlmZ dZdZejeZeejeddZe ZW d   n1 sOw   Y  ejG d	d
 d
Z									d)dedededede	e de	e dede
eeef  defddZ	d*dede	e defddZd+ddZ dd  Z!ejG d!d" d"Z"d#d$ Z#d%d& Z$e%d'kre& Z'e(e' e') Z*e*j+du rd(e*_+e$e* dS dS ),a  
Batch the same prompt in random batch sizes, and test if the results are consistent across different trials.

Usage:
# Single mode: test determinism with varying batch sizes
python3 -m sglang.test.test_deterministic --n-trials 50 --test-mode single

# Prefix mode: test with shared prefixes
python3 -m sglang.test.test_deterministic --n-start 1 --n-trials 50 --test-mode prefix

# Radix Cache Consistency mode: test radix cache determinism (cached vs uncached prefill)
python3 -m sglang.test.test_deterministic --test-mode radix_cache
    N)AnyDictListOptional)run_profilezTell me about Richard Feynman: z{Generate 1000 random numbers. Go directly into it, don't say Sure and don't say here are numbers. Just start with a number.zlong_prompt.txtrc                   @   s   e Zd ZU dZeed< dZeed< dZeed< dZ	e
ed< d	Zeed
< dZeed< dZe
ed< dZe
ed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< edejfddZedejfdd Zd!S )"	BenchArgs	localhosthosti0u  port   
batch_size        temperature*   sampling_seedd   max_new_tokensfrequency_penaltypresence_penaltyFreturn_logprobstreamprofile   profile_stepsprofile_by_stagesingle	test_mode2   n_trialsn_startparserc                 C   s  | j dttjd | j dttjd | j dttjd | j dttjd | j dttj	d | j dttj
d | j dttjd | j d	ttjd | j d
ttjd | j ddd | j ddd | j dttjg dd | j ddd | j dttjd | j ddd d S )Nz--host)typedefaultz--portz
--n-trialsz	--n-startz--temperaturez--sampling-seedz--max-new-tokensz--frequency-penaltyz--presence-penaltyz--return-logprob
store_true)actionz--streamz--test-mode)r   prefixradix_cachep_vs_d)r"   r#   choicesz	--profilez--profile-stepsz--profile-by-stage)add_argumentstrr   r
   intr   r   r    floatr   r   r   r   r   r   r   )r!    r.   R/home/ubuntu/.local/lib/python3.10/site-packages/sglang/test/test_deterministic.pyadd_cli_args4   s<   zBenchArgs.add_cli_argsargsc                    s0   dd t | D }| di  fdd|D S )Nc                 S   s   g | ]}|j qS r.   )name.0attrr.   r.   r/   
<listcomp>\       z+BenchArgs.from_cli_args.<locals>.<listcomp>c                    s   i | ]}|t  |qS r.   )getattrr3   r1   r.   r/   
<dictcomp>]       z+BenchArgs.from_cli_args.<locals>.<dictcomp>r.   )dataclassesfields)clsr1   attrsr.   r9   r/   from_cli_argsZ   s   zBenchArgs.from_cli_argsN)__name__
__module____qualname__r
   r+   __annotations__r   r,   r   r   r-   r   r   r   r   r   boolr   r   r   r   r   r   r    staticmethodargparseArgumentParserr0   classmethod	Namespacer@   r.   r.   r.   r/   r   !   s*   
 %r   Fr   Tr   r   r   return_full_response	input_idspromptr   extra_paramspick_first_resultc
                 C   s  d| j  d| j }
|d ur1|d u sJ || j|d ur|n| j| j| jd| j| jd|p.i }n"|d u s7J || j|d ur@|n| j| j| jd| j| jd|pQi }| jd ur_| j|d d< |rkt	|
|dd	g|d
 t
j|
 d|| jd}|jdkr| }td|  d S | jr|jddD ] }|d}|r|dr|dkr nt|dd  d}qn| }|	rt|tr|d n|}|r|S |d S )Nhttp://:r   r   r   r   )rL   sampling_paramsr   r   textrS   r   r   rS   r   CPUGPU)url	num_steps
activitiesr   	/generatejsonr      zError: F)decode_unicodezutf-8zdata:zdata: [DONE]   
r   rU   )r
   r   r   r   r   r   r   r   r   r   requestspoststatus_coder]   print
iter_linesdecode
startswithloadsstrip
isinstancelist)r1   r   r   r   rK   rL   rM   r   rN   rO   base_url	json_dataresponseretchunkr.   r.   r/   send_single`   s|   




rr   r   promptsc                 C   s`  t d| j d| j d g }g }t|D ]}tdt|d }|| |||  q|| j	| j
| j| jd| j| jd}| jd urM| j|d d	< t jd| j d| j d
|| jd}	|	 }
|	jdkrnt|
 dS |rdd tt|D }t|D ]}|||  |
|  q|S dd tt|D }t|D ]}|||  |
| d  q|S )NrP   rQ   /flush_cacher   r   rR   rT   rS   r   r[   r\   r^   )ru   ru   c                 S      i | ]}|g qS r.   r.   r4   ir.   r.   r/   r:      r7   zsend_prefix.<locals>.<dictcomp>c                 S   rv   r.   r.   rw   r.   r.   r/   r:      r7   rU   )rb   rc   r
   r   rangerandomrandintlenappendr   r   r   r   r   r   r   r]   rd   re   )r1   r   rs   rK   
batch_datasampled_indices_sampled_indexrn   ro   rp   ret_dictrx   r.   r.   r/   send_prefix   sH   


r   c                 C   s   t | t |krddt |  dt | fS tt| |D ]L\}\}}|d |d kr>dd| d|d  d|d  f  S t|d |d  |kridd| d|d  d|d  d	t|d |d   d
	f  S qdS )z0Compare two logprobs sequences with a tolerance.FzLength mismatch: z vs r   zToken ID mismatch at position : r   zLogprob mismatch at position z (diff: ))TzLogprobs match)r|   	enumeratezipabs)	logprobs1	logprobs2	tolerancerx   lp1lp2r.   r.   r/   compare_logprobs   s   &4r   c                    s,  t   t d  td d| _ddd} fdd}t| dgd	 d
dd td| j d| j d | }t| || j	dd|d}t
|}td| j d| j d t| dd |D ddd|d}t
|}g }	tt||ddD ]\}
\}}t d|
 d t|d |d }|	t| qt|	S )Nz Execute: test p_vs_d batch_size=r   Tr   )logprob_start_lenreturn_text_in_logprobsc                     sf   t tg} t t|  D ] }tdd}t dk rd}ntd|}| t||  q| d   S )Nr   i   g      ?r   )PROMPT_1PROMPT_2ry   r|   rz   	randranger}   LONG_PROMPT)ansrx   endbeginr   r.   r/   _create_prompts  s   z*_test_mode_p_vs_d.<locals>._create_promptsr   @   A   rL   r   rK   rP   rQ   rt   F)rM   r   rK   rO   rN   c                 S   s   g | ]}|d  j qS )io)	token_idsr4   xr.   r.   r/   r6   -  s    z%_test_mode_p_vs_d.<locals>.<listcomp>)rL   r   rK   rO   rN   )strictzCompare sequence z in batch...r   input)re   rz   seedr   rr   rb   rc   r
   r   r   _extract_ids_and_logprobsr   r   TokenIdsAndLogprobscomparer}   r,   )r1   r   query_extra_paramsr   rs   resp_ainfo_aresp_binfo_br   rx   info_a_iteminfo_b_itemcorrectr.   r   r/   _test_mode_p_vs_d  sH   
r   c                   @   s@   e Zd ZU ee ed< ee ed< dZdd Ze	ddd	Z
d
S )r   r   logprobsr   c                 C   s   t | j|j | j|j dS )Nr   r   )r   r   r   )selfotherr.   r.   r/   __add__F  s   

zTokenIdsAndLogprobs.__add__abc                 C   s  dd l }t|jt|jksJ |j|jk}|j|jk}|r#td ntd|jd|j |r=td|jd d  ntd d }tt|j|jD ]\}\}}	||	krZ|} nqLd}
|d urtd| d	t|j  |j|||
  }|j|||
  }d
d t||D }d| d|t|  d}td| }td| ddd |D  td| ddd |D  tddd| ddd |D  nm|j|
 d  }|j|
 d  }dd t||D }tddd |D t|j|
krdt|j dnd tdd d |D t|j|
krdt|j dnd td!d"d |D t|j|
kr7dt|j dnd d#d t|j|jD }|r|r|d$d |D }|d%d |D }|| }||| j	k}t
||}t
|jd }|dkr||d& | }|| }t||}t||}t|||| }td'| d	|  td(|d) td*|d) td+|d) ntd,|  |o|S )-Nr   u   ✅ Token matchu    ❌ Token mismatch: a.token_ids=z b.token_ids=u   ✅ Logprobs match:r`   u   ❌ Logprobs mismatchz!    First divergence at position /c                 S   4   g | ]\}}|d ur|d urt || ntdqS Nnanr   r-   r4   r   yr.   r.   r/   r6   k      "z/TokenIdsAndLogprobs.compare.<locals>.<listcomp>[rQ   ]zA z    A r   c                 S       g | ]}|d ur|dndqS N.10fNoner.   r   r.   r.   r/   r6   s       z    B c                 S   r   r   r.   r   r.   r.   r/   r6   w  r   z    Diff<c                 S      g | ]}|d qS .10er.   r   r.   r.   r/   r6   {      c                 S   r   r   r   r   r.   r.   r/   r6     r   z    A:    ...  c                 S   r   r   r.   r   r.   r.   r/   r6     r   (z total) z    B:    ...  c                 S   r   r   r.   r   r.   r.   r/   r6     r   z    Diff: ...  c                 S   r   r   r.   r   r.   r.   r/   r6     r   c                 S   s(   g | ]\}}|d ur|d ur||fqS Nr.   )r4   lp_alp_br.   r.   r/   r6     s
    c                 S   s   g | ]\}}|qS r.   r.   )r4   lpr   r.   r.   r/   r6     r   c                 S   s   g | ]\}}|qS r.   r.   )r4   r   r   r.   r.   r/   r6     r   r   z    Divergent tokens: z    KL(A||B) mean (divergent): r   z    KL(A||B) max  (divergent): z,    Mean absolute logprob diff (divergent): z    Divergent tokens: 0/)numpyr|   r   r   re   r   r   arrayr   DIVERGENCE_EPSr,   count_nonzeroshapeexpr-   meanmax)r>   r   r   nptoken_matchlogprobs_match	first_dividxlalbn_showa_showb_show	diff_show	pos_rangelabel_widthvalid_pairs
logprobs_a
logprobs_blogrdiverge_maskdiverge_counttotal_countkl_per_tokenkl_divergentkl_meankl_maxmean_abs_logrr.   r.   r/   r   L  s   


"$$	


zTokenIdsAndLogprobs.compareN)r   r   r   r   )rA   rB   rC   r   r,   rD   r-   r   r   rI   r   r.   r.   r.   r/   r   >  s   
 r   c                    s6   dd fdd t | ts| g}  fdd| D S )Nc                 S   sF   g g }}| d | D ]}|\}}}| | | | qt||dS )N	meta_infor   )r}   r   )ro   r2   r   r   itemlogprobtoken_idrU   r.   r.   r/   _extract_part  s   


z0_extract_ids_and_logprobs.<locals>._extract_partc                    s&    | d} | d}t |||| dS )Ninput_token_logprobsoutput_token_logprobs)r   outputr   )dict)ro   r   r   )r   r.   r/   _extract_one_response  s   

z8_extract_ids_and_logprobs.<locals>._extract_one_responsec                    s   g | ]} |qS r.   r.   r   )r   r.   r/   r6     r   z-_extract_ids_and_logprobs.<locals>.<listcomp>)rk   rl   )	responsesr.   )r   r   r/   r     s
   
r   c           3         sX  | j dkrNg }td| jd D ]'}|}t| | jtg| d}|dd}td| d| d|  || qtd	t	| d
t	t
|  t	t
|gS | j dkrg d t	 }dd tdD } fddtdD }| jr|dd tdD }t| j| j| j D ]]}|}t| ||| jd}	d| d| d}
t|D ]}|
d |  dt	|	|  d7 }
qt|
 t|D ]&}| jr|| |	|  || dd |	| D  q|| |	|  qqt|D ]}td| d |  dt	||  d
t	t
||   qg }t|D ]}|t	t
||  q| jrtdd  td td g }t|D ]}td| d |  d || }t	|d k rQq5|d! }d"}g }t|dd  dd#D ]2\}}|d$ d% }|d$ d% }t||\}}
|std&|d  d|
  ||d |
f d'}qc|rtd(t	| d) |d q5td*t	| d+t	| d, |d! q5tdd  td-d. |D rtd/ |S td0 |S | j d1krd"| _td2 td3 t| dgd4 d5d"d6}td7| j d8| j d9}d:}td;| d< td= d>d t|D }td?t	| d@ tdA|  d }tdB| dCt	| dD t| ||d"d6}|dE }|dF }|d$ d% }|dG } |dG d! }!tdHt	| dI tdJ| dK tdLt	||d dG   dM ||d dG  }"tdNt	| dOt	|" dI tdP|"  t| |"dd"d6}#|#d$ d% }$|$d! }%|%d! }&|%d }'tdQ tdR|'  tdS|&dT tdU td7| j d8| j d9}tdV tdP|"  t| |"dd"d6}(|(d$ d% })|)d! }*|*d! }+|*d },tdW tdR|,  tdS|+dT tdd  tdX td | |'k}-|!|&k}.tdY|  dZ|!dT td[|' dZ|&dT td\|-rJd]nd^  td_|.rVd]nd^  |.slt|!|& }/td`|/da tdb tdd  tdc td |'|,k}0|&|+k}1tdd|' dZ|&dT tde|, dZ|+dT td\|0rd]nd^  |0stdf|'  tdg|,  td_|1rd]nd^  |1stdf|&dT tdg|+dT t|&|+ }/tdh|/da tdi tdd  |0r|1rtdj dgS tdk d!gS | j dlkr$g }2td| jd D ]}|2t| |dm7 }2q|2S tdn| j  )oNr   r   )rM   ra    zTrial z with batch size r   zTotal samples: z, Unique samples: r&   )r   i  i   i  c                 S   rv   r.   r.   rw   r.   r.   r/   r:     r7   z&test_deterministic.<locals>.<dictcomp>   c                    s   g | ]
}t d  |  qS r   )r   rw   
len_prefixr.   r/   r6     s    z&test_deterministic.<locals>.<listcomp>c                 S   rv   r.   r.   rw   r.   r.   r/   r:     r7   )rK   zTesting Trial ,z # prefix length c                 S   s   g | ]}|d  qS )rU   r.   )r4   respr.   r.   r/   r6     r   zPrompt z with prefix length z: total samples: z<============================================================z&Logprobs Comparison Across Batch Sizesz
Prompt z (prefix length z):   r   T)startr   r   u     ✗ Sample Fu
     ✓ All z  samples have identical logprobsu     ✗ Found z mismatches out of z samplesc                 s   s    | ]}|d kV  qdS )r   Nr.   )r4   r   r.   r.   r/   	<genexpr>)  s    z%test_deterministic.<locals>.<genexpr>uB   ✓✓✓ Logprobs are identical across all batch sizes! ✓✓✓u<   ✗✗✗ Some logprobs differ across batch sizes! ✗✗✗r'   z'
=== Prefill Cache Consistency Test ===zRThis test verifies prefill request produces consistent logprobs w/ and w/o cache.
r   r   r   rP   rQ   rt   r   zStep 1: Generating random z token IDs...r   c                 S   s   g | ]}t d dqS )r   iP  )rz   r{   )r4   r   r.   r.   r/   r6   G  r;   u
   ✓ Using z initial tokensz  Initial token IDs: z
Step 2: Generating z tokens from z token prefix...rU   
output_idsru   u   ✓ Generated z tokensz  Output text: ""z&
Step 3: Generating with radix cache (z8 tokens prefill, should hit cache based on page size)...z
  Prefix: z initial + 1 generated = zUsing Prompt: u   ✓ Generated with cache:z  Token ID: z  Logprob:  r   z
Step 4: Flushing cache...zH
Step 5: Generating without cache (same 164 tokens prefill, no cache)...u   ✓ Generated without cache:zBComparison 1: Decode (Request 1) vs Prefill with Cache (Request 2)z(  Decode token (Request 1):          ID=z
, logprob=z)  Prefill w/ cache token (Request 2): ID=z  Token ID match: u   ✓ YESu   ✗ NOz  Logprob match:  z  Logprob difference: r   zC  Note: We expect these to be DIFFERENT (decode vs prefill kernels)zHComparison 2: Cached Prefill (Request 2) vs Uncached Prefill (Request 3)z)  Cached prefill token (Request 2):   ID=z)  Uncached prefill token (Request 3): ID=z    Cached:   z    Uncached: z    Difference: z>  Note: We expect these to be IDENTICAL (both prefill kernels)u<   ✓✓✓ TEST PASSED - Radix cache is consistent! ✓✓✓uI   ✗✗✗ TEST FAILED - Radix cache produces different results! ✗✗✗r(   r   zInvalid test mode: )r   ry   r   rr   r   r   replacere   r}   r|   setr   r    r   extendr   r   allrb   rc   r
   r   rz   r   r   r   
ValueError)3r1   textsrx   r   rU   num_promptsoutputsrs   full_responsesr   msgresultslogprob_results
prompt_idxr   	reference	all_match
mismatchesjr  ref_logprobsresp_logprobsmatchwarmup_responseflush_response
prefix_leninitial_token_idsnum_tokens_to_generatefirst_responsefirst_output_textfirst_output_token_idsfirst_output_logprobsexpected_token_idexpected_logprobprefix_token_idscached_responsecached_logprobscached_token_datacached_logprobcached_token_iduncached_responseuncached_logprobsuncached_token_datauncached_logprobuncached_token_iddecode_vs_prefill_token_matchdecode_vs_prefill_logprob_matchdiffr   logprob_matchr   r.   r   r/   test_deterministic  s  
 
$	2
r7  __main__r   )	Fr   FFNNNNT)F)r   ),__doc__rG   r<   r]   osrz   typingr   r   r   r   rb   sglang.profilerr   r   r   pathdirname__file__dirpathopenjoinfreadr   	dataclassr   rE   r,   r+   rr   r   r   r   r   r   r7  rA   rH   r!   r0   
parse_argsr1   r   r.   r.   r.   r/   <module>   s    
@	

_

3;z  

