o
    i^                     @  s  d Z ddlmZ ddlZddlZddlZddlZddlZddlZddl	Z	ddl
Z
ddlZddlmZmZ ddlmZmZmZmZ ejejejeZejedZejedejedefD ]Zee	jvrre	jde qdd	ejd
< ddddZg dZeG dd dZG dd dZ eG dd dZ!eG dd dZ"dFd(d)Z#dGd0d1Z$dHd5d6Z%dId8d9Z&dJd?d@Z'dKdAdBZ(dLdCdDZ)e*dEkre)  dS dS )Mz
Direct-runtime stress benchmark with detailed timing for optimized local TTS path.

This bypasses HTTP serving and calls `tts_runtime` directly so we can isolate
generation + decode bottlenecks under concurrency.
    )annotationsN)	dataclassfield)AnyDictListOptionalexternalsparkttszAP-BWEtrueAUTH_BYPASS_MODEz&Hello, this is a quick benchmark test.zThe quick brown fox jumps over the lazy dog. This benchmark sentence checks token generation speed, decode latency, and concurrency behavior for Spark TTS.zIn the center of a crowded city, a quiet storyteller documented voices from dawn to dusk. Every pause, every laugh, every whisper revealed a different cadence. She replayed each recording at night, aligning tone and timing until each phrase felt natural.shortmediumlong)lipakshivardanreetNandinikrishnaanikac                   @  s6   e Zd ZU ded< ded< ded< ded< ded< dS )GPUSnapshotfloat	timestampmemory_used_mbmemory_total_mbgpu_utilization_pcttemperature_cN)__name__
__module____qualname____annotations__ r"   r"   'scripts/stress_test_runtime_detailed.pyr   4   s   
 r   c                   @  s@   e Zd ZddddZddd	Zdd
dZdddZdddZdS )
GPUMonitor      ?interval_secondsr   c                 C  s   || _ g | _d| _d | _d S )NF)interval	snapshots_running_thread)selfr&   r"   r"   r#   __init__>   s   
zGPUMonitor.__init__returnNonec                 C  s&   d| _ tj| jdd| _| j  d S )NT)targetdaemon)r)   	threadingThread
_poll_loopr*   startr+   r"   r"   r#   r4   D   s   zGPUMonitor.startc                 C  s"   d| _ | jr| jjdd d S d S )NF   timeout)r)   r*   joinr5   r"   r"   r#   stopI   s   zGPUMonitor.stopc              
   C  s   | j r_zGtjg ddddd}|jdkrI|jrIdd |j dD }t|d	krI| j	t
t t|d t|d
 t|d t|d d W n	 tyS   Y nw t| j | j sd S d S )N)z
nvidia-smizD--query-gpu=memory.used,memory.total,utilization.gpu,temperature.gpuz--format=csv,noheader,nounitsT   )capture_outputtextr8   r   c                 S  s   g | ]}|  qS r"   )strip).0pr"   r"   r#   
<listcomp>\       z)GPUMonitor._poll_loop.<locals>.<listcomp>,         r6   )r   r   r   r   r   )r)   
subprocessrun
returncodestdoutr>   splitlenr(   appendr   timer   	Exceptionsleepr'   )r+   respartsr"   r"   r#   r3   N   s4   




	zGPUMonitor._poll_loopDict[str, Any]c                 C  s   | j sdddS dd | j D }dd | j D }dd | j D }t| j t|t|t|| j d jdt|t|t|d	t|t|t|d	d
S )Nr   zno GPU samples captured)sampleserrorc                 S     g | ]}|j qS r"   )r   r?   sr"   r"   r#   rA   n       z&GPUMonitor.summary.<locals>.<listcomp>c                 S  rV   r"   )r   rW   r"   r"   r#   rA   o   rY   c                 S  rV   r"   )r   rW   r"   r"   r#   rA   p   rY   )minavgmaxtotal)rZ   r[   r\   )rT   	memory_mbgpu_util_pctr   )r(   rL   rZ   
statisticsmeanr\   r   )r+   memutiltempr"   r"   r#   summaryk   s(   

zGPUMonitor.summaryN)r%   )r&   r   r-   r.   )r-   rS   )r   r   r    r,   r4   r:   r3   re   r"   r"   r"   r#   r$   =   s    


r$   c                   @  s\   e Zd ZU ded< ded< ded< ded< ded	< ded
< dZded< eedZded< dS )RequestResultboolsuccessr   
latency_msintaudio_bytesaudio_secondsstrspeakertext_lengthNzOptional[str]rU   default_factoryrS   timing)r   r   r    r!   rU   r   dictrs   r"   r"   r"   r#   rg      s   
 rg   c                   @  s   e Zd ZU ded< ded< eedZded< dZded	< eedZ	d
ed< e
d$ddZe
d$ddZe
d%ddZe
d%ddZd&ddZd'ddZd(dd Zd'd!d"Zd#S ))LevelResultrk   concurrencytotal_requestsrq   List[RequestResult]results        r   wall_time_srS   gpu_summaryr-   c                 C     dd | j D S )Nc                 S  s   g | ]}|j r|qS r"   ri   r?   rr"   r"   r#   rA          z)LevelResult.successes.<locals>.<listcomp>ry   r5   r"   r"   r#   	successes      zLevelResult.successesc                 C  r}   )Nc                 S  s   g | ]}|j s|qS r"   r~   r   r"   r"   r#   rA      r   z(LevelResult.failures.<locals>.<listcomp>r   r5   r"   r"   r#   failures   r   zLevelResult.failuresc                 C  s   | j rt| jt| j  S dS )Nrz   )ry   rL   r   r5   r"   r"   r#   success_rate   s   zLevelResult.success_ratec                 C  s   | j dkr| j| j  S dS )Nr   rz   )r{   rw   r5   r"   r"   r#   throughput_rps   s   zLevelResult.throughput_rpsvaluesList[float]pctc                 C  s0   |sdS t |}tt|d |d  }|| S )Nrz   rE   d   )sortedrk   rL   )r+   r   r   rX   idxr"   r"   r#   r@      s
   zLevelResult.pkeyrn   Dict[str, float]c                 C  sh   g }| j D ]}|j|}t|ttfr|t| q|s i S t|| 	|d| 	|dt
|dS )N2   _   r[   p50p95r\   )r   rs   get
isinstancerk   r   rM   r`   ra   r@   r\   )r+   r   valsr   vr"   r"   r#   timing_stats   s   


zLevelResult.timing_statsc                 C  s@   dd | j D }|si S t|| |d| |dt|dS )Nc                 S  rV   r"   rj   r   r"   r"   r#   rA      rY   z-LevelResult.latency_stats.<locals>.<listcomp>r   r   r   )r   r`   ra   r@   r\   )r+   r   r"   r"   r#   latency_stats   s   

zLevelResult.latency_statsc                 C  sx   g }| j D ]}|j|}t|ttfr#|tdt|jt|  q|s(i S t	
|| |d| |dt|dS )Nrz   r   r   r   )r   rs   r   r   rk   r   rM   r\   rj   r`   ra   r@   )r+   r   r   r   baser"   r"   r#   latency_over_metric_stats   s   


z%LevelResult.latency_over_metric_statsN)r-   rx   )r-   r   )r   r   r   rk   r-   r   )r   rn   r-   r   )r-   r   )r   r   r    r!   r   listry   r{   rt   r|   propertyr   r   r   r   r@   r   r   r   r"   r"   r"   r#   ru      s$   
 


ru   
model_pathrn   gpu_memr   num_enginesrk   max_num_batched_tokensOptional[int]max_num_seqsenable_chunked_prefillOptional[bool]enable_prefix_cachingdisable_log_statsenforce_eagerr-   r.   c	                 C  sV   ddl m}	 ddlm}
m} |	| \}}| s)|
||d||||||||dd d S d S )Nr   )resolve_model_paths)initialize_runtimeis_initializedcudaF)r   bicodec_pathdevicegpu_memory_utilizationr   r   r   r   r   r   r   precompute_speaker_globals)veena3modal.local_serverr    veena3modal.services.tts_runtimer   r   )r   r   r   r   r   r   r   r   r   r   r   r   llm_pathr   r"   r"   r#   init_runtime   s&   
r   r=   ro   
max_tokens	timeout_schunkingrh   c                   sl  ddl m} t }zc|r|j| ||d}n|j| ||d}tj||dI d H \}}	t | d }
|sFtd|
dd|t	| d|	pBi d	W S t
|	pJi d
d}tdt	|d |d  }td|
t	|||t	| |	pki dW S  tjy   tdt | d dd|t	| dd Y S  ty } ztdt | d dd|t	| t|d d dW  Y d }~S d }~ww )Nr   )tts_runtime)r=   ro   r   r7   i  Frz   no_audiori   rj   rl   rm   ro   rp   rU   rs   output_sample_ratei>  ,   rF   T)ri   rj   rl   rm   ro   rp   rs   r8   )ri   rj   rl   rm   ro   rp   rU      )veena3modal.servicesr   rN   perf_countergenerate_speech_chunkedgenerate_speechasynciowait_forrg   rL   rk   r   r\   TimeoutErrorrO   rn   )r=   ro   r   r   r   r   t0cororl   rs   
elapsed_mssample_raterm   excr"   r"   r#   run_one_request   sz   
	
	r   rv   rw   gpu_monitorc           	        s   t | |d}t| d
 fdd|j  t }tjfdd	t|D  I d H |_	t | |_
| |_|S )N)rv   rw   irk   r-   rg   c              	     sf   4 I d H  t | tt   }t| dI d H W  d   I d H  S 1 I d H s,w   Y  d S )Nr=   ro   r   r   r   )SPEAKERSrL   r   )r   ro   )r   r   	semaphorer=   r   r"   r#   wrappedL  s   
0zrun_level.<locals>.wrappedc                   s   g | ]} |qS r"   r"   )r?   r   )r   r"   r#   rA   Y  rB   zrun_level.<locals>.<listcomp>)r   rk   r-   rg   )ru   r   	Semaphorer(   clearrN   r   gatherrangery   r{   re   r|   )	rv   rw   r=   r   r   r   r   levelr   r"   )r   r   r   r=   r   r   r#   	run_level@  s   	

$
r   r   c                 C  s8  t d| j d| j d t dt| j d| j d| jdd t d	| jd
d| jd
d |  }|rRt d|d dd|d dd|d dd|d d g d}|D ]'}| 	|}|rt | d|d d
d|d d
d|d d
d|d d
	 qX| 	d}| 	d}| 	d}| 
d}| 
d}	|rt d|d |d |d |d  |	rt d|	d |	d |	d |	d  |rD|rD|dd}
|dd}|r|ddnd}td|
| | }|
dkrt d ||
 d! ||
 d! ||
 d!  d"d# | jD }|rDt|d$d% d&t|d'  }|j}t d(t|d)dt|d*dt|d+dt|d,dt|dd d-| jv rg| j}t d.|d/ d |d/ d |d- d |d- d  i }| jD ]}|jpsd0}||dd1 ||< qlt| d2d% d&D ]\}}t d3| d4|  qd S )5Nz
=== concurrency=z
 requests=z ===zsuccess=/z (z.0%)zwall=z.2fzs throughput=z req/szlatency_ms avg=r[   z.0fz p50=r   z p95=r   z max=r\   )generation_msllm_time_in_queue_msllm_scheduler_msllm_first_token_msllm_request_lifecycle_msllm_queued_to_scheduled_msllm_scheduled_to_first_token_msllm_first_to_last_token_msllm_queued_to_last_token_mstimeline_llm_first_batch_mstimeline_llm_done_mstimeline_parse_done_mstimeline_bicodec_done_mstimeline_total_msllm_batch_wall_ms_totalllm_batch_gpu_ms_totalllm_decode_wall_ms_totalllm_decode_gpu_ms_totalllm_parse_msllm_time_per_token_msbicodec_decode_wall_msbicodec_decode_gpu_msz: avg=r   r   r   r   zElatency_over_timeline_ms: avg={:.2f} p50={:.2f} p95={:.2f} max={:.2f}zGlatency_over_generation_ms: avg={:.2f} p50={:.2f} p95={:.2f} max={:.2f}rz   r   z9timeline_share: llm={:.1f}% bicodec={:.1f}% other={:.1f}%r   c                 S  s&   g | ]}t |jd ttfr|qS )r   )r   rs   r   rk   r   r   r"   r"   r#   rA     s    zprint_level.<locals>.<listcomp>c                 S  s   | j S )Nr   )r   r"   r"   r#   <lambda>  s    zprint_level.<locals>.<lambda>)r   rF   zitimeline_sample_ms: first_batch={:.1f} llm_done={:.1f} parse_done={:.1f} bicodec_done={:.1f} total={:.1f}r   r   r   r   r^   zHgpu: util_avg={:.0f}% util_max={:.0f}% mem_avg={:.0f}MB mem_max={:.0f}MBr_   failedrE   c                 S  s
   | d  S )NrE   r"   )kvr"   r"   r#   r     s   
 zerror[z] )printrv   rw   rL   r   r   r{   r   r   r   r   formatr   r\   r   rs   r   r|   r   rU   items)r   latkeysr   sttl_totalllm_wallbicodeclatency_over_timelinelatency_over_generation	total_avgllm_avgbicodec_avg	other_avgrT   sampletgerrorsr   kr   r"   r"   r#   print_level_  s   (8
<











	
r  pathlevelsList[LevelResult]argsargparse.Namespacec                 C  s:  i d|j d|jd|jd|jd|jd|jd|jd|jd	|jd
|j	d|j
d|jd|jd|jd|jd|jd|jdtt ig d}|D ]}|d |j|j|j|j|j| |ji d|dd|dd|dd|dd|dd|dd|dd|dd|dd|dd|dd|dd |d d!|d!d"|d"d#|d#d$|d$|d%|d&|d'|d(|d)|d)|dd*d+d, |jD d-	 qSt| d.d/d0}tj ||d1d2 W d    d S 1 sw   Y  d S )3Nr
  r=   r   r   r   requests_multiplier
gpu_memoryr   r   r   r   disable_chunked_prefilldisable_prefix_cachingdisable_engine_stats_logsr   enable_gpu_decode_timingnon_stream_final_onlyr   )configr
  r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r   r   r   r   r   latency_over_timeline_mslatency_over_generation_msc                 S  s2   g | ]}|j |j|j|j|j|j|j|jd qS )r   r   r   r"   r"   r#   rA   	  s    z save_results.<locals>.<listcomp>)	rv   rw   r   r   r{   rj   gpurs   requestswzutf-8)encodingrF   )indent)!r
  r=   r   r8   r   r  r  r   r   r   r   r  r  r  r   r  r  rk   rN   rM   rv   rw   r   r   r{   r   r|   r   r   ry   openjsondump)r	  r
  r  payloadlvlfdr"   r"   r#   save_results  s   	









	








2$r#  c           	   
     s   t | j }dd | jdD }tdd}|  t|td | j| j	| j
dI d H  g }t|D ]7\}}t| j|| j }t|||| j| j	| j
|dI d H }t| || |t|d	 k rhtd
I d H  q1|  t| j||  td| j  d S )Nc                 S  s    g | ]}|  rt|  qS r"   )r>   rk   )r?   xr"   r"   r#   rA     s     zmain_async.<locals>.<listcomp>rC   r%   )r&   r   r   )rv   rw   r=   r   r   r   r   rE   rF   z
results saved: )
TEST_TEXTSr=   r
  rK   r$   r4   r   r   r   r8   r   	enumerater\   min_requestsr  r   r  rM   rL   r   rP   r:   r#  outputr   )	r  r=   r
  r   ry   r   cnr   r"   r"   r#   
main_async  s@   

	
r+  c                  C  s  t jdd} | jdddd | jdg dd	d
 | jdtddd | jdtddd | jdtddd | jdtdd | jddddd | jdtdd | jdtd d!d | jd"td d | jd#td d | jd$dd d% | jd&ddd% | jd'ddd% | jd(ddd% | jd)ddd% | jd*ddd+d | jd,ddd-d | jd.td/d | jd0d1d2 |  }d3d4lm} |j	p|}|j
rd5tjd6< ntjd6d7 |jrd5tjd8< ntjd8d7 |j}|jrd}t||j|j|j|j||jrdnd |jrd9nd |jrd9nd d:	 tt| d S );Nz2Direct runtime stress benchmark (detailed timings))descriptionz--levelsz1,2,4,8z"Comma-separated concurrency levels)defaulthelpz--textr   r   )choicesr-  z--max-tokens   zmax_tokens sent to generation)typer-  r.  z	--timeoutg      ^@zPer-request timeout secondsz--requests-multiplierrD   z.Requests per level ~= concurrency * multiplierz--min-requests   )r1  r-  z
--chunking
store_trueFzUse generate_speech_chunked)actionr-  r.  z--gpu-memoryg      ?z--num-enginesrE   zNumber of vLLM enginesz--max-num-batched-tokensz--max-num-seqsz--enable-chunked-prefill)r4  r-  z--disable-chunked-prefillz--disable-prefix-cachingz--disable-engine-stats-logsz--enforce-eagerz--non-stream-final-onlyzXUse vLLM FINAL_ONLY output mode (less per-token driver overhead, less granular timeline)z--enable-gpu-decode-timingz=Enable precise BiCodec GPU timing (adds synchronize overhead)z--model-path z--outputzstress_runtime_detailed.json)r-  r   )DEFAULT_LOCAL_MODEL_DIR1VEENA3_PERF_GPU_TIMING0VEENA3_NON_STREAM_FINAL_ONLYT)	r   r   r   r   r   r   r   r   r   )argparseArgumentParseradd_argumentrk   r   rn   
parse_argsr   r6  r   r  osenviron
setdefaultr  r   r  r   r  r   r   r   r  r  r   r   rH   r+  )parserr  r6  r   r   r"   r"   r#   mainA  sn   
rC  __main__)r   rn   r   r   r   rk   r   r   r   r   r   r   r   r   r   r   r   r   r-   r.   )r=   rn   ro   rn   r   rk   r   r   r   rh   r-   rg   )rv   rk   rw   rk   r=   rn   r   rk   r   r   r   rh   r   r$   r-   ru   )r   ru   r-   r.   )r	  rn   r
  r  r  r  r-   r.   )r  r  r-   r.   rf   )+__doc__
__future__r   r;  r   r  r?  r`   rG   sysr1   rN   dataclassesr   r   typingr   r   r   r   r	  dirnameabspath__file__	REPO_ROOTr9   EXTERNAL_DIRinsertr@  r%  r   r   r$   rg   ru   r   r   r   r  r#  r+  rC  r   r"   r"   r"   r#   <module>   sT   "

I
H
 
E

n
O
%>
