o
    iϐ                     @   s  d Z ddlZddlZddlZddlZddlZddlZddlZddlZddl	m
Z
mZ ddlmZmZmZmZmZ ddlZdZdZddd	d
Zg dZe
G dd dZG dd dZe
G dd dZe
G dd dZdedee fddZdee defddZdedeeef fddZ		d>dej d ed!ed"ed#e!d$e"d%e!d&edefd'd(Z#	d?d ed)e"d*e"d+ed#e!d$e"d,ed%e!d-ee defd.d/Z$d0efd1d2Z%d3ee fd4d5Z&d3ee d6efd7d8Z'd9d: Z(d;d< Z)e*d=kre)  dS dS )@a  Detailed local stress benchmark for optimized TTS path.

This script is focused on collecting per-request GPU/LLM timing for local tuning:
- Request wall-clock latency and server reported TTFB
- GPU timing breakdown (LLM batch wall / GPU wall, parse, decode)
- Time per token and per batch
- Concurrency visibility from in-flight counters
- End-to-end throughput and RTF with GPU utilization snapshots

Usage:
    python scripts/stress_test_optimized_local.py
    python scripts/stress_test_optimized_local.py --levels 1,5,10,20 --concurrency-scaling both
    python scripts/stress_test_optimized_local.py --text-category long --output benchmarks.json
    N)	dataclassfield)AnyDictListOptionalTuplezhttp://localhost:8000z/v1/tts/generatezHello, this is a quick test.zThe quick brown fox jumps over the lazy dog. This is a baseline quality sentence used for benchmarking local TTS latency, TTFB, and tokenization behavior under load.av  In the heart of a bustling city, where towering skyscrapers cast shadows over crowded streets, there lived a quiet linguist who believed every sentence carried a rhythm. She recorded conversations at dawn and dusk, trying to capture how meaning shifted with breath. By evening she would review them, matching sound to emotion, and every night she learned a new way to speak.)shortmediumlong)lipakshivardanreetNandinikrishnaanikac                   @   s6   e Zd ZU eed< eed< eed< eed< eed< dS )GPUSnapshot	timestampmemory_used_mbmemory_total_mbgpu_utilization_pcttemperature_cN)__name__
__module____qualname__float__annotations__ r   r   >/home/ubuntu/veenaModal/scripts/stress_test_optimized_local.pyr   5   s   
 r   c                   @   sN   e Zd ZdZddefddZdd Zdd	 Zd
d Zde	e
ef fddZdS )
GPUMonitorz2Background polling for quick GPU health snapshots.      ?interval_secondsc                 C   s   || _ g | _d| _d | _d S )NF)interval	snapshots_running_thread)selfr!   r   r   r   __init__A   s   
zGPUMonitor.__init__c                 C   s&   d| _ tj| jdd| _| j  d S )NT)targetdaemon)r$   	threadingThread
_poll_loopr%   startr&   r   r   r   r-   G   s   zGPUMonitor.startc                 C   s"   d| _ | jr| jjdd d S d S )NF   timeout)r$   r%   joinr.   r   r   r   stopL   s   zGPUMonitor.stopc              
   C   s   | j r_zGtjg ddddd}|jdkrI|jrIdd |j dD }t|d	krI| j	t
t t|d t|d
 t|d t|d d W n	 tyS   Y nw t| j | j sd S d S )N)z
nvidia-smizD--query-gpu=memory.used,memory.total,utilization.gpu,temperature.gpuz--format=csv,noheader,nounitsT   )capture_outputtextr1   r   c                 S   s   g | ]}|  qS r   )strip).0pr   r   r   
<listcomp>_   s    z)GPUMonitor._poll_loop.<locals>.<listcomp>,         r/   )r   r   r   r   r   )r$   
subprocessrun
returncodestdoutr7   splitlenr#   appendr   timer   	Exceptionsleepr"   )r&   resultpartsr   r   r   r,   Q   s4   




	zGPUMonitor._poll_loopreturnc                 C   s   | j sdddS dd | j D }dd | j D }dd | j D }t| j t|t|t|| j d jdt|t|t|d	t|t|t|d	d
S )Nr   zno GPU samples captured)sampleserrorc                 S      g | ]}|j qS r   )r   r8   sr   r   r   r:   u       z&GPUMonitor.summary.<locals>.<listcomp>c                 S   rN   r   )r   rO   r   r   r   r:   v   rQ   c                 S   rN   r   )r   rO   r   r   r   r:   w   rQ   )minmaxavgtotal)rR   rS   rT   )rL   	memory_mbgpu_util_pctr   )r#   rD   rR   rS   
statisticsmeanr   )r&   mem_usedgpu_utiltempr   r   r   summaryn   s,   
zGPUMonitor.summaryN)r    )r   r   r   __doc__r   r'   r-   r3   r,   r   strr   r]   r   r   r   r   r   >   s    r   c                   @   s   e Zd ZU eed< eed< eed< dZeed< dZeed< dZ	eed< dZ
eed	< d
Zeed< dZee ed< d
Zeed< dZeed< dZeed< eedZeeef ed< dZee ed< dS )RequestResultsuccessstatus_code
latency_ms        ttfb_msserver_ttfb_msr   audio_bytesaudio_seconds 
request_idNrM   speakertext_lengthFstreamdefault_factorytimingrequest_inflight)r   r   r   boolr   intr   re   rf   rg   rh   rj   r_   rM   r   rk   rl   rm   r   dictrp   r   r   rq   r   r   r   r   r`      s   
 r`   c                   @   s  e Zd ZU eed< eed< eed< eed< eed< eed< eedZ	e
e ed< d	Zeed
< eedZeeef ed< ede
e fddZede
e fddZedefddZedefddZedefddZedefddZde
e dedefddZedeeef fddZedeeef fdd Zd!edeeef fd"d#Zd$ed%edeeef fd&d'Zdee fd(d)Z deeef fd*d+Z!d,S )-LevelResult
level_nameconcurrencytotal_requestsmodetext_categorychunkingrn   resultsrd   wall_time_secondsgpu_summaryrK   c                 C      dd | j D S )Nc                 S   s   g | ]}|j r|qS r   ra   r8   rr   r   r   r:          z)LevelResult.successes.<locals>.<listcomp>r|   r.   r   r   r   	successes      zLevelResult.successesc                 C   r   )Nc                 S   s   g | ]}|j s|qS r   r   r   r   r   r   r:      r   z(LevelResult.failures.<locals>.<listcomp>r   r.   r   r   r   failures   r   zLevelResult.failuresc                 C   s   | j rt| jt| j  S dS Nrd   )r|   rD   r   r.   r   r   r   success_rate   s   zLevelResult.success_ratec                 C   s   | j dkr| j| j  S dS Nr   rd   )r}   rx   r.   r   r   r   throughput_rps   s   zLevelResult.throughput_rpsc                 C   s   t dd | jD S )Nc                 s   s    | ]}|j V  qd S N)rh   r   r   r   r   	<genexpr>   s    z3LevelResult.audio_duration_total.<locals>.<genexpr>)sumr   r.   r   r   r   audio_duration_total   s   z LevelResult.audio_duration_totalc                 C   s   | j }|dkr| j| S dS r   )r   r}   )r&   total_audior   r   r   effective_rtf   s   zLevelResult.effective_rtfvaluespctc                 C   s:   |sdS t |}tt|| d }|t|t|d  S )Nrd   d   r=   )sortedrs   rD   rR   )r&   r   r   sorted_valsidxr   r   r   _percentile   s
   zLevelResult._percentilec                 C   sP   dd | j D }|si S t|t|| |d| |d| |dt|dS )Nc                 S   rN   r   rc   r   r   r   r   r:      rQ   z-LevelResult.latency_stats.<locals>.<listcomp>2   _   c   rR   rT   p50p95p99rS   r   rR   rX   rY   r   rS   )r&   latr   r   r   latency_stats   s   


zLevelResult.latency_statsc                 C   sF   dd | j D }|si S t|t|| |d| |dt|dS )Nc                 S   s   g | ]
}|j d kr|j qS )r   )rf   r   r   r   r   r:          z*LevelResult.ttfb_stats.<locals>.<listcomp>r   r   )rR   rT   r   r   rS   r   )r&   valsr   r   r   
ttfb_stats   s   

zLevelResult.ttfb_statskeyc                 C   sx   g }| j D ]}|j|}t|ttfr|t| q|s i S t|t	|| 
|d| 
|d| 
|dt|dS )Nr   r   r   r   )r   rp   get
isinstancers   r   rE   rR   rX   rY   r   rS   )r&   r   r   r   vr   r   r   timing_stats   s   



zLevelResult.timing_statsnumerator_keydenominator_keyc                 C   s   d}d}| j D ],}|j|}|j|}t|ttfr3t|ttfr3|dkr3|t|7 }|t|7 }q|dkr:dS || |fS )Nrd   r   )rd   r   )r   rp   r   r   rs   r   )r&   r   r   numdenr   ndr   r   r   timing_rate   s   
$zLevelResult.timing_ratec                 C   s    dd | j D }|sd S t|S )Nc                 S   s   g | ]}t |jtr|jqS r   )r   rq   rs   r   r   r   r   r:     s    z5LevelResult.observed_inflight_max.<locals>.<listcomp>)r   rS   )r&   r   r   r   r   observed_inflight_max  s   z!LevelResult.observed_inflight_maxc                 C   sJ   i }| j D ]}|jpd|j }||d d dd ||d d < q|S )NHTTP r   r   r=   )r   rM   rb   r   )r&   errorsr   r   r   r   r   error_summary  s
   
&zLevelResult.error_summaryN)"r   r   r   r_   r   rs   rr   r   listr|   r   r`   r}   r   rt   r~   r   r   propertyr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   ru      s>   
 ru   rawrK   c                 C   sz   | d u rd S t |  }|dkrd S zd|v rt|W S tt|W S  ty<   zt|W  Y S  ty;   Y Y d S w w )Nri   .)r_   r7   r   rs   
ValueError)r   r   r   r   r   _coerce_number  s    
r   c                 C   s   t | }|d u r
dS t|S r   )r   r   )r   valuer   r   r   _normalize_time_ms$  s   r   headersc           
      C   s  i }t | dr| dnd}|r+zt|}t|tr || W n	 ty*   Y nw i dddddd	d
dddddddddddddddddddddd d!d"d#d$d%i d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdGdHdIdJdKdLdMdNdOdCdPdQdRdS}t | dTr|  D ]7\}}t	|
 }||}|du rqt|}	|	dur|dUv rt|	||< qt|	 rt|	nt|	||< qd|v rdI|v r|d rzt|dI t|d  |dM< W n
 ty   Y nw d|v r9dI|v r9|d r9zt|dI t|d  |dN< W n
 ty8   Y nw d|v red|v re|d rezt|d t|d  |dO< W |S  tyd   Y |S w |S )Vz7Parse server timing headers into numeric metric fields.r   zx-perf-detailsNzx-api-preprocess-msapi_preprocess_mszx-api-generate-await-msapi_generate_await_mszx-api-postprocess-msapi_postprocess_mszx-api-total-msapi_total_mszx-api-overhead-vs-pipeline-msapi_overhead_vs_pipeline_mszx-api-overhead-vs-generation-msapi_overhead_vs_generation_mszx-timeline-first-batch-mstimeline_llm_first_batch_mszx-timeline-llm-done-mstimeline_llm_done_mszx-timeline-bicodec-done-mstimeline_bicodec_done_mszx-timeline-total-mstimeline_total_mszx-llm-token-totalllm_token_totalzx-llm-batch-countllm_batch_countzx-llm-batch-wall-msllm_batch_wall_ms_totalzx-llm-batch-gpu-msllm_batch_gpu_ms_totalzx-llm-batch-wall-ms-minllm_batch_wall_ms_minzx-llm-batch-wall-ms-maxllm_batch_wall_ms_maxzx-llm-batch-wall-ms-p50llm_batch_wall_ms_p50zx-llm-batch-gpu-ms-minllm_batch_gpu_ms_minzx-llm-batch-gpu-ms-maxllm_batch_gpu_ms_maxzx-llm-batch-gpu-ms-p50llm_batch_gpu_ms_p50zx-llm-decode-wall-msllm_decode_wall_ms_totalzx-llm-decode-gpu-msllm_decode_gpu_ms_totalzx-llm-decode-wall-ms-minllm_decode_wall_ms_minzx-llm-decode-wall-ms-maxllm_decode_wall_ms_maxzx-llm-decode-wall-ms-p50llm_decode_wall_ms_p50zx-llm-decode-gpu-ms-minllm_decode_gpu_ms_minzx-llm-decode-gpu-ms-maxllm_decode_gpu_ms_maxzx-llm-decode-gpu-ms-p50llm_decode_gpu_ms_p50zx-llm-decode-callsdecode_callszx-llm-decode-cpu-msllm_decode_cpu_mszx-llm-parse-msllm_parse_msx-llm-parse-avg-msllm_parse_ms_avgzx-llm-tokens-per-batch-mintokens_per_batch_minzx-llm-tokens-per-batch-maxtokens_per_batch_maxtokens_per_batch_p50generation_msrq   text_chunkedchunks_processedllm_time_per_token_msllm_time_per_batch_wall_msllm_time_per_batch_gpu_msbicodec_decode_wall_msbicodec_decode_gpu_msbicodec_decode_cpu_ms)zx-llm-tokens-per-batch-p50zx-generation-mszx-request-inflightzx-llm-text-chunkedzx-llm-chunks-processedzx-llm-time-per-token-mszx-llm-time-per-batch-wall-mszx-llm-time-per-batch-gpu-msr   zx-bicodec-decode-wall-mszx-bicodec-decode-gpu-mszx-bicodec-decode-cpu-msitems>   r   )hasattrr   jsonloadsr   rt   updaterG   r   r_   lowerr   rr   r   
is_integerrs   )
r   metricsdetail_payloadparsed
header_mapraw_keyraw_vallk
target_keyr   r   r   r   _parse_perf_headers+  s   


	
 !"#
1
   r   F      ^@clientbase_urlr6   rk   r{   
max_tokensrm   r1   c                    sp  | t  }||||d|d}	t }
d}zW|r| jd||	ddi|d4 I d H }d}d	}| 2 z3 d H W }|rEt |
 d
 }d}|t|7 }q36 t |
 d
 }t|j}|jdkrtd	|d }|d }t	dd|||||j
dd|t|d||
dt|j
ddW  d   I d H  W S t	d|j||d|j |t|d||j
dd|
ddW  d   I d H  W S 1 I d H sw   Y  W d S | j||	ddi|dI d H }t |
 d
 }t|j}t|j
d}d}|j
d}|rt|}|d ur|}n|jdkrtd	t|jd }|d }|jdkr@t	dd|||t|j||j
dd|t|d||
ddW S t	d|j|||t|j|jrV|jd d nd|j |t|d||j
dd|
ddW S  tjy   t	dd	t |
 d
 d|t||d Y S  ty } zt	dd	t |
 d
 t|d d |t||dW  Y d }~S d }~ww )Nwav)r6   rk   rm   r{   formatr   rd   POSTzContent-Typezapplication/json)r   r   r1   Tr     F   ,   i }  zx-request-idri   rq   z	x-ttfb-ms)ra   rb   rc   re   rg   rh   rj   rk   rl   rm   rp   rq   rf   r   )ra   rb   rc   re   rM   rk   rl   rm   rp   rj   rq   zx-audio-seconds)ra   rb   rc   re   rf   rg   rh   rj   rk   rl   rm   rp   rq   )ra   rb   rc   re   rf   rg   rM   rk   rl   rm   rp   rj   rq   TIMEOUT)ra   rb   rc   rM   rk   rl   rm   )GENERATE_PATHrF   rm   aiter_bytesrD   r   r   rb   rS   r`   r   r   postr   contentr6   httpxTimeoutExceptionrG   r_   )r   r   r6   rk   r{   r   rm   r1   urlpayloadr-   	ttfb_timeresponsefirsttotal_byteschunklatencyrp   	pcm_bytesrh   server_ttfbaudio_sec_headerr   er   r   r   make_request  s  
	

%
43

 
	r  num_requestsrw   rz   request_timeoutgpu_monitorc	              
      s  rdnd}	t | d| d|	 |||	|d}
t||dkr)tt n	t|td gdtd	tj	d
t
f fdd|rM|j  t }tj|d |d}tj	|d4 I d H "fddt|D }tj|ddiI d H }W d   I d H  n1 I d H sw   Y  t | |
_|D ]!}t|t
r|
j| q|
jt
dddt|d d d q|r| |
_|
S )N	streamingnon-streamingzc--)rv   rw   rx   ry   rz   r{   mixedr	   ir   rK   c                    s|   4 I d H * | t   }t| t t  }t| ||dI d H W  d   I d H  S 1 I d H s7w   Y  d S )N)r   r   r6   rk   r{   r   rm   r1   )rD   SPEAKERSr  )r"  r   r6   rk   )r   r{   r   r  	semaphorerm   textsr   r   limited_requestA  s   
0z"run_level.<locals>.limited_request
   )max_connectionsmax_keepalive_connections)limitsc                    s   g | ]}| qS r   r   )r8   r"  )r   r&  r   r   r:   Y  r   zrun_level.<locals>.<listcomp>return_exceptionsTFr   r  )ra   rb   rc   rM   rm   )ru   asyncio	Semaphorer   
TEST_TEXTSr   r   rs   r  AsyncClientr`   r#   clearrF   Limitsrangegatherr}   r   r|   rE   r_   r]   r~   )r   r  rw   rz   r{   r   r  rm   r  ry   level
start_timer*  tasks	responsesitemr   )	r   r{   r   r&  r   r  r$  rm   r%  r   	run_level%  sT   
	(
(


r9  r4  c           &      C   sL  t dd  t d| j   t d| j d| j d| j d| j  t d  t dt| j d| j d	| j	d
d t d| j
dd t d| jdd t d| jdd t d| jd |  d urtt d|    | j}|rt d t d|d dd|d dd|d dd|d dd|d  dd!|d" d | j}|r| jd#st d$ t d|d dd|d dd|d dd|d dd!|d" d
 | jd%kr| d&}|r
t d' t d|d dd|d dd|d dd|d dd!|d" d
 | d(}|r8t d) t d|d dd|d dd|d dd|d dd!|d" d
 | d*}|rft d+ t d|d dd|d dd|d dd|d dd!|d" d
 | d,}|rt d- t d.|d dd|d d | d/}|rt d0 t d.|d dd|d d | d1}|rt d2 t d.|d dd|d d | d3}	|	rt d4 t d.|	d dd|	d d | d5}
| d6}| d7}| d8}| d9}| d:}|rt d; |
rt d<|
d dd|
d d |r!t d=|d dd|d d |r4t d>|d dd|d d t d?|d dd|d d |rWt d@|d dd|d d |rjt dA|d dd|d d |rtdB|d |d  }tdB|d |d  }t dC|dd|d | dD}| dE}| dF}| dG}| dH}|rt dI |rt dJ|d dd|d d |rt dK|d dd|d d |rt dL|d dd|d d |rt dM|d dd|d d t dN|d dd|d d |r|ddBndB}|r|ddBndB}|ddB}|dOkrJtdB|| | }t dP|| dQ || dQ || dQ  dRdS | jD }|rt|dTdU dVt|dW  }t dXt|jdDdBt|jdEdBt|jdFdBt|jdGdBt|jdHdB | d&dY\}}|rt dZ|d[ dd\ | jrd]| jv r| j}|d] } |d^ }!|d_ }"t d` t da| d ddb| d" ddc| dd dde t df|!d ddg|!d" ddh t di|"d ddj|"d" ddk |  }#|#r"t dl t|# dmdU dVD ]\}$}%t dn|% do|$  qd S d S )pN
zH========================================================================z  z requests, z concurrent, z, chunking=z  Success:     /z (.0%)z  Wall time:   z.2frP   z  Throughput:  z req/sz  Audio total: z  Eff. RTF:    z.3fz  Peak inflight: z
  Latency (ms):z    min=rR   .0fz avg=rT   z p50=r   z p95=r   z p99=r   z max=rS   rm   z
  Server TTFB (ms):r  r   z
  Generation (ms):r   z
  LLM batch wall (ms):r   z
  LLM batch GPU (ms):r   z
  Parse (ms):z    avg=r   z
  Decode (ms):r   z
  BiCodec decode (ms):r   z
  BiCodec decode GPU (ms):r   r   r   r   r   r   z
  API overhead (ms):z    preprocess avg=z    generate_await avg=z    postprocess avg=z    api_total avg=z    overhead_vs_pipeline avg=z    overhead_vs_generation avg=rd   z    client+queue overhead avg=r   r   timeline_parse_done_msr   r   z
  Timeline markers (ms):z    first_batch avg=z    llm_done avg=z    parse_done avg=z    bicodec_done avg=z    request_done avg=r   z:    Stage share: llm={:.1f}% bicodec={:.1f}% other={:.1f}%r   c                 S   s&   g | ]}t |jd ttfr|qS )r   )r   rp   r   rs   r   r   r   r   r   r:     s    z&print_level_report.<locals>.<listcomp>c                 S   s   | j S r   r   )r   r   r   r   <lambda>  s    z$print_level_report.<locals>.<lambda>)r   r>   zr    Sample (median-latency): first_batch={:.1f} llm_done={:.1f} parse_done={:.1f} bicodec_done={:.1f} total={:.1f}r   z  Token timing: avg r  z	 us/tokenrV   rW   r   z
  GPU:z    Memory: zMB avg, z
MB peak / rU   MBz    Util:   z% avg, z% peakz    Temp:   zC avg, zC peakz

  Errors:c                 S   s
   | d  S )Nr=   r   )kvr   r   r   r@    s   
 z    [zx] )printrv   upperrx   rw   ry   r{   rD   r   r   r}   r   r   r   r   r   r   
startswithr   rS   r   r  r   r   rp   r   r~   r   r   )&r4  r  ttfbllmwallgpuparsedecodebdecodebdecode_gpuapi_preapi_waitapi_post	api_totalapi_over_pipelineapi_over_generationclient_overhead_avgclient_overhead_p95tl_firsttl_llm_donetl_parse_donetl_bicodec_donetl_totalllm_avgbicodec_avg	total_avg	other_avgtimeline_samplessampletoken_mstoken_countr~   memutilr\   r   r   countr   r   r   print_level_reportr  s:  (
(
D
D
D
 
 
 
 





    




     



.""rf  levelsc           	      C   sT  t dd  t d t d  ddddddd	d
ddd
ddd
dddddddddddd}t | t d | D ]d}|j}|rN|dddnd}|rZ|dddnd}|jdi dd}|jdi dd}| pwd}t |jdd|jdd|jdd|d
d|d
d|dd|dd |jd!d|d" q?t d d S )#Nr:  zn==============================================================================================================z  STRESS BENCHMARK SUMMARYLevelz<26 zOK%z>5RPSz>7r   r   zGPU%GPUmemz>8RTFz>6Inflightzn--------------------------------------------------------------------------------------------------------------r   r>  z---rW   rT   rV   rS   z>4.0%z>7.2fz>4.0fz% z>7.0fzMB z>6.3fz>8d)	rC  r   r   r~   r   rv   r   r   r   )	rg  headerr4  r   r   r   r[   gpu_meminflightr   r   r   print_summary_table  sB   
J
rq  pathc              
   C   s  g }| D ]}| i d|jd|jd|jd|jd|jd|jd|jd|jd	|j	d
|j
d|jd|jd| d| d|jdi d|dd|dd|dd|dd|dd|dd|dd|dd|dd|dd|dd|dd|dd|dd|dd |d d!|d!d"|d"id#d$d% |jD  qt|d&}tj||d'd( W d    d S 1 sw   Y  d S ))Nr4  rw   ry   r{   rz   r   r   wall_time_src   re   r   audio_total_secondsr   r   rI  timing_averagesr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r?  r   r   requestsc                 S   s>   g | ]}|j |j|j|j|j|j|j|j|j|j	|j
d qS )rj   ra   rb   rc   re   rf   rg   rh   rq   rM   rp   rw  r   r   r   r   r:   U  s    z save_results.<locals>.<listcomp>wr>   )indent)rE   rv   rw   ry   r{   rz   r   r   r}   r   r   r   r   r   r   r~   r   r|   openr   dump)rg  rr  r  r4  fdr   r   r   save_results.  s   	









	









$6"r}  c                    s$  | j }zOt 4 I d H :}|j| dddI d H }| }td|d  td|d  td|d	  W d   I d H  n1 I d H sMw   Y  W n! tyt } ztd
| d|  td W Y d }~nd }~ww dd | j	
dD }tdd}|  | jrdnd}g }	td t|dd| j| j| j| jd|d	I d H }
td|
jd |D ]>}t|d d}t|d}t|||| j| j| j| j| j|d	I d H }t| |	| ||d krtd tdI d H  q|  t|	 | jrt|	| j td| j  d S d S )Nz/v1/tts/healthr'  r0   z
  Health: statusz  Version: model_versionz  GPU: gpu_availablezERROR: server not reachable at z: r=   c                 S   s   g | ]
}|  rt|qS r   )r7   rs   )r8   r   r   r   r   r:   y  r   z#run_stress_test.<locals>.<listcomp>r;   g      ?)r!   r  r  z)
Warming up (1 request, non-streaming)...F)	r   r  rw   rz   r{   r   r  rm   r  z  warmup ok: r<  r/   r  z  cooldown 3s...z
Results written to )r  r  r/  r   r   rC  rG   sysexitrg  rC   r   r-   rm   r9  r6   r{   r   r1   r   rS   rR   rf  rE   r,  rH   r3   rq  outputr}  )argsr   r   r  healthr  rg  r  
mode_labelr|   warmupconcr  r4  r   r   r   run_stress_testk  sx   (


r  c                  C   s   t jdd} | jdtdt dd | jddd	d | jd
dg ddd | jddddd | jddddd | jdddd | jdtddd | jd td!d"d | jd#d$d%d |  }tt	| d S )&Nz+Detailed local optimized-path TTS benchmark)descriptionz--urlzServer URL (default: r=  )defaulthelpz--levelsz1,5,10,20,50z"Comma-separated concurrency levelsz--textr	   )r	   r
   r   r!  zInput text category)r  choicesr  z
--chunking
store_truezEnable server-side chunkingT)actionr  r  z--no-chunkingstore_falser{   zDisable chunking)r  destr  z--streamzUse streaming endpoint)r  r  z--max-tokens   zMax tokens per request)typer  r  z	--timeoutr   zPer-request timeout secondsz--outputz stress_test_optimized_local.jsonzOutput JSON path)
argparseArgumentParseradd_argumentDEFAULT_URLrs   r   
parse_argsr,  r@   r  )parserr  r   r   r   main  s   r  __main__)Fr   r   )+r^   r  r,  r   rX   r?   r  r*   rF   dataclassesr   r   typingr   r   r   r   r   r  r  r  r.  r#  r   r   r`   ru   r_   r   r   r   r   r/  rr   rs   r  r9  rf  rq  r}  r  r  r   r   r   r   r   <module>   s   Prj	
 !	

M =?
