o
    Qix                     @  s  d Z ddlmZ ddlZddlZddlZddlZddlZddlZddl	Z	ddl
Z
ddlZddlmZmZ ddlmZmZmZmZ ejejejeZejedZejedejedefD ]Zee	jvrre	jde qdd	ejd
< ddddZg dZeG dd dZG dd dZ eG dd dZ!eG dd dZ"dId)d*Z#dJd2d3Z$dKd8d9Z%dLd;d<Z&dMdBdCZ'dNdDdEZ(dOdFdGZ)e*dHkre)  dS dS )Pa  
Direct-runtime streaming stress benchmark for Veena3 TTS.

Focuses on streaming KPIs:
- TTFB (time to first chunk)
- End-to-end latency
- Chunk cadence (inter-chunk interval)
- Emission speed (audio seconds emitted per second after first chunk)
- Throughput under concurrency
    )annotationsN)	dataclassfield)AnyDictListOptionalexternalsparkttszAP-BWEtrueAUTH_BYPASS_MODEz<Hello, this is a quick streaming benchmark for voice agents.zThis streaming benchmark sentence is designed to evaluate first audio latency, chunk cadence, and sustained emission speed under concurrent request load.zIn a bustling city, a narrator collected voice notes from people walking through markets, subways, and parks. The stream needed to start quickly, stay stable, and deliver clear speech without stalls as many listeners connected at once.shortmediumlong)lipakshivardanreetNandinikrishnaanikac                   @  s.   e Zd ZU ded< ded< ded< ded< dS )GPUSnapshotfloat	timestampmemory_used_mbmemory_total_mbgpu_utilization_pctN)__name__
__module____qualname____annotations__ r!   r!   (scripts/stress_test_streaming_runtime.pyr   5   s
   
 r   c                   @  s@   e Zd ZddddZddd	Zdd
dZdddZdddZdS )
GPUMonitor      ?interval_secondsr   c                 C  s   || _ g | _d| _d | _d S )NF)interval	snapshots_running_thread)selfr%   r!   r!   r"   __init__>   s   
zGPUMonitor.__init__returnNonec                 C  s&   d| _ tj| jdd| _| j  d S )NT)targetdaemon)r(   	threadingThread
_poll_loopr)   startr*   r!   r!   r"   r3   D   s   zGPUMonitor.startc                 C  s"   d| _ | jr| jjdd d S d S )NF   timeout)r(   r)   joinr4   r!   r!   r"   stopI   s   zGPUMonitor.stopc              	   C  s   | j rZzBtjg ddddd}|jdkrD|jrDdd |j dD }t|d	krD| j	t
t t|d t|d
 t|d d W n	 tyN   Y nw t| j | j sd S d S )N)z
nvidia-smiz4--query-gpu=memory.used,memory.total,utilization.gpuz--format=csv,noheader,nounitsT   )capture_outputtextr7   r   c                 S  s   g | ]}|  qS r!   )strip).0pr!   r!   r"   
<listcomp>\       z)GPUMonitor._poll_loop.<locals>.<listcomp>,r5         )r   r   r   r   )r(   
subprocessrun
returncodestdoutr=   splitlenr'   appendr   timer   	Exceptionsleepr&   )r*   respartsr!   r!   r"   r2   N   s2   



zGPUMonitor._poll_loopDict[str, Any]c                 C  sx   | j sdddS dd | j D }dd | j D }t| j t|t|t|| j d jdt|t|t|dd	S )
Nr   zno GPU samples captured)sampleserrorc                 S     g | ]}|j qS r!   )r   r>   sr!   r!   r"   r@   m       z&GPUMonitor.summary.<locals>.<listcomp>c                 S  rT   r!   )r   rU   r!   r!   r"   r@   n   rW   )minavgmaxtotal)rX   rY   rZ   )rR   	memory_mbgpu_util_pct)r'   rJ   rX   
statisticsmeanrZ   r   )r*   memutilr!   r!   r"   summaryj   s   

zGPUMonitor.summaryN)r$   )r%   r   r,   r-   )r,   rQ   )r   r   r   r+   r3   r9   r2   rb   r!   r!   r!   r"   r#   =   s    


r#   c                   @  s   e Zd ZU ded< ded< ded< ded< ded< ded	< d
ed< dZded< eedZded< dZded< ee	dZ
ded< dS )RequestResultboolsuccessr   
latency_msttfb_msintchunks_senttotal_bytesaudio_secondsstrspeakerNzOptional[str]rS   default_factoryList[float]inter_chunk_ms        tail_emit_speed_xrQ   final_metrics)r   r   r   r    rS   r   listrr   rt   dictru   r!   r!   r!   r"   rd      s   
 rd   c                   @  s   e Zd ZU ded< ded< eedZded< dZded	< eedZ	d
ed< e
d1ddZe
d1ddZe
d2ddZe
d2ddZd3ddZd4ddZd5ddZd5dd Zd5d!d"Zd5d#d$Zd5d%d&Zd6d)d*Zd6d+d,Zd7d.d/Zd0S )8LevelResultri   concurrencytotal_requestsro   List[RequestResult]resultsrs   r   wall_time_srQ   gpu_summaryr,   c                 C     dd | j D S )Nc                 S  s   g | ]}|j r|qS r!   rf   r>   rr!   r!   r"   r@          z)LevelResult.successes.<locals>.<listcomp>r|   r4   r!   r!   r"   	successes      zLevelResult.successesc                 C  r   )Nc                 S  s   g | ]}|j s|qS r!   r   r   r!   r!   r"   r@      r   z(LevelResult.failures.<locals>.<listcomp>r   r4   r!   r!   r"   failures   r   zLevelResult.failuresc                 C  s   | j rt| jt| j  S dS Nrs   )r|   rJ   r   r4   r!   r!   r"   success_rate   s   zLevelResult.success_ratec                 C  s   | j dkr| j| j  S dS )Nr   rs   )r}   rz   r4   r!   r!   r"   throughput_rps   s   zLevelResult.throughput_rpsvalsrq   r?   c                 C  s0   |sdS t |}tt|d |d  }|| S )Nrs   rC   d   )sortedri   rJ   )r*   r   r?   rV   idxr!   r!   r"   _pct   s
   zLevelResult._pctDict[str, float]c                 C  s0   |si S t || |d| |dt|dS )N2   _   )rY   p50p95rZ   )r^   r_   r   rZ   )r*   r   r!   r!   r"   _stats   s   

zLevelResult._statsc                 C     |  dd | jD S )Nc                 S  rT   r!   )rg   r   r!   r!   r"   r@      rW   z-LevelResult.latency_stats.<locals>.<listcomp>r   r   r4   r!   r!   r"   latency_stats      zLevelResult.latency_statsc                 C  r   )Nc                 S     g | ]
}|j d kr|j qS r   )rh   r   r!   r!   r"   r@          z*LevelResult.ttfb_stats.<locals>.<listcomp>r   r4   r!   r!   r"   
ttfb_stats   r   zLevelResult.ttfb_statsc                 C  r   )Nc                 S  s   g | ]}t |jqS r!   )r   rj   r   r!   r!   r"   r@      r   z1LevelResult.chunk_count_stats.<locals>.<listcomp>r   r4   r!   r!   r"   chunk_count_stats   r   zLevelResult.chunk_count_statsc                 C  r   )Nc                 S  r   r   )rt   r   r!   r!   r"   r@      r   z0LevelResult.emit_speed_stats.<locals>.<listcomp>r   r4   r!   r!   r"   emit_speed_stats   r   zLevelResult.emit_speed_statsc                 C  s&   g }| j D ]}||j q| |S N)r   extendrr   r   )r*   r   r   r!   r!   r"   inter_chunk_stats   s   

zLevelResult.inter_chunk_statskeyrm   c                 C  sB   g }| j D ]}|j|}t|ttfr|t| q| |S r   )r   ru   get
isinstanceri   r   rK   r   )r*   r   r   r   vr!   r!   r"   timing_stats   s   

zLevelResult.timing_statsc                 C  sR   g }| j D ]}|j|}t|ttfr#|tdt|jt|  q| 	|S r   )
r   ru   r   r   ri   r   rK   rZ   rg   r   )r*   r   r   r   baser!   r!   r"   latency_over_metric_stats   s   

z%LevelResult.latency_over_metric_statsDict[str, int]c                 C  sB   i }| j D ]}|jpd  }|sd}||dd ||< q|S )Nunknownr   rC   )r   rS   r=   lowerr   )r*   countsr   reasonr!   r!   r"   failure_reason_counts   s   
z!LevelResult.failure_reason_countsN)r,   r{   )r,   r   )r   rq   r?   ri   r,   r   )r   rq   r,   r   )r,   r   )r   rm   r,   r   )r,   r   )r   r   r   r    r   rv   r|   r}   rw   r~   propertyr   r   r   r   r   r   r   r   r   r   r   r   r   r   r!   r!   r!   r"   rx      s0   
 









rx   
model_pathrm   gpu_memr   stream_output_kindnum_enginesri   max_num_batched_tokensOptional[int]max_num_seqsenable_chunked_prefillOptional[bool]enable_prefix_cachingdisable_log_statsenforce_eagerr,   r-   c
                 C  s`   ddl m}
 ddlm}m} |tjd< |
| \}}| s.|||d||||||||	dd d S d S )Nr   )resolve_model_paths)initialize_runtimeis_initializedVEENA3_STREAM_OUTPUT_KINDcudaF)r   bicodec_pathdevicegpu_memory_utilizationr   r   r   r   r   r   r   precompute_speaker_globals)veena3modal.local_serverr    veena3modal.services.tts_runtimer   r   osenviron)r   r   r   r   r   r   r   r   r   r   r   r   r   llm_pathr   r!   r!   r"   init_runtime   s(   

r   r<   rn   
max_tokens	timeout_senable_chunkingre   seedc                   s  ddl m t }d d g d d
i d 	
fdd}zOtj| |dI d H  t }|| d }	d urI| d nd	}
td
d	pSd	}td|	|
 d }|dkrf|| nd	}t	d|	|
 
||t
d
W S  tjy   t	dt | d d	 
d	dt
d
 Y S  ty } z"t	dt | d d	 
d	t|d d t
d
W  Y d }~S d }~ww )Nr   )tts_runtimer,   r-   c                    s   j 	d} | 2 z23 d H W \}}t }d u r|d ur,| d  | d7  
t|7 
t|tr?|q6 d S )N)r<   rn   r   r   r        @@rC   )generate_speech_streamingrL   perf_counterrK   rJ   r   rw   )genaudio_chunkmetricsnowchunksr   ru   first_chunk_trr   r   prev_chunk_tr   rn   r<   rk   r   r!   r"   _consume_stream  s*   
z(run_one_request.<locals>._consume_streamr6   r   rs   audio_duration_secondsgư>T)
rf   rg   rh   rj   rk   rl   rn   rr   rt   ru   Fr7   )
rf   rg   rh   rj   rk   rl   rn   rS   rr   ru      rc   )veena3modal.servicesr   rL   r   asynciowait_forr   r   rZ   rd   rw   TimeoutErrorrM   rm   )r<   rn   r   r   r   r   t0r   endrg   rh   rl   tail_srt   excr!   r   r"   run_one_request  st   $
r   ry   rz   gpu_monitor	seed_basec           
        s   t | |d}t| d
 fdd|j  t }	tjfdd	t|D  I d H |_	t |	 |_
| |_|S )N)ry   rz   iri   r,   rd   c              	     s|   4 I d H * t | tt   }d ur|  nd }t| |dI d H W  d   I d H  S 1 I d H s7w   Y  d S )Nr<   rn   r   r   r   r   )SPEAKERSrJ   r   )r   rn   r   )r   r   r   	semaphorer<   r   r!   r"   wrappedl  s   
0zrun_level.<locals>.wrappedc                   s   g | ]} |qS r!   r!   )r>   r   )r   r!   r"   r@   {  rA   zrun_level.<locals>.<listcomp>)r   ri   r,   rd   )rx   r   	Semaphorer'   clearrL   r   gatherranger|   r}   rb   r~   )
ry   rz   r<   r   r   r   r   r   levelr   r!   )r   r   r   r   r<   r   r   r"   	run_level_  s   


$
r   r   c                 C  sL  t d| j d| j d t dt| j d| j d| jdd t d	| jd
d| jd
d | jrQ| 	 }d
dd t| dd ddD }t d|  |  }|rst d|d dd|d dd|d dd|d d |  }|rt d|d dd|d dd|d dd|d d |  }|rt d|d d d|d d d|d d d|d d  |  }|rt d!|d |d |d |d  |  }|rt d"|d |d |d |d  g d#}|D ](}	| |	}
|
rt |	 d$|
d d
d|
d d
d|
d d
d|
d d
	 q| d%}|r0t d&|d |d |d |d  | d%}| d'}| d(}|r|r|dd)}|dd)}|rZ|dd)nd)}td)|| | }|d*krt d+|| d, || d, || d,  d-| jv r| j}t d.|d/ d |d/ d |d- d |d- d  d S d S )0Nz
=== streaming concurrency=z
 requests=z ===zsuccess=/z (z.0%)zwall=z.2fzs throughput=z req/sz, c                 s  s"    | ]\}}| d | V  qdS ):Nr!   )r>   kr   r!   r!   r"   	<genexpr>  s     zprint_level.<locals>.<genexpr>c                 S  s   | d S )NrC   r!   )kvr!   r!   r"   <lambda>  s    zprint_level.<locals>.<lambda>T)r   reversez
failures: zlatency_ms avg=rY   z.0fz p50=r   z p95=r   z max=rZ   zttfb_ms avg=zchunks_per_req avg=z.1fz:inter_chunk_ms avg={:.1f} p50={:.1f} p95={:.1f} max={:.1f}z=tail_emit_speed_x avg={:.2f} p50={:.2f} p95={:.2f} max={:.2f})timeline_to_first_batch_ms"timeline_to_first_chunk_emitted_mstimeline_total_msllm_time_in_queue_msllm_scheduler_msllm_first_token_msllm_request_lifecycle_msllm_generation_wall_msllm_time_per_token_msllm_batch_wall_ms_totalllm_decode_wall_ms_totaltokens_per_batch_p50bicodec_decode_callsbicodec_decode_wall_ms_totalbicodec_decode_interval_ms_p50"decode_interval_applied_tokens_p50decode_pending_requests_p50admission_wait_msadmission_inflight_on_grantadmission_waiters_on_grantadmission_queue_depth_on_entrybatch_avg_sizebatch_max_seenbatch_workers_livebatch_workers_targetbatch_queue_wait_ms_avgbatch_compute_ms_avgbatch_queue_depth_avgz: avg=r   zElatency_over_timeline_ms: avg={:.2f} p50={:.2f} p95={:.2f} max={:.2f}r  r
  rs   r   z9timeline_share: llm={:.1f}% bicodec={:.1f}% other={:.1f}%r   r\   zHgpu: util_avg={:.0f}% util_max={:.0f}% mem_avg={:.0f}MB mem_max={:.0f}MBr]   )printry   rz   rJ   r   r   r}   r   r   r   r8   r   itemsr   r   r   r   formatr   r   r   r   rZ   r~   )r   reasonsrb   latttfbccicestiming_keysr   stlatency_over_timelinetl_total	llm_totalbicodec_total	total_avgllm_avgbicodec_avg	other_avggr!   r!   r"   print_level  s   (&888
<

	









r-  pathlevelsList[LevelResult]argsargparse.Namespacec                 C  s  i d|j d|jd|jd|jd|jd|jd|jd|jd	|jd
|j	d|j
d|jd|jd|jd|jd|jd|ji d|jd|jd|jd|jd|jd|jd|jd|jd|jd|jd|jd|jd|jd|jd |jd!|j d"|j!|j"|j#|j$t%t&& d#g d$}|D ]}|d '|j(|j)|j*|j+|j,|- |. |/ |0 |1 |j2i d%|3d%d&|3d&d'|3d'd(|3d(d)|3d)d*|3d*d+|3d+d,|3d,d-|3d-d.|3d.d/|3d/d0|3d0d1|3d1d2|3d2d3|3d3d4|3d4d5|3d5|3d6|3d7|3d8|3d9|3d:|3d;|3d<|3d=|3d>|3d?|3d@|4d'dA|5 dBdC |j6D dD qt7| dEdFdG}t8j9||dHdI W d    d S 1 sw   Y  d S )JNr/  r<   r   r   r   requests_multipliermin_requests
gpu_memoryr   chunkingr   r   r   r   disable_chunked_prefilldisable_prefix_cachingdisable_engine_stats_logsr   stream_decode_intervalstream_window_sizestream_min_semantic_firststream_crossfade_msadaptive_decodeadaptive_decode_first_intervaladaptive_decode_busy_intervaladaptive_decode_busy_pendingdisable_windowed_decodedisable_bicodec_batchingbicodec_batch_maxbicodec_batch_timeout_msbicodec_batch_workersbicodec_batch_scale_pendingbicodec_batch_scale_modestream_admission_max_inflight)stream_admission_max_queuestream_admission_max_wait_msstream_admission_poll_msr   )configr/  r   r   r   r   r  r  r  r  r  r  r  r  r	  r
  r  r  r  r  r  r  r  r  r  r  r  r  r  r  )r  r  r  r  r  r  r  r  r  r  r  latency_over_timeline_msc                 S  s>   g | ]}|j |j|j|j|j|j|j|j|j|j	|j
d qS )rf   rg   rh   rj   rk   rl   rn   rS   rr   rt   ru   rO  r   r!   r!   r"   r@   F  s    z save_results.<locals>.<listcomp>)ry   rz   r   r   r}   rg   rh   chunks_per_reqrr   rt   gputimingfailure_reasonsrequestswzutf-8)encodingrD   )indent):r/  r<   r   r7   r   r3  r4  r5  r   r6  r   r   r   r   r7  r8  r9  r   r:  r;  r<  r=  r>  r?  r@  rA  rB  rC  rD  rE  rF  rG  rH  rI  rJ  rK  rL  ri   rL   rK   ry   rz   r   r   r}   r   r   r   r   r   r~   r   r   r   r|   openjsondump)r.  r/  r1  payloadlvlfdr!   r!   r"   save_results  s$  	
 !"#
(+








	








?$r^  c           	        s  t | j }dd | jdD }tdd}|  t|td | j| j	| j
| jdI d H  g }t|D ]9\}}t| j|| j }t|||| j| j	| j
|| jdI d H }t| || |t|d	 k rltd
I d H  q3|  t| j||  td| j  d S )Nc                 S  s    g | ]}|  rt|  qS r!   )r=   ri   )r>   xr!   r!   r"   r@   ^  s     zmain_async.<locals>.<listcomp>rB   r$   )r%   r   r   )ry   rz   r<   r   r   r   r   r   rC   rD   z
results saved: )
TEST_TEXTSr<   r/  rI   r#   r3   r   r   r   r7   r6  r   	enumeraterZ   r4  r3  r   r-  rK   rJ   r   rN   r9   r^  outputr  )	r1  r<   r/  r   r|   r   cnr   r!   r!   r"   
main_async\  sD   

	

re  c                  C  sn  t jdd} | jdddd | jdg dd	d
 | jdtddd | jdtddd | jdtddd | jdtddd | jdtdd | jddddd | jd td!d | jd"d#d$gd#d
 | jd%td&d'd | jd(td d | jd)td d | jd*dd d+ | jd,ddd+ | jd-ddd+ | jd.ddd+ | jd/ddd+ | jd0td1d | jd2td3d | jd4td5d | jd6td7d | jd8ddd+ | jd9td1d | jd:td;d | jd<td=d | jd>ddd+ | jd?ddd+ | jd@tdAd | jdBtdCd | jdDtd&d | jdEtdFd | jdGdHdIgdHd
 | jdJtdFd | jdKtdFd | jdLtdMd | jdNtdOd | jdPtdQd | jdRdSdT |  }dFdUlm} |j	pX|}|j
tjdV< t|jtjdW< t|jtjdX< t|jtjdY< t|jtjdZ< |jrd[nd\tjd]< t|jtjd^< t|jtjd_< t|jtjd`< |jrd\nd[tjda< |jrd\nd[tjdb< t|jtjdc< t|jtjdd< t|jtjde< t|jtjdf< t|jtjdg< t|jtjdh< t|jtjdi< t|jtjdj< t|jtjdk< |j }|j!r	d}t"||j#|j
|j$|j%|j&||j'rdnd |j(r#dlnd |j)r*dlnd dm
 t*+t,| d S )nNz)Direct runtime streaming stress benchmark)descriptionz--levelsz16,32,64z"Comma-separated concurrency levels)defaulthelpz--textr   r   )choicesrg  z--max-tokensi   z'max_tokens sent to streaming generation)typerg  rh  z--seed-basei9  z,Base random seed; request i uses seed_base+iz	--timeoutg     f@zPer-request timeout secondsz--requests-multiplierrD   z.Requests per level ~= concurrency * multiplierz--min-requests   )rj  rg  z
--chunking
store_trueFzEnable long-text chunking path)actionrg  rh  z--gpu-memoryg      ?z--stream-output-kinddelta
cumulativez--num-enginesrC   zNumber of vLLM enginesz--max-num-batched-tokensz--max-num-seqsz--enable-chunked-prefill)rm  rg  z--disable-chunked-prefillz--disable-prefix-cachingz--disable-engine-stats-logsz--enforce-eagerz--stream-decode-interval0   z--stream-window-size   z--stream-min-semantic-first
   z--stream-crossfade-msr   z--adaptive-decodez --adaptive-decode-first-intervalz--adaptive-decode-busy-interval@   z--adaptive-decode-busy-pending    z--disable-windowed-decodez--disable-bicodec-batchingz--bicodec-batch-max   z--bicodec-batch-timeout-msg      @z--bicodec-batch-workersz--bicodec-batch-scale-pendingr   z--bicodec-batch-scale-modestickydynamicz--stream-admission-max-inflightz--stream-admission-max-queuez--stream-admission-max-wait-msrs   z--stream-admission-poll-msg       @z--model-path z--outputzstress_streaming_runtime.json)rg  )DEFAULT_LOCAL_MODEL_DIRr   VEENA3_STREAM_DECODE_INTERVALVEENA3_STREAM_WINDOW_SIZE VEENA3_STREAM_MIN_SEMANTIC_FIRSTVEENA3_STREAM_CROSSFADE_MS10VEENA3_STREAM_ADAPTIVE_DECODE#VEENA3_STREAM_DECODE_INTERVAL_FIRST"VEENA3_STREAM_DECODE_INTERVAL_BUSY!VEENA3_STREAM_DECODE_BUSY_PENDINGVEENA3_STREAM_WINDOWED_DECODEVEENA3_BICODEC_BATCHINGVEENA3_BICODEC_BATCH_MAXVEENA3_BICODEC_BATCH_TIMEOUT_MSVEENA3_BICODEC_BATCH_WORKERS"VEENA3_BICODEC_BATCH_SCALE_PENDINGVEENA3_BICODEC_BATCH_SCALE_MODE$VEENA3_STREAM_ADMISSION_MAX_INFLIGHT!VEENA3_STREAM_ADMISSION_MAX_QUEUE#VEENA3_STREAM_ADMISSION_MAX_WAIT_MSVEENA3_STREAM_ADMISSION_POLL_MST)
r   r   r   r   r   r   r   r   r   r   )-argparseArgumentParseradd_argumentri   r   rm   
parse_argsr   ry  r   r   r   r   r:  r;  r<  r=  r>  r?  r@  rA  rB  rC  rD  rE  rF  rG  rH  rI  rJ  rK  rL  r   r7  r   r5  r   r   r   r8  r9  r   r   rF   re  )parserr1  ry  r   r   r!   r!   r"   main  s   r  __main__)r   rm   r   r   r   rm   r   ri   r   r   r   r   r   r   r   r   r   r   r   r   r,   r-   )r<   rm   rn   rm   r   ri   r   r   r   re   r   r   r,   rd   )ry   ri   rz   ri   r<   rm   r   ri   r   r   r   re   r   r#   r   r   r,   rx   )r   rx   r,   r-   )r.  rm   r/  r0  r1  r2  r,   r-   )r1  r2  r,   r-   rc   )+__doc__
__future__r   r  r   rY  r   r^   rE   sysr0   rL   dataclassesr   r   typingr   r   r   r   r.  dirnameabspath__file__	REPO_ROOTr8   EXTERNAL_DIRinsertr   r`  r   r   r#   rd   rx   r   r   r   r-  r^  re  r  r   r!   r!   r!   r"   <module>   sT   "

B
T
#
Y
"
k
p
'W
