o
    iYL                     @  s  d Z ddlmZ ddlZddlZddlZddlZddlZddlZddl	Z	ddl
Z
ddlmZ ddlmZ ddlZejejdd edZd	Zd
Zd$ddZeG dd dZedddedddedddgZG dd dZd%dd Zd!d" Zed#kre  dS dS )&a  Vast.ai benchmark orchestrator: deploy benchmarks on different GPU types.

Workflow:
  1. Search for cheapest offers matching target GPU types
  2. Spin up SSH instances with our pipeline image
  3. Upload code + .env, run gpu_benchmark.py on each
  4. Collect results, compare cross-GPU
  5. Tear down instances

Usage:
  python scripts/vastai_benchmark.py --api-key <key> --gpus RTX_4090,A100_SXM4
  python scripts/vastai_benchmark.py --api-key <key> --list-gpus  # show available
    )annotationsN)	dataclass)Pathz'%(asctime)s [%(levelname)s] %(message)s)levelformatvastai_benchzhttps://console.vast.ai/api/v0z%bharathkumar192/codecbench-sft:latestlog_pathr   returndictc                   s   |    td fd( fdd}i d|dtdd|d	td
d|dtd
d|dtdd|dtdd|dtd
d|dtd
d|dtd
d|dtd
d|dtd
d|dtdd|dtd
d |d!td
d"d# v d$|d%td
d&|d'td
S ))Npatternstrc                   s"   t |  }|s
|S ||dS )N   )researchgroup)r   castdefaultmtext scripts/vastai_benchmark.pygrab)   s   z&_parse_sft_benchmark_log.<locals>.grabgpu_namezGPU:\s+(.+) vram_peak_mbz$VRAM:\s+([0-9.]+)\s+/\s+[0-9.]+\s+MBg        vram_total_mbz$VRAM:\s+[0-9.]+\s+/\s+([0-9.]+)\s+MB
batch_sizezBatch size:\s+([0-9]+)r   download_workerszDownload workers:\s+([0-9]+)
download_szDownload:\s+([0-9.]+)sdecode_szDecode:\s+([0-9.]+)sencode_szEncode:\s+([0-9.]+)supload_szUpload:\s+([0-9.]+)saudio_szAudio:\s+([0-9.]+)ssegmentszSegments:\s+([0-9]+)
encode_rtfz'Encode:\s+[0-9.]+s\s+\(RTF=([0-9.]+)x\)effective_per_shard_sz!Effective per shard:\s+([0-9.]+)snext_download_readyz*Next download ready by upload finish: True	eta_100_hzETA\s+100 GPUs:\s+([0-9.]+)h	eta_200_hzETA\s+200 GPUs:\s+([0-9.]+)h)r   r   )	read_textfloatr   int)r   r   r   r   r   _parse_sft_benchmark_log&   sF   	
r-   c                   @  s&   e Zd ZU ded< ded< ded< dS )	GPUTargetr   namer+   min_vram_gb	max_priceN)__name__
__module____qualname____annotations__r   r   r   r   r.   C   s   
 r.   RTX 4090   g333333?RTX 3090g?RTX 3090 Tig?c                   @  s   e Zd Zedfd7ddZd8ddZd9ddZd9ddZd:ddZd;d<ddZ	d=d>d#d$Z
d?d%d&Z	'd@dAd)d*ZdBd+d,Zd9d-d.Z	/dCdDd5d6Zd/S )EVastAIBenchOrchestratorFapi_keyr   docker_image
sync_localboolc                 C  sR   || _ || _|| _d| dd| _i | _tt jj| _	t
 d d | _d S )NzBearer zapplication/json)AuthorizationzContent-Type.ssh
id_ed25519)r;   r<   r=   headers	instancesr   __file__resolveparentproject_roothomeidentity_file)selfr;   r<   r=   r   r   r   __init__S   s   z VastAIBenchOrchestrator.__init__targetr.   r	   
list[dict]c                 C  s   t jt d| jddddiddiddiddid|jid|jd	 id
|jiddiddiddiddiddggdd}|  | 	dg }|S )z#Find cheapest offer for a GPU type.z	/bundles/   z	on-demandeqTFr   gte   ltegtgffffff?   2   g      (@	dph_totalasc)limittypeverifiedrentablerentednum_gpusr   gpu_ramrV   reliability	inet_down
disk_space	cuda_versorderrB   jsonoffers)
requestspostBASE_URLrB   r/   r0   r1   raise_for_statusre   get)rJ   rL   resprf   r   r   r   search_offers_   s,   z%VastAIBenchOrchestrator.search_offersNonec                 C  s   g d}t ddddddddd	dd
d t d |D ]C}t|dd}| |}|rXtdd |D }|d ddd }t d|ddt|dd|dd|dd	 qt d|dd qt   dS )z5Show what GPU types are available and cheapest price.)r6   r8   r9   L40Sz	A100 SXM4z	A100 PCIe
GPU<20 Offers>7Cheapestz>10VRAM>8z2--------------------------------------------------r   i  c                 s  s    | ]}|d  V  qdS )rV   Nr   ).0or   r   r   	<genexpr>   s    z>VastAIBenchOrchestrator.list_available_gpus.<locals>.<genexpr>r^   rQ     z<18z>5z $z>8.3fz/hr z>6.0fGBz     0    ---       ---N)printr.   rm   minrk   len)rJ   	gpu_namesgnrL   rf   cheapestvramr   r   r   list_available_gpusy   s   (
0
z+VastAIBenchOrchestrator.list_available_gpusc                 C  s   t  d d }| st  d d }| s<td tjddddtt  d d	 d
dgddd t  d d }| 	 }t
jt d| jd}|jrT| ng }|D ]}|dd |ddv rntd  dS qXt
jt d| jd|id}|jrtd dS td|j dS )z.Ensure our SSH key is registered with Vast.ai.r@   zid_ed25519.pubz
id_rsa.pubz!Generating SSH key for Vast.ai...z
ssh-keygenz-ted25519z-frA   z-Nr   T)checkcapture_outputz/ssh/rB   NrU   
public_keyz'SSH key already registered with Vast.aissh_keyrd   zSSH key registered with Vast.aizFailed to register SSH key: %s)r   rH   existsloggerinfo
subprocessrunr   r*   striprg   rk   ri   rB   okre   rh   warningr   )rJ   pub_key_pathpub_keyrl   existingkr   r   r   ensure_ssh_key   s4   
 
z&VastAIBenchOrchestrator.ensure_ssh_keyoffer_idr,   r   
int | Nonec              	   C  s   | j d }i }| r/|  D ]}| }|r.|ds.d|v r.|dd\}}|||< qtjt	 d| d| j
| jddd	| |d
d}|jsUtd||j dS | }	|	d}
|
rntd|
|| j |
| j|< |
S )zECreate a Vast.ai instance from an offer using pre-built Docker image..env#=r   z/asks//rU   
ssh_directzbench-)imagediskruntypelabelenvrd   z$Failed to create instance for %s: %sNnew_contractz&Created instance %s for %s (image: %s))rG   r   r*   
splitlinesr   
startswithsplitrg   putri   rB   r<   r   r   errorr   re   rk   r   rC   )rJ   r   r   env_fileenv_varsliner   vrl   datainstance_idr   r   r   create_instance   s<   


z'VastAIBenchOrchestrator.create_instanceX  r   timeoutdict | Nonec                 C  s   t   }t   | |k rptjt d| d| jd}|js#t d q| d| }|dd}|dkrO|d	rO|d
rOt	d||d
 |d	  |S |dv r\t
d|| dS t	d|| t d t   | |k st
d| dS )z2Wait until instance is running and SSH-accessible./instances/r   r   
   rC   actual_statusr   runningssh_hostssh_portz)Instance %s is running: ssh -p %s root@%s)exitedofflinezInstance %s failed: %sNz"Instance %s status: %s, waiting...   zInstance %s timed out)timerg   rk   ri   rB   r   sleepre   r   r   r   )rJ   r   r   t0rl   inststatusr   r   r   wait_for_instance   s0   

z)VastAIBenchOrchestrator.wait_for_instance   r   r   c                 C  s   t   }t   | |k rTz1tjddddddt| jdt|d| dgd	d	d
d}|jdkr<td|||j	  W d	S W n	 t
yF   Y nw t d t   | |k sdS )zBWait until SSH is responsive (image has everything pre-installed).ssh-oStrictHostKeyChecking=nozConnectTimeout=10-i-proot@zpython3 --versionTrN   r   r   r   r   zSSH ready on %s:%s (%s)r   F)r   r   r   r   rI   
returncoder   r   stdoutr   	Exceptionr   )rJ   r   r   r   r   resultr   r   r   wait_for_ssh   s*   


z$VastAIBenchOrchestrator.wait_for_sshc                 C  s   t | jd }g d}ddddd| j d| g||d	| d
}tj|dddd}|jdkr>td|jdd  dS t	d| dS )z@Optionally sync local code changes over the baked-in image code.r   )
z--exclude=venv/z--exclude=repos/z--exclude=data/z--exclude=results/z--exclude=metafiles/z--exclude=__pycache__/z--exclude=.git/z--exclude=*.pycz--exclude=.cursor/z--exclude=models/rsyncz-avzz--timeout=60z-ezssh -i z  -o StrictHostKeyChecking=no -p r   z:/app/Tx   r   r   zrsync failed: %sNFzCode synced to %s)
r   rG   rI   r   r   r   r   r   stderrr   )rJ   r   r   srcexcludescmdr   r   r   r   	sync_code   s"   

z!VastAIBenchOrchestrator.sync_code   r   c                 C  s  | j d }| r9tjddt| jdddt|t|d| dg	d	d	d
d}|jdkr9td||j	dd  dS d| d|
dd d}td|| z:tjddt| jdddddt|d| |gd	d	dd}t|jdd  |jdkrtd|j	dd  W dS W n tjy   td| Y dS w zCtjddt| jdddt|d| dd|
dd dg	d	d	d d}	|	jdkrtd|
dd d}
||t|
t|
d!W S W dS  ty } ztd"|| W Y d}~dS d}~ww )#z?Run the SFT benchmark on the remote instance and fetch the log.r   scpr   r   r   z-Pr   z
:/app/.envT<   r   r   zFailed to copy .env to %s: %sr   Nzicd /app && mkdir -p /tmp/pipeline && python3 -m codecbench.pipeline.cli sft-run --benchmark --batch-size=z --offer-id vast_bench_rs   _z" 2>&1 | tee /tmp/sft_benchmark.logz&Running SFT benchmark on %s (BS=%d)...r   zServerAliveInterval=30r   i   iHzBenchmark failed: %szBenchmark timed out on %sz:/tmp/sft_benchmark.logz/tmp/bench_z.log   )r   r   r   summaryz#Failed to fetch results from %s: %s)rG   r   r   r   r   rI   r   r   r   r   replacer   r~   r   TimeoutExpiredr   r-   r   )rJ   r   r   r   r   env_pathput_env
remote_cmdr   fetchr   er   r   r   run_benchmark_remote  sr   


	


z,VastAIBenchOrchestrator.run_benchmark_remotec                 C  sF   t jt d| d| jd}|jrtd| d S td||j d S )Nr   r   r   zDestroyed instance %szFailed to destroy %s: %s)	rg   deleteri   rB   r   r   r   r   r   )rJ   r   rl   r   r   r   destroy_instanceW  s   z(VastAIBenchOrchestrator.destroy_instancec                 C  s,   | j  D ]	\}}| | q| j   d S N)rC   itemsr   clear)rJ   gpuiidr   r   r   destroy_alla  s   z#VastAIBenchOrchestrator.destroy_allNtargetslist[GPUTarget]batch_sizeslist[int] | Nonedict[str, list[dict]]c                 C  s  |du rg d}|    i }|D ]}td td|j|j | |}|s/td|j q|d }td|d |d	 |d
 |ddd  | |d |j}|sTqzx| 	|}|snW | 
| |j| jv rm| j|j= q|d }	|d }
| |	|
std|j W | 
| |j| jv r| j|j= q| jr| |	|
 g }|D ]}| j|	|
|j|d}|r|| q|||j< W | 
| |j| jv r| j|j= q| 
| |j| jv r| j|j= w |S )u\   Full multi-GPU benchmark: search → create → setup → benchmark → collect → destroy.N)r   r      z	
{'='*60}z"Searching for %s (max $%.2f/hr)...z No offers found for %s, skippingr   z,Best offer: ID=%s, $%.3f/hr, %s, %.0fGB VRAMidrV   r   r^   rQ   r   r   zSSH timed out on %s, skipping)r   )r   r   r   r/   r1   rm   r   rk   r   r   r   rC   r   r   r=   r   r   append)rJ   r   r   all_resultsrL   rf   offerr   r   r   r   gpu_resultsbsr   r   r   r   run_full_benchmarkf  sp   










z*VastAIBenchOrchestrator.run_full_benchmark)r;   r   r<   r   r=   r>   )rL   r.   r	   rM   )r	   rn   )r   r,   r   r   r	   r   )r   )r   r,   r   r,   r	   r   )r   )r   r   r   r,   r   r,   r	   r>   )r   r   r   r,   r	   r>   )r   )
r   r   r   r,   r   r   r   r,   r	   r   )r   r,   r	   rn   r   )r   r   r   r   r	   r   )r2   r3   r4   DOCKER_IMAGErK   rm   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r:   R   s    



!#

A

r:   r   r   rn   c                 C  s&  t dd  t d t d  t dddddd	dd
ddddddddddddddddddd t dd  |  D ]F\}}|D ]?}|d }t d|dd|d d	d|d dd|d dd|d dd|d dd|d dd|d  dd|d! dd" qIqCt d d d#S )$z!Print cross-GPU comparison table.rp   zd====================================================================================================z$  SFT CROSS-GPU BENCHMARK COMPARISONr|   rq   rr   rs   BSz>3VRAM_pkrx   Enc_RTFdl_sz>6dec_senc_sru   eff_sETA100z_-----------------------------------------------------------------------------------------------r   r   r   z>7.0fr%   z>8.1fr   z>6.1fr    r!   z>7.1fr&   r(   hN)r~   r   )r   r   resultsrsr   r   r   print_comparison  s2   
Pr	  c            	   
     s  t jdd} | jdtddd | jdtd dd	 | jd
ddd | jdtdd | jdtddd	 | jdtddd	 | jdttdt dd	 | jdddd | jdtdd |   t j	 j
 jd} jrn|  d S  jrdd   jd!D } fd"d |D }nt}d#d   jd!D }zB|||}|rt| t j}|jjddd$ t|d%}tj||d&d' W d    n1 sw   Y  td(| W d S W d S  ty   td) |  Y d S  ty } ztj d*|dd+ |   d }~ww ),Nz(Vast.ai multi-GPU benchmark orchestrator)descriptionz	--api-keyTzVast.ai API key)rY   requiredhelpz--gpusz3Comma-separated GPU names (e.g. RTX_4090,A100_SXM4))rY   r   r  z--list-gpus
store_truezList available GPUs and prices)actionr  z--num-videosr   )rY   r   z--batch-sizesz1,2,4z#Comma-separated batch sizes to testz--max-priceg      ?zMax $/hr per GPUz--imagezDocker image to use (default: )z--sync-localz5Rsync local code changes over the baked-in image codez--outputzresults/vastai_benchmark.json)r<   r=   c                 S  s   g | ]}|  qS r   )r   ry   gr   r   r   
<listcomp>      zmain.<locals>.<listcomp>,c                   s   g | ]	}t |d  jqS )r   )r.   r1   r  argsr   r   r    s    c                 S  s   g | ]}t |qS r   )r,   )ry   xr   r   r   r    r  )parentsexist_okwr   )indentzAll results saved to %sz(Interrupted! Destroying all instances...zFatal error: %s)exc_info)!argparseArgumentParseradd_argumentr   r,   r+   r   
parse_argsr:   r;   r   r=   	list_gpusr   gpusr   DEFAULT_TARGETSr   r   r	  r   outputrF   mkdiropenre   dumpr   r   KeyboardInterruptr   r   r   )	parserorchr   r   r   r   outfr   r   r  r   main  sd   






r-  __main__)r   r   r	   r
   )r   r   r	   rn   )__doc__
__future__r   r  re   loggingosr   r   sysr   dataclassesr   pathlibr   rg   basicConfigINFO	getLoggerr   ri   r   r-   r.   r#  r:   r	  r-  r2   r   r   r   r   <module>   s@   




  
[5
