o
    پi&A                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZmZmZmZ dZded	efd
dZdd Zddee ded	dfddZG dd dZG dd deZeZdS )    N)Path)SimpleNamespace)temp_set_env)kill_process_tree)!DEFAULT_TIMEOUT_FOR_SERVER_LAUNCHDEFAULT_URL_FOR_TESTCustomTestCasepopen_launch_serverg?error_outputreturnc                 C   s    d| v od| v od| v pd| v S )z6Check if error is due to MMMU parquet file corruption.ArrowInvalidzParquet magic bytes not foundMMMUzlmms-lab--MMMU )r
   r   r   Q/home/ubuntu/.local/lib/python3.10/site-packages/sglang/test/kits/mmmu_vlm_kit.py_is_mmmu_parquet_corruption   s
   r   c               
   C   s   t d} |  r| }ntjdtjd}t |d d }| rVtd|  zt	| td|  W dS  t
yU } ztd	| d
|  W Y d}~dS d}~ww td| d dS )z>Clean up corrupted MMMU dataset cache to allow fresh download.z%/hf_home/hub/datasets--lmms-lab--MMMUHF_HOMEz~/.cache/huggingfacehubzdatasets--lmms-lab--MMMUz4Detected corrupted MMMU parquet cache. Cleaning up: z&Successfully removed corrupted cache: Tz Warning: Failed to remove cache : NFzMMMU cache not found at z, skipping cleanup)r   existsosenvirongetpath
expanduserprintshutilrmtreeOSError)
ci_hf_homemmmu_cache_pathhf_homeer   r   r   _cleanup_mmmu_dataset_cache    s$   
r"     cmdtimeoutc                 C   s  zt j| d|ddd}|j|j }t|rjtd t rZtd tddd& t j| d|ddd}|jr;t|jdd	 |jrEt|jdd	 W d
   n1 sOw   Y  W d
S W d
S td|j d|j  td|jrtt|jdd	 |jrt|jdd	 W d
S W d
S  t j	y } zk|j|j }t|rtd t rtd tddd& t j| d|ddd}|jrt|jdd	 |jrt|jdd	 W d
   n1 sw   Y  ntd|j d|j   td|j d|j   W Y d
}~d
S d
}~ww )zFRun lmms_eval command with automatic retry on MMMU parquet corruption.T)checkr%   capture_outputtextzHDetected MMMU parquet corruption error in output. Attempting recovery...z)Retrying lmms_eval with fresh download...0force_redownload)HF_HUB_OFFLINEHF_DATASETS_DOWNLOAD_MODE )endNzGFailed to cleanup corrupted MMMU cache. Output from lmms_eval:
Stdout:
z	
Stderr:
z&Failed to cleanup corrupted MMMU cachez>Detected MMMU parquet corruption error. Attempting recovery...zFFailed to cleanup corrupted MMMU cache. Error from lmms_eval:
Stdout:
z2lmms_eval failed with an unhandled error.
Stdout:
)

subprocessrunstdoutstderrr   r   r"   r   RuntimeErrorCalledProcessError)r$   r%   resultcombined_outputretry_resultr!   r
   r   r   r   _run_lmms_eval_with_retry9   s   	
(
r8   c                   @   sR   e Zd ZU dZeed< g Zee ed< dZ	de
dedefdd	Zde
fd
dZdS )	MMMUMixinzMixin for MMMU evaluation.

    Use with MMMUServerBase for single-model tests:
        class TestMyModel(MMMUMixin, MMMUServerBase):
            model = "my/model"
            accuracy = 0.4
    accuracy	mmmu_args	sk-123456selfmodel_versionoutput_pathc           
      C   s   d}d}d}d}d}t j|dd d| d| }d	d
dd|d|d|dt|dd|dt|g| j}	t| j| j dd t|	 W d   dS 1 sMw   Y  dS )
        Evaluate a VLM on the MMMU validation set with lmms-eval.
        Only `model_version` (checkpoint) and `chat_template` vary;
        We are focusing only on the validation set due to resource constraints.
        openai_compatible   mmmu_val@   Texist_okmodel_version="",tp=python3-m	lmms_eval--model--model_args--tasks--batch_size--log_samples--log_samples_suffix--output_path/v1)OPENAI_API_KEYOPENAI_API_BASEN)r   makedirsstrr;   r   api_keybase_urlr8   )
r=   r>   r?   modeltptasks
batch_size
log_suffix
model_argsr$   r   r   r   run_mmmu_eval   s@   

"zMMMUMixin.run_mmmu_evalc                 C   s  t  |}| | j| tj| ddd}|s t| d}|s)td| |d }t|d}t|}t	d|  W d	   n1 sIw   Y  |d
 d d }t	d| j d|d | 
|| jd| j d|dd| jdd W d	   d	S 1 sw   Y  d	S )zRun MMMU evaluation test.
/**/*.jsonT	recursive/*.jsonNo JSON result files found in r   rzResult: NresultsrC   mmmu_acc,noneModel z achieved accuracy: .4f accuracy () below expected threshold ())tempfileTemporaryDirectoryr`   rZ   globFileNotFoundErroropenjsonloadr   assertGreaterEqualr:   )r=   r?   result_filesresult_file_pathfr5   mmmu_accuracyr   r   r   	test_mmmu   s(   

"zMMMUMixin.test_mmmuN)__name__
__module____qualname____doc__float__annotations__r;   listrW   rX   r   r`   rz   r   r   r   r   r9      s   
 
2r9   c                   @   sn   e Zd ZdZdZg Zg Zedd Zedd Z	ddde
d	e
d
edB fddZ				dddZdd ZdS )MMMUMultiModelTestBasea   Base class for multi-model MMMU tests.

    This class is for tests that need to evaluate multiple models,
    starting and stopping a server for each model within the test method.
    For single-model tests, use MMMUMixin with MMMUServerBase instead.
    Nc                 C   sf   t | _d| _t| _| jd u rttd| _tj	
d| _tj	
d| _| jtj	d< | j dtj	d< d S )Nr<   )mem_fraction_staticrT   rU   rS   )r   rY   rX   r   time_outparsed_argsr   DEFAULT_MEM_FRACTION_STATICr   r   r   _original_openai_api_key_original_openai_api_baseclsr   r   r   
setUpClass   s   
z!MMMUMultiModelTestBase.setUpClassc                 C   s^   | j d ur| j tjd< n	dtjv rtjd= | jd ur"| jtjd< d S dtjv r-tjd= d S d S )NrT   rU   )r   r   r   r   r   r   r   r   tearDownClass  s   



z$MMMUMultiModelTestBase.tearDownClass)envr>   r?   r   c                C   sp   d}d}d}d}d}t j|dd d| d| }	d	d
dd|d|	d|dt|dd|dt|g| j}
t|
 dS )r@   rA   rB   rC   rD   TrE   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   N)r   rV   rW   r;   r8   )r=   r>   r?   r   rZ   r[   r\   r]   r^   r_   r$   r   r   r   r`     s6   z$MMMUMultiModelTestBase.run_mmmu_evalr-   infoFc                 C   s   t d|j |  d}d}d}	zzEtj }
|r |
| d|
d< d}d}|r4tdd}td	d}t|j| j| j	| j
d
ddddt| jjd|g| j|
|rT||fndd}| |j| tj| ddd}|sst| d}|s|td| |d }t|d}t|}t d| d|  W d   n1 sw   Y  |d d d }t d|j d| d|d |r|r|  }	| ||jd|j d |dd!|jdd"|  |	W W |dur| du rt d#|j  zt|j W n ty } zt d$|  W Y d}~nd}~ww |rW|r|  |r%|  d%D ]0}ztj|r6t| W q' tyV } zt d&| d|  W Y d}~q'd}~ww S S  ty } z!t d'|j | d|  | d(|j | d|  W Y d}~nd}~ww W |dur| du rt d#|j  zt|j W n ty } zt d$|  W Y d}~nd}~ww |r|r|  |r|  d%D ]1}ztj|rt| W q ty  } zt d&| d|  W Y d}~qd}~ww dS dS |dur;| du r;t d#|j  zt|j W n ty: } zt d$|  W Y d}~nd}~ww |r~|rE|  |rL|  d%D ]0}ztj|r]t| W qN ty} } zt d&| d|  W Y d}~qNd}~ww w w ))a  
        Common method to run VLM MMMU benchmark test.

        Args:
            model: Model to test
            output_path: Path for output logs
            test_name: Optional test name for logging
            custom_env: Optional custom environment variables
            log_level: Log level for server (default: "info")
            capture_output: Whether to capture server stdout/stderr
        z
Testing model: Nr   r-   1SGLANG_USE_CUDA_IPC_TRANSPORT/tmp/server_stdout.logw/tmp/server_stderr.logz--trust-remote-codez--cuda-graph-max-bs64z--enable-multimodalz--mem-fraction-staticz--log-level)rY   r%   rX   
other_argsr   return_stdout_stderrra   Trb   rd   re   rf   Resultz
: rg   rC   rh   ri   z achieved accuracyr   rj   rk   rl   rm   zCleaning up process zError killing process: )r   r   zError removing zError testing zTest failed for ) r   rZ   r   r   copyupdaterr   r	   rY   r   rX   rW   r   r   r   r`   rp   rq   rs   rt   _read_output_from_filesru   ry   pollpidr   	Exceptioncloser   r   removefail)r=   rZ   r?   	test_name
custom_env	log_levelr'   processry   server_outputprocess_envstdout_filestderr_filerv   rw   rx   r5   r!   filenamer   r   r   _run_vlm_mmmu_test>  s  




	
"
"(
"
"z)MMMUMultiModelTestBase._run_vlm_mmmu_testc                 C   s   g }ddg}|D ]P\}}z.t j|r9t|d}|D ]}|| d|   qW d    n1 s4w   Y  W q tyX } ztd|  d|  W Y d }~qd }~ww d	|S )N)r   z[STDOUT])r   z[STDERR]rf    zError reading z file: 
)
r   r   r   rr   appendrstripr   r   lowerjoin)r=   output_lines	log_filesr   tagrx   liner!   r   r   r   r     s$   $
z.MMMUMultiModelTestBase._read_output_from_files)r-   Nr   F)r{   r|   r}   r~   r   r   r;   classmethodr   r   rW   dictr`   r   r   r   r   r   r   r      s.    


2
r   )r#   )rp   rs   r   r   r/   rn   pathlibr   typesr   sglang.srt.environr   sglang.srt.utilsr   sglang.test.test_utilsr   r   r   r	   r   rW   boolr   r"   r   intr8   r9   r   MMMUVLMTestBaser   r   r   r   <module>   s&    	Hb o