o
    پi^1                    @   s  d Z ddlZddlZddlZddlZddlZddlZddlZddl	Z	ddl
Z
ddlmZmZmZ ddlZddlmZ eeZdZdededee d	ed
ee f
ddZded
ee fddZd
efddZded
ee fddZded
ee fddZ	dddededee d
dfddZded
dfddZ ded
ee fddZ!deded
dfdd Z"d!ed"ed
efd#d$Z#			%dededee dee d	ed
eeee f f
d&d'Z$ded(ee d)ed
efd*d+Z%d,ed-ed
efd.d/Z&d0ed1ed
eeee f fd2d3Z'd0ed
efd4d5Z(dfd7ee d8e)d
efd9d:Z*ded
eeee f fd;d<Z+ded(ee d)ed
eeee f fd=d>Z,	%dgded?ed
efd@dAZ-d!ed
efdBdCZ.d!ed
efdDdEZ/ded
eeee f fdFdGZ0ded(ee d
eeee ee f fdHdIZ1d)edJee d
e)fdKdLZ2d)ededMed
dfdNdOZ3d)edPedQee d
efdRdSZ4dTedUee d)ed
efdVdWZ5	ddd)edXee d
efdYdZZ6d)edXee d
e)fd[d\Z7	]dhd)edUee dXee dee d^e)d
efd_d`Z8daed
dfdbdcZ9dS )ia  
CI-specific weight validation and cache cleanup utilities.

This module contains validation and cleanup logic that is ONLY used in CI environments.
These functions handle:
- Validating safetensors files for corruption
- Checking for missing shards in sharded models
- Cleaning up corrupted files (selective or full cache deletion)
- Automatic retry logic for corrupted downloads
- Validating config/tokenizer files completeness to enable offline mode

For regular users, weight_utils.py provides simple download functionality without
the overhead of validation and automatic cleanup. The CI-specific behavior is
gated by is_in_ci() checks in weight_utils.py.
    N)ListOptionalTuple)log_info_on_rank05repo_idfilenamerevisionallow_remote_checkreturnc              
   C   s   |st d| | dS zddlm} | }|j| ||d}t d| ||p%d| |W S  tyH } zt d| ||p:d| W Y d}~dS d}~ww )	aV  
    Check if a file exists on Hugging Face Hub for a specific revision.

    Args:
        repo_id: Repository ID (e.g., "meta-llama/Llama-2-7b-hf")
        filename: File name to check (e.g., "hf_quant_config.json")
        revision: Git revision (commit hash, branch, or tag). None means default branch.
        allow_remote_check: Whether remote checks are allowed (e.g., CI validation phase)

    Returns:
        True if file exists on hub, False if it doesn't exist, None if we cannot determine
        (network error or remote check not allowed - be conservative and assume incomplete)
    z9Remote check disabled for %s/%s, returning None (unknown)Nr   )HfApi)r   r   r	   z0Remote file check: %s/%s (revision=%s) exists=%sdefaultzZFailed to check remote file existence for %s/%s (revision=%s): %s. Will treat as optional.)loggerdebughuggingface_hubr   file_exists	Exception)r   r   r	   r
   r   apiexistse r   `/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/model_loader/ci_weight_validation.py_remote_file_exists)   s<   r   snapshot_dirc                 C   sN   | rt j| s
dS t j| d}t|d dd }d| dS )a  
    Get the path to validation marker file for a snapshot.

    Marker is stored in /tmp to avoid permission issues with HF cache directory.
    Marker key is sha256(snapshot_dir) to avoid any collisions regardless of
    model_name_or_path format.

    Args:
        snapshot_dir: Path to snapshot directory

    Returns:
        Path to marker file or None if snapshot_dir is invalid
    N/utf-8   z/tmp/sglang_hf_validation_.json)	ospathisdirrealpathrstriphashlibsha256encode	hexdigest)r   normalized_dirdir_hashr   r   r   _get_validation_marker_path\   s
   r)   c                  C   s8   t jdt jdd} t j| d}t j|dd |S )a(  
    Get the directory for per-run validation markers.

    These markers are specific to the current CI run and are not shared across
    runners. They are stored in a temporary directory that is cleaned up after
    the run completes.

    Returns:
        Path to per-run marker directory
    RUNNER_TEMPTMPDIRz/tmpsglang_ci_offline_markersTexist_ok)r   environgetr   joinmakedirs)base_dir
marker_dirr   r   r   _get_per_run_marker_diry   s   r5   c                 C   s\   | rt j| s
dS t j| d}t|d dd }t	 }t j
|| dS )aj  
    Get the path to per-run validation marker file for a snapshot.

    Per-run markers are specific to the current CI run and are not shared
    across runners. This prevents cross-runner cache state pollution.

    Args:
        snapshot_dir: Path to snapshot directory

    Returns:
        Path to per-run marker file or None if snapshot_dir is invalid
    Nr   r   r   r   )r   r   r    r!   r"   r#   r$   r%   r&   r5   r1   )r   r'   r(   r4   r   r   r   _get_per_run_marker_path   s   r6   c              
      s   t | }|rtj|sdS zCt|ddd}t| W d   n1 s&w   Y  t ts3W dS g d}t	 fdd|D sEW dS  
dd	urOW dS  W S  tyk } ztd
|| W Y d}~dS d}~ww )z
    Read per-run validation marker for a snapshot.

    Args:
        snapshot_dir: Path to snapshot directory

    Returns:
        Marker dict if exists and valid, None otherwise
    Nrr   encoding)	timestampmodel_idsnapshot_hashvalidation_passedc                 3       | ]}| v V  qd S Nr   ).0kmarkerr   r   	<genexpr>       z'_read_per_run_marker.<locals>.<genexpr>r=   Tz)Failed to read per-run marker from %s: %s)r6   r   r   r   openjsonload
isinstancedictallr0   r   r   r   r   marker_pathfrequired_keysr   r   rB   r   _read_per_run_marker   s(   

rP   r;   required_filesc                 C   sP  t | }|std dS ddlm} tj| }|  d ||d|p&g d}zAtj	|}tj
|dd tjd	d
|ddd}|j}	tj||dd W d   n1 sWw   Y  t|	| td| W dS  ty }
 z1td||
 zdt v rtj|	rt|	 W n	 ty   Y nw W Y d}
~
dS W Y d}
~
dS d}
~
ww )z
    Write per-run validation marker for a snapshot.

    Args:
        snapshot_dir: Path to snapshot directory
        model_id: Model identifier
        required_files: List of required files that were validated
    z1Cannot write per-run marker: invalid snapshot_dirNr   datetimeZT)r:   r;   r<   r=   rQ   r-   wr   F.tmpmoder9   dirdeletesuffix   indentzWrote per-run marker to %sz(Failed to write per-run marker to %s: %s	temp_path)r6   r   r   rS   r   r   basenameutcnow	isoformatdirnamer2   tempfileNamedTemporaryFilenamerG   dumpreplacer   warninglocalsr   remove)r   r;   rQ   rM   rS   r<   rC   r4   rN   r_   r   r   r   r   _write_per_run_marker   sP   


rl   c              
   C   sr   t | }|r5tj|r7zt| td| W dS  ty4 } ztd|| W Y d}~dS d}~ww dS dS )zv
    Remove per-run validation marker for a snapshot.

    Args:
        snapshot_dir: Path to snapshot directory
    zRemoved per-run marker: %sz&Failed to remove per-run marker %s: %sN)	r6   r   r   r   rk   r   r   r   ri   )r   rM   r   r   r   r   _remove_per_run_marker   s   
rm   c              
      s  t | }|sdS tj|sdS z^t|ddd}t| W d   n1 s(w   Y  t ts5W dS g d}t	 fdd|D sGW dS  d t
krYtd	 d t
 W dS  d
durltd d
 W dS  W S  tjtfy } ztd|| W Y d}~dS d}~ww )a  
    Read validation marker for a snapshot.

    Args:
        snapshot_dir: Path to snapshot directory

    Returns:
        Marker dict with keys: version, validated_at, validation_passed
        None if marker doesn't exist or is invalid or validation_passed is not True
    Nr7   r   r8   versionvalidated_atr=   c                 3   r>   r?   r   )r@   keyrB   r   r   rD   &  rE   z*_read_validation_marker.<locals>.<genexpr>ro   z>Validation marker version mismatch: %s != %s, will re-validater=   Tz?Validation marker has validation_passed=%s, treating as invalidz*Failed to read validation marker at %s: %s)r)   r   r   r   rF   rG   rH   rI   rJ   rK   VALIDATION_MARKER_VERSIONr   r   r0   JSONDecodeErrorOSErrorrL   r   rB   r   _read_validation_marker  sB   
ru   passedc           	      C   sF  |sdS t | }|std dS ddlm} t|  d |d}zBtj	|}tj
|dd tjd	d
|ddd}|j}tj||dd W d   n1 sQw   Y  t|| td|| W dS  ty } z1td|| zdt v rtj|rt| W n	 ty   Y nw W Y d}~dS W Y d}~dS d}~ww )a-  
    Write validation marker for a snapshot (atomic write).

    IMPORTANT: We only cache successful validations. Failed validations are NOT
    cached to allow retry after files are downloaded.

    Args:
        snapshot_dir: Path to snapshot directory
        passed: Whether validation passed
    Nz)Cannot write marker: invalid snapshot_dirr   rR   rT   rn   Tr-   rU   r   FrV   rW   r\   r]   z)Wrote validation marker to %s (passed=%s)z+Failed to write validation marker to %s: %sr_   )r)   r   r   rS   rr   ra   rb   r   r   rc   r2   rd   re   rf   rG   rg   rh   r   ri   rj   r   rk   )	r   rv   rM   rS   rC   r4   rN   r_   r   r   r   r   _write_validation_markerB  sN   

rw   	file_path	file_namec              
   C   sN  t j| std||  dS t j| std||  dS zt j| }|dkr3td||  W dS W n tyN } ztd|| W Y d}~dS d}~ww z!t	| dd	d
}t
| W d   W dS 1 siw   Y  W dS  t
jy } ztd|| | W Y d}~dS d}~w ty } ztd|| | W Y d}~dS d}~ww )z
    Validate that a JSON file exists, is non-empty, and can be parsed.

    Args:
        file_path: Path to the JSON file
        file_name: Name of the file (for logging)

    Returns:
        True if the file is valid, False otherwise
    z'CI cache validation: %s not found at %sFz)CI cache validation: %s is not a file: %sr   z$CI cache validation: %s is empty: %sz.CI cache validation: Cannot get size of %s: %sNr7   r   r8   Tz2CI cache validation: %s is not valid JSON: %s - %sz/CI cache validation: Failed to read %s: %s - %s)r   r   r   r   r   isfileri   getsizert   rF   rG   rH   rs   r   )rx   ry   	file_sizer   rN   r   r   r   _validate_json_filez  sZ   
r}   Fc           !   
   C   sB  g }ddg}|D ]}t j| |}t||s|| qt j| d}t j|r3t|ds3|d t j| d}	t j|	}
d}|duoVd|v oVt j| oV|d }|rc|rct|d||d}|d	u r|
s|d
|pod d t	t
d| d|p}d d n3t|	ds|d n(|du r|
rt|	ds|d n|
rt|	ds|d t
d|pd|pd dD ]}t j| |}t j|rt||s|| d  nqt j| d}t j|rt|ds|d t j| d}t j|rt|ds|d t j| d}t j|rzt|ddd}t|}W d   n	1 s.w   Y  |di }|rt|trt }| D ]\}}t|trgd|v rg|dd }|| d  qI|D ]5}t j| |}t j|s|| d! t
d"| d#|   qkt j|s|| d$ qkW n tjttfy } zt
d%| W Y d}~nd}~ww g d&}d}|D ]F}t j| |}t j|rt j|r|d'krt||rd	} n!qzt j|dkrd	}W  nW q ty   Y qw q|s|d( t|dk} | |fS ))a  
    Validate that critical config and tokenizer files exist and are valid.

    This checks for:
    - config.json (required)
    - tokenizer_config.json (required)
    - generation_config.json (optional but validated if present)
    - hf_quant_config.json (conditionally required based on Hub) - for FP4/FP8/ModelOpt
    - quantize_config.json / quant_config.json (optional but validated if present) - for AWQ/GPTQ
    - params.json (optional but validated if present) - for Mistral native format
    - preprocessor_config.json (optional but validated if present) - for vision models
    - trust_remote_code dynamic modules (required if auto_map present in config.json)
    - At least one tokenizer file: tokenizer.json, tokenizer.model, or tiktoken.model

    Args:
        snapshot_dir: Path to the model snapshot directory
        model_id: Model repository ID (e.g., "meta-llama/Llama-2-7b-hf"), used for remote checks
        revision: Git revision (commit hash), used for remote checks
        allow_remote_check: Whether to check Hub for file existence to determine requirements

    Returns:
        Tuple of (is_valid, missing_files)
        - is_valid: True if all required files are present and valid
        - missing_files: List of missing or invalid file names
    config.jsontokenizer_config.jsonzgeneration_config.jsonz+generation_config.json (exists but invalid)hf_quant_config.jsonNr   )r   r   r	   r
   Tz;hf_quant_config.json (required: exists on Hub for revision r   z but missing locally)z!Hub has hf_quant_config.json for z
 revision zH but local snapshot missing it. Cache incomplete, will not write marker.z)hf_quant_config.json (exists but invalid)FzpCannot verify hf_quant_config.json on Hub for %s (revision=%s), treating as optional since remote status unknownunknown)zquantize_config.jsonzquant_config.jsonz (exists but invalid)zparams.jsonz params.json (exists but invalid)preprocessor_config.jsonz-preprocessor_config.json (exists but invalid)r7   r   r8   auto_map.r   .pyz! (required for trust_remote_code)z$Custom module file not in snapshot:  for z (exists but not a file)+Failed to check auto_map in config.json: %stokenizer.jsontokenizer.modelztiktoken.modelr   ztokenizer file)r   r   r1   r}   appendr   isabs
startswithr   r   r   r   rF   rG   rH   r0   rI   rJ   setitemsstrsplitaddrz   rs   rt   KeyErrorr{   len)!r   r;   r	   r
   missing_filesrQ   ry   rx   generation_config_pathhf_quant_config_pathlocal_hf_quant_existsremote_hf_quant_exists
is_hf_repoquant_config_namequant_config_pathparams_json_pathpreprocessor_config_pathconfig_pathrN   configr   custom_filesrq   valuemodule_namecustom_filecustom_file_pathr   tokenizer_filestokenizer_foundtokenizer_filetokenizer_pathis_validr   r   r   $_validate_config_and_tokenizer_files  s   

















r   weight_filesmodel_name_or_pathc                 C   s  |st td|  dS t| }|dur/t| }|r tj|nd}t td| d|  dS tj| }ddl}|jj	 }t td	| d
| d|  t
| |||d\}	}
|	set td|
 d|  dS t| |\}}}|s|t td| d|  dS t td|  t| dd dS )aq  
    Validate local cache completeness (config/tokenizer/weights) and determine
    if offline mode can be safely enabled.

    This function uses a snapshot-level marker to cache validation results,
    so the heavy validation is done at most once per snapshot per runner.

    This function checks:
    1. Validation marker (if exists and version matches, skip re-validation)
    2. Config and tokenizer files (config.json, tokenizer_config.json, etc.)
    3. Weight files (safetensors shards, index files, corruption check)

    If all are present and valid, it returns True to signal that offline
    mode can be safely enabled.

    IMPORTANT: This should be called BEFORE any HF operations, and if it
    returns True, the caller should set HF_HUB_OFFLINE=1 for the server
    subprocess env ONLY (not global environment).

    Args:
        snapshot_dir: Path to the model snapshot directory
        weight_files: List of weight file paths to validate (must be non-empty)
        model_name_or_path: Model identifier for logging

    Returns:
        True if cache is complete and offline mode can be enabled, False otherwise
    zJCI_OFFLINE: No weight files provided, skip offline, keep online allowed - FNr   zCI_OFFLINE: Marker hit (marker=z6), skip re-validation, offline mode will be enabled - Tr   zBCI_OFFLINE: No marker found, performing full validation (snapshot=z, allow_remote_check=z) - r   r;   r	   r
   z+CI_OFFLINE: Missing config/tokenizer files z&, skip offline, keep online allowed - z&CI_OFFLINE: Weight validation failed (z'), skip offline, keep online allowed - zDCI_OFFLINE: Cache validation PASSED, offline mode will be enabled - )rv   )r   r   ru   r)   r   r   r`   huggingface_hub.constants	constantsHF_HUB_OFFLINEr   _validate_sharded_modelrw   )r   r   r   rC   rM   marker_namer	   r   r
   config_validmissing_config_filesweights_valid	error_msg_r   r   r   0ci_validate_cache_and_enable_offline_if_complete  sj   !

r   component_namecomponent_infoc                 C   sH   |   }d|v r
dS d|v rdS d|v rdS d|v rdS d|v r"dS dS )a<  
    Infer component type from component name and info.

    Args:
        component_name: Name of the component (e.g., "scheduler", "tokenizer")
        component_info: Component info from model_index.json (e.g., ["diffusers", "SchedulerClass"])

    Returns:
        Component type string for validation rules
    	scheduler	tokenizerimage_processorfeature_extractor	processormodel)lower)r   r   
name_lowerr   r   r   _infer_component_type  s   r   component_dircomponent_typec           
      C   s  |dkr#ddg}|D ]}t j| |}t||rd|f  S q
d|fS |dkrg d}t j| d}t|dr;d|fS t j| d	}t j|rgt j|rgzt j|d
kr\d|fW S W n	 tyf   Y nw t j| d}t j| d}t|drt j|rd|fS d|fS |dv rddg}|D ]}t j| |}t||rd|f  S qd|fS dg}t j| d}	t|	drd|fS d|fS )a(  
    Check if component has required config files based on type.

    Args:
        component_dir: Path to component directory
        component_type: Type of component (scheduler, tokenizer, processor, model, etc.)

    Returns:
        Tuple of (has_valid_config, list_of_candidates_tried)
    r   zscheduler_config.jsonr~   TFr   )r   r   zvocab.json+merges.txtr   r   r   z
vocab.jsonz
merges.txt)r   r   r   r   )r   r   r1   r}   r   rz   r{   rt   )
r   r   
candidates	candidatecandidate_pathtokenizer_json_pathtokenizer_model_path
vocab_pathmerges_pathr   r   r   r   _check_component_config  sX   




r   c                 C   s4   g d}|D ]}t tj| |}|r dS qdS )z
    Check if component directory has weight files.

    Args:
        component_dir: Path to component directory

    Returns:
        True if weight files found, False otherwise
    )*.safetensors*.binz*.ptz*.pthTF)glob_moduleglobr   r   r1   )r   weight_patternspatternr   r   r   r   _check_component_weights[  s   
r      
componentsmax_showc                 C   sD   t | |krd| S | d| }t | | }d| d| dS )z
    Format component list with truncation.

    Args:
        components: List of component names
        max_show: Maximum number to show before truncating

    Returns:
        Formatted string like "comp1, comp2, comp3" or "comp1, comp2, +3 more"
    , Nz, +z more)r   r1   )r   r   shown	remainingr   r   r   _format_component_listo  s
   
r   c              
   C   s8  t jddk}t j| d}t j|sdS zt|ddd}t|}W d   n1 s/w   Y  W n% tj	t
fyZ } z|rOd	d
| fW  Y d}~S W Y d}~dS d}~ww dd | D }|shdS g }g }g }	g }
| D ]N\}}t j| |}t j|s|| qtt||}t||\}}|s|| |rd|}|	| d| d qt|dv}|rt|}|s|
| qt|s|s|
rg }|rt|}|r|d|  n|d|  |r|rd|	}|d|  nt|}|d|  |
rt|
}|d|  d	d|fS dS )a  
    Validate diffusion model (diffusers pipeline) cache completeness.

    This validation is based on model_index.json as the single source of truth.
    Error reporting uses coarse-grained error codes unless verbose mode is enabled.

    Error codes:
    - DIFFUSERS_INVALID_INDEX: model_index.json missing or corrupted
    - DIFFUSERS_INVALID_COMPONENTS: model_index.json has no valid components
    - DIFFUSERS_MISSING_COMPONENT: component directory or config missing
    - DIFFUSERS_MISSING_WEIGHTS: component weights missing

    Args:
        snapshot_dir: Path to the model snapshot directory

    Returns:
        Tuple of (is_valid, error_message)
        - (True, None) if validation passed
        - (False, error_code_with_components) if validation failed
    SGLANG_CI_VALIDATE_VERBOSE1zmodel_index.json)Fz3DIFFUSERS_INVALID_INDEX: model_index.json not foundr7   r   r8   NFz8DIFFUSERS_INVALID_INDEX: model_index.json parse error - )Fz3DIFFUSERS_INVALID_INDEX: model_index.json corruptedc                 S   s*   i | ]\}}| d st|tr||qS )r   )r   rI   list)r@   rA   vr   r   r   
<dictcomp>  s    z-_validate_diffusion_model.<locals>.<dictcomp>)Fz9DIFFUSERS_INVALID_COMPONENTS: no valid components definedr   z	 (tried: ))r   r   r   r   r   z$DIFFUSERS_MISSING_COMPONENT (dirs): z"DIFFUSERS_MISSING_COMPONENT(dir): z; z'DIFFUSERS_MISSING_COMPONENT (configs): z"DIFFUSERS_MISSING_COMPONENT(cfg): zDIFFUSERS_MISSING_WEIGHTS: z | TN)r   r/   r0   r   r1   r   rF   rG   rH   rs   rt   r   r    r   r   r   r   r   )r   verbosemodel_index_pathrN   model_indexr   r   missing_dirsmissing_configsmissing_configs_verbosemissing_weightsr   r   r   r   has_valid_configconfig_candidatescandidates_strneeds_weightshas_weightserrorsdir_str
config_str
weight_strr   r   r   _validate_diffusion_model  s   





r   c                 C   sz   |sdS t j| }t jddk}t| |||d\}}|s*d|}dd| fS t| |\}}	}
|s;dd|	 fS d	S )
a=  
    Validate cache and return detailed reason for failure.

    This function performs validation without relying on shared validation markers.
    Used by prevalidate_cached_models.py to provide detailed feedback.

    Args:
        snapshot_dir: Path to the model snapshot directory
        weight_files: List of weight file paths to validate
        model_name_or_path: Model identifier for logging

    Returns:
        Tuple of (success, reason):
        - (True, None) if validation passed
        - (False, reason_str) if validation failed with specific reason
    )FzNo weight files providedr   r   r   r   Fz Missing config/tokenizer files: zWeight validation failed: r   )r   r   r`   r/   r0   r   r1   r   )r   r   r   r	   r
   r   r   missing_files_strr   r   r   r   r   r   #validate_cache_with_detailed_reason  s"   

r   requires_hf_quant_configc           &   
      s  ddg}|D ]}t jt j |s dS qg d}t fdd|D }|s*dS t j d}t j|rz|t|ddd	}t|}W d
   n1 sOw   Y  |di }	|	rt	|	t
rt }
|	 D ]\}}t	|trd|v r|dd }|
| d qh|
D ]*}t j |}t j|std|   W dS t j|std|  W dS qW n tjttfy } ztd| W Y d
}~nd
}~ww t j d}t j|}|rLzLt|ddd	}t|}W d
   n1 sw   Y  |di }|r+t| }|D ]}t j |}t j|s)td|   W dS qW n tjttfyK } ztd|| W Y d
}~dS d
}~ww tt j d}|s[dS td}i }|D ]A}t j|}||}|r|d}t|d}t|d}| d| } | |vr|t d|| < ||  d | qd| D ]*\} }!|!d }|!d }"ttd|d }#|#|" }$|$rtdt |$|    dS q|rt j d }%t j|%sdS d!S )"a&  
    Lightweight runtime validation for cache completeness.

    This is used during test runs to ensure the current runner's cache
    is complete before enabling offline mode. Much faster than full validation
    as it only checks file existence, not corruption.

    Args:
        snapshot_dir: Path to the model snapshot directory
        requires_hf_quant_config: If True, hf_quant_config.json must exist
                                  (required for modelopt quantization)

    Returns:
        True if cache is complete, False otherwise
    r~   r   Fr   c                 3   s&    | ]}t jt j |V  qd S r?   )r   r   r   r1   )r@   fnamer   r   r   rD   T  s    
z-validate_cache_lightweight.<locals>.<genexpr>r7   r   r8   Nr   r   r   r   z-Custom module file not in snapshot: %s for %sz,Custom module path exists but not a file: %sr   zmodel.safetensors.index.json
weight_mapz/Index validation failed: missing shard %s in %sz$Failed to validate index file %s: %sr   z"(.*?)-(\d+)-of-(\d+)\.safetensors$   r\      -of-)totalfound_shardsr   r   z7Shard validation failed: missing shards %s in %s for %sr   T)!r   r   r   r1   anyrF   rG   rH   r0   rI   rJ   r   r   r   r   r   r   r   rz   rs   rt   r   valuesr   r   recompiler`   matchgroupintrangesorted)&r   r   rQ   r   r   has_tokenizerr   rN   r   r   r   rq   r   r   r   r   r   
index_path	has_index
index_datar   required_shards
shard_name
shard_pathsafetensors_filesshard_patternshard_groups	base_namer  prefixshard_idtotal_shards	group_key
group_infor   expected_shardsmissing_shardshf_quant_pathr   r   r   validate_cache_lightweight1  s   





r  c              
   C   s   z#t j| ddd}t| }W d   W dS 1 sw   Y  W dS  tyC } ztd| t|jt	| W Y d}~dS d}~ww )z
    Validate that a safetensors file is readable and not corrupted.

    Args:
        file_path: Path to the safetensors file

    Returns:
        True if the file is valid, False if corrupted
    ptcpu)	frameworkdeviceNTz0Corrupted safetensors file detected: %s - %s: %sF)
safetensors	safe_openr   keysr   r   ri   type__name__r   )rx   rN   r   r   r   r   r   _validate_safetensors_file  s"   

r&  c              
   C   sb   zddl }|j| dddd W dS  ty0 } ztd| t|jt| W Y d}~dS d}~ww )a  
    Validate that a PyTorch .bin file is readable and not corrupted.

    This catches corruption issues like truncated downloads or invalid archives
    that would cause errors like:
    "RuntimeError: PytorchStreamReader failed reading file data/X: invalid header
    or archive is corrupted"

    Args:
        file_path: Path to the .bin file

    Returns:
        True if the file is valid, False if corrupted
    r   Nr  TF)map_locationweights_onlymmapz0Corrupted PyTorch bin file detected: %s - %s: %s)torchrH   r   r   ri   r$  r%  r   )rx   r*  r   r   r   r   _validate_pytorch_bin_file  s   r+  c                 C   s  dd t | D }|sdS |D ]}t j| |}t j|rgt j|sgzt j|}t | t	d| t j|rBt | W n t
y\ } ztd|| W Y d}~nd}~ww dd| d	f  S zft|}t|}W d   n1 s|w   Y  |d
i }|sW qt| }	g }
|	D ]}t j| |}t j|s|
| q|
rddt|
 d| d|
dd  t|
dkrdnd fW   S W q ty } zt	d|| dd| dfW  Y d}~  S d}~w t
y	 } zt	d|| W Y d}~qd}~ww dS )ad  
    Check if all files listed in safetensors index files actually exist on disk.

    This catches cases where the snapshot directory exists but files are missing
    (e.g., due to incomplete downloads or corrupted cache).

    Args:
        snapshot_dir: Path to the model snapshot directory

    Returns:
        Tuple of (all_exist, error_message)
    c                 S   s   g | ]	}| d r|qS ).safetensors.index.json)endswithr@   rN   r   r   r   
<listcomp>  s
    
z,_check_index_files_exist.<locals>.<listcomp>r   z/Removed broken index symlink: %s (blob missing)z&Failed to remove broken symlink %s: %sNFzBroken index file symlink: z (cleaned up, will re-download)r   zMissing z file(s) from index : r   z... z Failed to read index file %s: %szIndex file z unreadable (will re-download))r   listdirr   r1   islinkr   r!   rk   r   ri   r   errorrF   rG   rH   r0   r   r  r   r   FileNotFoundError)r   index_files
index_filer
  	blob_pathr   rN   r  r   rQ   r   ry   rx   r   r   r   _check_index_files_exist  sn   




4

r9  c                 C   s  t | \}}|sd|g fS td}i }|D ]Q}tj|}||}|rg|d}	|d}
|d}|	 d|
 d| }||vrN|	t|
|g g d||< t|d	}|| d
 	| || d 	| qg }|
 D ]\}}|d }t|d
 }ttd|d }|| }|rdd| dt| g f  S |d dkr|d D ]}t|s|	| qn|d dkr|d D ]}t|s|	| q|d dkrtj| |d  d}tj|sddtj| g f  S qn|rdddd |D  |fS ddg fS )a`  
    Validate that all model shards are present and not corrupted.

    Args:
        snapshot_dir: Path to the model snapshot directory
        weight_files: List of weight file paths

    Returns:
        Tuple of (is_valid, error_message, corrupted_files)
        - corrupted_files: List of file paths that are corrupted (for selective cleanup)
    Fz'(.*?)-(\d+)-of-(\d+)\.(safetensors|bin)r   r      r   r   )r  r   r[   r   filesr\   r   r;  r   zMissing shards in r0  r[   r!  binr  r,  zMissing index file: zCorrupted shard files: c                 S   s   g | ]}t j|qS r   )r   r   r`   r.  r   r   r   r/    s    z+_validate_sharded_model.<locals>.<listcomp>TN)r9  r  r  r   r   r`   r  r  r  r   r   r   r  r  r&  r+  r1   r   )r   r   index_check_validindex_errorr  r  rN   r  r  r  total_shards_strr[   r  r  corrupted_filesr  r  r   r  r  r7  r   r   r   r   _  s|   









r   r@  c                 C   s  d}|D ]o}zPt j|r<t j|}t | tdt j| t j|r7t | tdt j| |d7 }nt j|rUt | tdt j| |d7 }W q t	ys } zt
dt j|| W Y d}~qd}~ww |dkrtd||  |S )	a  
    Selectively remove corrupted files and their blobs to force re-download.

    This is more efficient than removing the entire model cache as it only
    re-downloads corrupted files rather than the entire model.

    Args:
        model_name_or_path: Model identifier
        corrupted_files: List of corrupted file paths (symlinks in snapshot)

    Returns:
        Number of files successfully cleaned up
    r   zRemoved corrupted symlink: %szRemoved corrupted blob: %sr   zRemoved corrupted file: %sz&Failed to remove corrupted file %s: %sNzNRemoved %d corrupted file(s) for %s. These will be re-downloaded on next load.)r   r   r3  r!   rk   r   infor`   r   r   r4  ri   )r   r@  cleaned_countrx   r8  r   r   r   r   "_cleanup_corrupted_files_selective  sF   




rC  reasonc              
   C   sx   t jt j|dd}ztd| || t| td W dS  t	y; } zt
d|| W Y d}~dS d}~ww )af  
    Remove entire corrupted model cache directory to force a clean re-download.

    This is used when we cannot selectively clean (e.g., missing shards, incomplete
    downloads with unknown affected files).

    Args:
        model_name_or_path: Model identifier
        snapshot_dir: Path to the snapshot directory
        reason: Reason for cleanup
    ..z.Removing entire cache for %s at %s. Reason: %sz.Successfully removed corrupted cache directoryzRFailed to remove corrupted cache directory %s: %s. Manual cleanup may be required.N)r   r   abspathr1   r   ri   shutilrmtreerA  r   r4  )r   r   rD  repo_folderr   r   r   r   _cleanup_corrupted_model_cache  s$   
rJ  found_local_snapshot_dirlocal_weight_filesc              	   C   s  t jt j|dd}t j|d}g }t j|r%tt j|d}|rFttdt	| d| d|  d t
| |dt	| d	 d
S |rt||\}}}|sz|rlttdt	| d|  d| d t| | d
S ttd|  d| d d
S |D ]C}	t j|	}
|
dv rt|	sttd|
 d|  d t| |	g  d
S q||
dv rt|	sttd|
 d|  d t| |	g  d
S q|dS )a  
    CI-specific validation and cleanup for local model snapshots.

    This function validates the local snapshot and performs automatic cleanup
    if corruption or missing files are detected. This behavior is only appropriate
    for CI environments where we want automatic recovery.

    Args:
        model_name_or_path: Model identifier for logging
        found_local_snapshot_dir: Path to the local snapshot directory
        local_weight_files: List of weight file paths found in the snapshot

    Returns:
        True if the snapshot is valid and can be used, False if it was invalid
        and cleanup was performed (caller should re-download)
    rE  blobs*.incompletezFound z .incomplete files in r   z . Will clean up and re-download.zIncomplete download detected (z incomplete files)Fz corrupted file(s) for r0  z:. Will selectively clean and re-download only these files.zValidation failed for z). Will attempt to download missing files.)zmodel.safetensorszpytorch_model.safetensorszadapter_model.safetensorszCorrupted model file z3. Will selectively clean and re-download this file.)zpytorch_model.binz	model.binzadapter_model.binT)r   r   rF  r1   r    r   r   r   r   r   rJ  r   rC  r`   r&  r+  )r   rK  rL  rI  	blobs_dirincomplete_filesr   r   r@  rN   r  r   r   r   &ci_validate_and_cleanup_local_snapshot!  sx   


rQ  	hf_folderallow_patternsc              	      s   g }|D ]}| ttj | q|sdS g }|D ]3}|dr7tj|r7t|s6|	tj
| q|drOtj|rOt|sO|	tj
| q|rlt| fdd|D  ttd| d| d d	S dS )
at  
    Validate downloaded weight files to catch corruption early.

    This function validates safetensors files after download to catch
    corruption issues (truncated downloads, network errors, etc.) before
    model loading fails with cryptic errors. If corruption is found,
    the corrupted files are automatically cleaned up.

    Args:
        hf_folder: Path to the downloaded model folder
        allow_patterns: Patterns used to match weight files
        model_name_or_path: Model identifier for error messages

    Returns:
        True if all files are valid, False if corrupted files were found and cleaned up
    Tz.safetensorsz.binc                    s   g | ]	}t j |qS r   )r   r   r1   r.  rR  r   r   r/    s    z4_validate_weights_after_download.<locals>.<listcomp>z)Downloaded model files are corrupted for r0  z=. The corrupted files have been removed. Will retry download.F)extendr   r   r   r   r1   r-  r   r&  r   r`   r+  rC  r   r   )rR  rS  r   r   r   r@  rN   r   rT  r    _validate_weights_after_download  s8   rV  	cache_dirc                 C   s   t |   dd }z,ddl}|p|jj}tj	|r8tj
|d}tj|dd tj
|d| dW S W n	 tyB   Y nw tj	d	rNd
| S d| S )aM  
    Generate a unique lock file path for download coordination.

    In CI environments where multiple containers share an NFS-mounted HF cache,
    the lock file is placed on the shared cache directory so ALL containers
    coordinate on the same lock. This prevents cross-container .incomplete
    file race conditions.

    Falls back to /dev/shm (container-local) for non-CI or when the cache
    dir is not accessible.

    Args:
        model_name_or_path: Model identifier
        cache_dir: HF cache directory (None to use default)

    Returns:
        Path to the lock file
    N   r   z.sglang_locksTr-   	download_z.lockz/dev/shmz/dev/shm/sglang_download_lock_z/tmp/sglang_download_lock_)r#   r$   r%   r&   r   r   HF_HUB_CACHEr   r   r    r1   r2   r   )r   rW  key_hashr   effective_cache_dirlock_dirr   r   r   _get_lock_file_path  s   

r^  c           
      C   s(  zzddl }|p
|jj}|jjdg| d}tj||d}tj|s)W dS t	
tj|d}d}|D ]4}zt| |d7 }tdtj| W q7 tyk }	 ztd	tj||	 W Y d}	~	q7d}	~	ww |dkrxtd
|| | |W S  ty }	 ztd|	 W Y d}	~	dS d}	~	ww )a  
    Remove stale .incomplete files from the model's blobs directory.

    This is lighter than _cleanup_corrupted_model_cache (which deletes the
    entire cache). We only remove .incomplete files so snapshot_download
    starts fresh on retry, preserving any successfully downloaded blobs.

    Args:
        model_name_or_path: Model identifier (e.g., "meta-llama/Llama-2-7b-hf")
        cache_dir: HF cache directory (None to use default)

    Returns:
        Number of .incomplete files removed
    r   Nmodelsr   rM  rN  r   zRemoved incomplete blob: %sz'Failed to remove incomplete blob %s: %sz.Cleaned up %d .incomplete blob(s) for %s in %sz'Failed to clean up incomplete blobs: %s)r   r   rZ  REPO_ID_SEPARATORr1   r   r   r   r    r   r   rk   r   r   r`   rt   ri   r   )
r   rW  r   r\  repo_folder_namerO  rP  removedrN   r   r   r   r   _cleanup_incomplete_blobs  sF   
rc  r   max_retriesc                 C   s  ddl }ddl}ddlm} ddlm}	 G dd d|	}
t| |}tdt	
 | |j|dd	d
}tdt	
 |  |
 tdt	
 |  z&ddlm} || |||}|durmtdt	
 | |W W  d   S W n ty } ztd| W Y d}~nd}~ww t| |}|dkrtd||  d}t|D ]}z|| ||||
||jjdd}W nW ttfy } zHtdt	
 |d || t|j| ||d k rdd|  }td| t| | t| W Y d}~qtd|  d| dt|j d| |d}~ww t||| }|r|  W  d   S ||d k r4ttd|  d|d  d| d qtd|  d| d |W  d   S 1 sLw   Y  dS )!a  
    CI-specific download with validation and automatic retry on corruption.

    This function handles the download of model weights in CI environments,
    with automatic validation and retry logic for handling corrupted downloads.

    Uses filelock.FileLock on the shared HF cache directory to coordinate
    downloads across all processes AND all containers sharing the same
    NFS-mounted cache. Only one process downloads at a time; others wait
    for the lock then use the cached result.

    Args:
        model_name_or_path: The model name or path
        allow_patterns: The allowed patterns for weight files
        ignore_patterns: The patterns to filter out weight files
        cache_dir: The cache directory to store model weights
        revision: The revision of the model
        max_retries: Maximum number of download retries if corruption is detected

    Returns:
        str: The path to the downloaded model weights

    Raises:
        RuntimeError: If download fails after max_retries attempts
    r   N)snapshot_download)tqdmc                       s   e Zd Z fddZ  ZS )z;ci_download_with_validation_and_retry.<locals>.DisabledTqdmc                    s   d|d< t  j|i | d S )NTdisable)super__init__)selfargskwargs	__class__r   r   ri  L  s   zDci_download_with_validation_and_retry.<locals>.DisabledTqdm.__init__)r%  
__module____qualname__ri  __classcell__r   r   rm  r   DisabledTqdmK  s    rr  z,[CI Download] Process %d using lock file: %si  )timeoutrX   z7[CI Download] Process %d waiting to acquire lock for %sz-[CI Download] Process %d ACQUIRED lock for %s)$_find_local_hf_snapshot_dir_unlockedzf[CI Download] Process %d found cached model after acquiring lock (downloaded by another container): %sz>[CI Download] Re-check for cached model failed (non-fatal): %szO[CI Download] Pre-download cleanup: removed %d stale .incomplete file(s) for %sr   )rS  ignore_patternsrW  
tqdm_classr	   local_files_onlymax_workerszJ[CI Download] Process %d hit download error (attempt %d/%d) for %s: %s: %s
   r\   zB[CI Download] Cleaning up .incomplete files and retrying in %ds...zDownload failed for z after z. attempts due to download errors. Last error: r0  zRetrying download for z
 (attempt r   z)...z/Downloaded model files are still corrupted for zm attempts. This may indicate a persistent issue with the model files on Hugging Face Hub or network problems.)filelockr   r   re  	tqdm.autorf  r^  r   rA  r   getpidFileLock$sglang.srt.model_loader.weight_utilsru  r   r   rc  r  r   r   r5  rt   ri   r$  r%  timesleepRuntimeErrorrV  r   )r   rS  rv  rW  r	   rd  r{  r   re  rf  rr  lock_file_pathlockru  cached_pathr   cleanedrR  attemptbackoffr   r   r   r   %ci_download_with_validation_and_retry%  s   !
	




!c&r  
model_pathc              
   C   s  ddl m} | sdS tj| rdS zddl}|jj}tj||jj	dg| 
d}tj|s7W dS tj|d}tj|sGW dS g }t|D ]N}tj||}tj|s^qNttj|d}	|	D ]}
tj|
ssqjt|
s|||
 qjttj|d}|D ]}tj|sqt|s|| qqN|rtd	t||  t| | W dS W dS  ty } ztd
| W Y d}~dS d}~ww )a  
    Validate and clean corrupted safetensors files in HF cache before loading.

    This function is needed because HFRunner (used in tests) calls transformers'
    from_pretrained() directly, which bypasses SGLang's weight validation.
    Corrupted cached files can cause cryptic errors like "EOF while parsing"
    from safetensors.

    Only runs in CI to avoid overhead for regular users.

    Args:
        model_path: Model identifier (e.g., "meta-llama/Llama-2-7b")
    r   )is_in_ciNr_  r   	snapshotsr   r   zRHFRunner: Found %d corrupted weight file(s) for %s. Removing to force re-download.z*HF cache validation failed (non-fatal): %s)sglang.utilsr  r   r   r    r   r   rZ  r1   r`  r   r2  r   r   r   r&  r   r+  r   ri   r   rC  r   r   )r  r  r   rW  rI  snapshots_dirr@  r<   r   r  sf_file	bin_filesbin_filer   r   r   r   ci_validate_and_clean_hf_cache  sj   

	r  r?   )NNF)r   )F)r   ):__doc__r   r   r#   rG   loggingr   r  rG  rd   r  typingr   r   r   r!  sglang.srt.utilsr   	getLoggerr%  r   rr   r   boolr   r)   r5   r6   rJ   rP   r   rl   rm   ru   rw   r}   r   r   r   r   r   r  r   r   r   r  r&  r+  r9  r   rC  rJ  rQ  rV  r^  rc  r  r  r   r   r   r   <module>   sT   

3%
5788
 S
m
N
}
3
 " R
a
>
#
l
9
+;
 7