o
    }oi@                     @   s  d dl Z d dlZd dlmZ d dlmZmZmZ d dlm	Z	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlm Z m!Z! ej"dkrfeZ#neZ#dd Z$eddG dd dZ%G dd de#Z&dS )    N)	dataclass)Path	PosixPathWindowsPath)OptionalUnion)io)NEMO_MODELS_CACHE)ADAPTER_META_FILENAME)RestoreConfig)logging)AppState)uninject_model_parallel_rank)import_multistorageclientis_multistorageclient_urlntc              
   C   s   ddl m} ddlm} z||d}W n ty/ } ztd|  | W  Y d }~S d }~ww t||r?|| _|j	| j	_| S td | S )Nr   )TokenizerSpec)load_contextzmodel.tokenizerz`Encountered error while trying to restore tokenizer. Tokenizer is not restored. Original error: zJCheckpoint does not have model.tokenizer field. Tokenizer is not restored.)
"nemo.collections.common.tokenizersr   nemo.lightning.ior   
ValueErrorr   warning
isinstance	tokenizer__io__)model	ckpt_pathr   r   r   e r   I/home/ubuntu/.local/lib/python3.10/site-packages/nemo/lightning/resume.py_try_restore_tokenizer(   s"   


r    T)kw_onlyc                   @   s   e Zd ZU dZdZee ed< dZee	 ed< dZ
ee	 ed< dZeed< dZeed< dZeed	< d
ZdefddZddeejejf fddZde	defddZdd Zdee fddZddeej dee fddZddeej dee fddZdS )
AutoResumeas  Class that handles the logic for setting checkpoint paths and restoring from
    checkpoints in NeMo.

    Attributes:
        restore_config (Optional[RestoreConfig]): Optional config for selectively restoring specific parts like model
            weights, optimizer states, etc.
            If the config contains a path from HF or another non-NeMo checkpoint format, the checkpoint will be
            automatically converted to a NeMo compatible format.
            resume_from_folder or the run's log_dir takes precedence over restore_config.
        resume_from_directory (str): Path to the checkpointing directory to restore from.
        resume_from_path (str): Path to a specific checkpoint to restore from.
        resume_if_exists (bool): Whether this experiment is resuming from a previous run. If
            True, it sets trainer._checkpoint_connector._ckpt_path so that the trainer should
            auto-resume. exp_manager will move files under log_dir to log_dir/run_{int}.
            Defaults to False.
        resume_past_end (bool): By default, AutoResume throws an error if resume_if_exists is
            True and a checkpoint matching ``*end.ckpt`` indicating a previous training run
            fully completed. Setting resume_past_end=True disables this behavior and loads the
            last checkpoint.
        resume_ignore_no_checkpoint (bool): AutoResume throws an error if resume_if_exists is
            True and no checkpoint could be found. Setting resume_ignore_no_checkpoint=True
            disables this behavior, in which case exp_manager will print a message and
            continue without restoring.
    Nrestore_configresume_from_directoryresume_from_pathFresume_if_existsresume_past_endresume_ignore_no_checkpointweightsreturnc                 C   s
   || j  S )zReturns the path to the weights directory within the specified path.

        Args:
            path: The checkpoint directory path

        Returns:
            Path: A Path object pointing to the weights directory
        )WEIGHTS_PATH)selfpathr   r   r   get_weights_pathb   s   
	zAutoResume.get_weights_pathtrainerc                 C   s  t |tjr
td| |}|r?||_||j_t| j	ddr=t |t
r1|jd }| s0|j}n| |}t||}dS dS | j	r| j| j	jd}t |t
rSJ dt|| j	_| j	|j_	| j	jrt |t
rot|jd }n|d }| sy|}t|| dS dS dS )a  Sets up checkpoint restoration for the Pytorch Lightning trainer.

        This method configures the trainer with the appropriate checkpoint path for resuming
        training and handles loading model artifacts like tokenizers when specified.

        Args:
            trainer: The PyTorch Lightning trainer or Fabric instance
            model: Optional model instance to load artifacts into

        Raises:
            NotImplementedError: If trainer is a Fabric instance (not yet supported)
        zFabric is not supported yet.load_artifactsFcontext)r-   z/AdapterPath is not supported for restore_configN)r   flFabricNotImplementedErrorget_trainer_ckpt_pathr   checkpoint_callbacklast_model_pathgetattrr#   AdapterPathbase_model_pathexistsget_context_pathr    _extract_pathr-   strstrategyr0   r   is_dir)r,   r/   r   trainer_ckpt_pathcontext_pathnew_pathr   r   r   setupm   s>   





zAutoResume.setupr-   c                 C   sR   d|v r| dsJ d|d\}}tjt|}n|}t|tr't|}|S )Nz://znemo://zDOnly NeMo based paths starting with nemo:// are currently supported.)	
startswithsplitosr-   joinr	   r   r>   r   )r,   r-   __pathrC   r   r   r   r=      s   
zAutoResume._extract_pathc                 C   sZ   t |d}t|}W d    n1 sw   Y  t|d }| s+| r+|j}|S )Nrmodel_ckpt_path)openjsonloadr   r@   r;   parent)r,   adapter_meta_pathr   fmetadatar:   r   r   r    _get_base_model_path_for_adapter   s   z+AutoResume._get_base_model_path_for_adapterc                    s@  ddl m}m} t }|j}d }| jr(t| jr"t }|| j}nt| j}n|d ur5tt|d }nd S dd t	|
dD }dd |D }	dd |D }
t|	}||	}t|}|dkri|dkritd	t|
}||
 t }|dkr|dkrtd
| rt|dkst dks| jrd| d}|d u r|d7 }t| |S | jrd S |d| dt|dkr| jstd|d  dt|dkrdt|d v r|d }|S td| d|S t dkrt fdddD r d }t|}|S t dd ddd }td  d |S  d }|S )Nr   )NotFoundError"_filter_out_unfinished_checkpointscheckpointsc                 S   s   g | ]}|  r|qS r   )r@   .0dr   r   r   
<listcomp>   s    z6AutoResume._find_trainer_ckpt_path.<locals>.<listcomp>*c                 S      g | ]	}| d r|qS )z*endmatchrX   r   r   r   r[          c                 S   r]   )z*lastr^   rX   r   r   r   r[      r`   zEnd checkpoint is unfinished and cannot be used to resume the training. Please remove the checkpoint manually to avoid unexpected cosequences, such as restarting from scratch.aJ  Last checkpoint is unfinished and cannot be used to resume the training. Please remove the checkpoint manually to avoid unexpected cosequences, such as restarting from scratch. Hint: Iteration number can be added to the checkpoint name pattern to maximize chance that there is at least one finished last checkpoint to resume from.z]There were no checkpoints found in checkpoint_dir or no checkpoint folder at checkpoint_dir :z. zTraining from scratch.z. Cannot resume.zFound z= indicating that the last training run has already completed.   mp_rankzMultiple checkpoints z that matches *end.ckpt.c                    s    g | ]}|t  d  v r|qS )r   )r>   )rY   slast_checkpointsr   r   r[     s     )rb   tp_rank
fsdp_shardc                 S   s
   |   jS N)lstatst_mtime)pthr   r   r   <lambda>  s   
 z4AutoResume._find_trainer_ckpt_path.<locals>.<lambda>T)keyreversezA matches *last.ckpt. Selecting one with the latest modified time.)nemo.utils.exp_managerrU   rV   r   log_dirr$   r   r   r   listgloblenr   r;   r(   r   r   r#   r'   r>   anyr   sorted)r,   rU   rV   	app_staterp   
checkpointmsccheckpoint_dirdist_checkpointsend_dist_checkpointslast_dist_checkpointsend_chkpt_cntend_checkpointsfinished_end_chkpt_cntlast_chkpt_cntfinished_last_chkpt_cntwarnr   rd   r   _find_trainer_ckpt_path   s   
 
$
z"AutoResume._find_trainer_ckpt_pathr   c                 C   s<   d}t  }| j|_| jr|  }|r|d }| r|}|S )a  Retrieves the path to the context directory of a checkpoint.

        The context directory contains serialized objects like tokenizers. This method
        handles both cases where the context is directly in the checkpoint directory
        or in a subdirectory called "context".

        Args:
            model: Optional model instance

        Returns:
            Optional[Path]: Path to the context directory if found, None otherwise
        Nr1   )r   r&   restorer   r@   )r,   r   rw   rv   maybe_context_pathr   r   r   r<     s   zAutoResume.get_context_pathc           	      C   s   | j r<t| j rt }|| j }nt| j }| |}| r9|t }| r7| ||}t	t| j |dS |S | j S d}t
 }| j|_| jrL|  }|rY| |}| rY|}|rq|t }| ro| ||}t	||dS |S dS )av  Resolves the path to a checkpoint for resuming training.

        This method handles various checkpoint sources with the following priority:
        1. Explicit path specified in resume_from_path
        2. Automatic discovery in the checkpoint directory when resume_if_exists=True

        For adapter checkpoints (PEFT), it also retrieves the base model path from metadata.

        Args:
            model: Optional model instance

        Returns:
            Optional[Path]: Path to the checkpoint if found, or AdapterPath for PEFT checkpoints,
                           or None if no checkpoint is found or needed
        r:   N)r%   r   r   r   r.   r@   r
   r;   rT   r9   r   r&   r   r   )	r,   r   rx   r%   maybe_weights_pathrQ   r:   rw   rv   r   r   r   r5   /  s:   



z AutoResume.get_trainer_ckpt_pathrh   ) __name__
__module____qualname____doc__r#   r   r   __annotations__r$   r>   r%   r&   boolr'   r(   r+   r   r.   r   plTrainerr2   r3   rD   BasePathr=   rT   r   r   ConnectorMixinr<   r5   r   r   r   r   r"   >   s    
 2]"r"   c                       sF   e Zd ZU dZee ed< dddee f fddZdd Z  Z	S )	r9   zPath object for adapter paths which include a field for the base model the adapters are trained on
    to facilitate model loading.r:   Nr   c                   s$   t  j| g|R i |}||_|S rh   )super__new__r:   )clsr:   argskwargsoutput	__class__r   r   r   o  s   zAdapterPath.__new__c                 C   s   d | jj|  | jS )Nz{}({!r}, base_model_path={}))formatr   r   as_posixr:   )r,   r   r   r   __repr__t  s   zAdapterPath.__repr__)
r   r   r   r   r   r   r   r   r   __classcell__r   r   r   r   r9   i  s
   
 r9   )'rN   rG   dataclassesr   pathlibr   r   r   typingr   r   lightning.fabricfabricr2   lightning.pytorchpytorchr   nemo.lightningr   nemo.lightning.baser	   nemo.lightning.ckpt_utilsr
   'nemo.lightning.pytorch.strategies.utilsr   
nemo.utilsr   nemo.utils.app_stater   nemo.utils.model_utilsr   nemo.utils.msc_utilsr   r   namer   r    r"   r9   r   r   r   r   <module>   s0   
  ,