o
    ॵi+                     @   sn   d dl Z d dlZd dlZd dlmZ d dlmZmZmZ d dl	m
Z
 d dlmZ d dlmZ G dd dZdS )	    N)	Pipelines)load_checkpointsave_checkpointsave_configuration)	ModelFile)
get_logger	is_masterc                   @   s   e Zd ZdZdZdd Zedd Zedd Z			
dddZ	dd Z
dd Zdd Zdd Zdd Zdd Zedd Zdd Zedd Zd	S ) CheckpointProcessorz_trainer_state.pthz.pthc                 C   sD   |j }|d dd ttD v rd|d i|d< | |||d dS )zPrepares the output of target folder.

        This is a strategic function which can be registered by other hook's function.

        Args:
            trainer: The trainer instance.
            output_dir: The target folder used in inference.
        taskc                 S   s    g | ]}| d stt|qS )__)
startswithgetattrr   ).0attr r   m/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/trainers/hooks/checkpoint/checkpoint_processor.py
<listcomp>!   s    z6CheckpointProcessor.prepare_output.<locals>.<listcomp>typepipelinez*.binN)cfgdirr   copy_files_and_dump_config)selftrainer
output_dirconfigr   r   r   prepare_output   s   	z"CheckpointProcessor.prepare_outputc                 C   s   |  | j}G dd d}dD ]$}|d| dur"|d| |d| dur3|d| q|||}t|d	rK|j||d
d |j|d | jdurZ| jj||j|d | jduri| jj||j|d |	  dS )z[Copy useful files to target output folder and dumps the target configuration.json.
        c                   @   s$   e Zd Zdd Zdd Zdd ZdS )zBCheckpointProcessor.copy_files_and_dump_config.<locals>.SaveConfigc                 S   s   || _ || _d S N)r   r   )r   r   r   r   r   r   __init__2   s   
zKCheckpointProcessor.copy_files_and_dump_config.<locals>.SaveConfig.__init__c                 S   s
   || _ d S r   )r   )r   _output_dir_configr   r   r   __call__6   s   
zKCheckpointProcessor.copy_files_and_dump_config.<locals>.SaveConfig.__call__c                 S   s   t | j| j d S r   )r   r   r   )r   r   r   r   save_config9   s   zNCheckpointProcessor.copy_files_and_dump_config.<locals>.SaveConfig.save_configN)__name__
__module____qualname__r   r"   r#   r   r   r   r   
SaveConfig0   s    r'   )push_to_hubhub_repo_id	hub_tokenprivate_hubztrain.checkpoint.period.Nztrain.checkpoint.periodztrain.checkpoint.best.ztrain.checkpoint.bestsave_pretrainedc                  _   s   d S r   r   )argskwargsr   r   r   <lambda>L   s    z@CheckpointProcessor.copy_files_and_dump_config.<locals>.<lambda>)save_functionr   save_config_function)r1   )
unwrap_modulemodelsafe_getpophasattrr,   r   train_preprocessoreval_preprocessorr#   )r   r   r   bin_filer3   r'   pop_keysave_config_fnr   r   r   r   *   sD   



z.CheckpointProcessor.copy_files_and_dump_configc                 C   s,   t j}t| drt jt| jv rt j}|S )zGet bin file path.
        	model_dir)r   TORCH_MODEL_BIN_FILEr6   TORCH_MODEL_FILEoslistdirr<   )r3   default_bin_filer   r   r   	_bin_file\   s   zCheckpointProcessor._bin_fileNTc           	      C   sJ   | |j}| |\}}| ||||| | || | ||| dS )a  Save the state dict for trainer and model.

        This is a strategic function which can be registered by other hook's function.

        Args:
            trainer(`EpochBasedTrainer`): The trainer instance.
            checkpoint_path_prefix(`str`): The saving dir with a prefix.
                like: /tmp/test/epoch_0
            output_dir(`str`): The output dir for inference.
            meta: (`dict`): The meta info needed to be saved into files.
            save_optimizers: (`bool`): Do save the optimizers state
        N)r2   r3   _get_state_file_namesave_trainer_statesave_model_statelink)	r   r   checkpoint_path_prefixr   metasave_optimizersr3   _model_file_train_state_filer   r   r   save_checkpointsg   s   z$CheckpointProcessor.save_checkpointsc                 C   sB   |  |\}}tj|rt| tj|rt| dS dS )aB  Remove obsolete checkpoint files.

        This is a strategic function which can be registered by other hook's function.

        Args:
            trainer(`EpochBasedTrainer`): The trainer instance.
            checkpoint_path_prefix(`str`): The saving dir with a prefix.
                like: /tmp/test/epoch_0
        N)rC   r?   pathisfileremove)r   r   rG   rJ   rK   r   r   r   remove_checkpoints   s   

z&CheckpointProcessor.remove_checkpointsc                 C   s   t  S )a  Used in ddp or other distributed training scenario, returns whether do saving in current rank.

        This is a strategic function which can be registered by other hook's function.

        Args:
            trainer(`EpochBasedTrainer`): The trainer instance.
        r   )r   r   r   r   r   should_save_on_rank   s   z'CheckpointProcessor.should_save_on_rankc                 C   s   |  |}tj||}tj|rt| z	t|| W dS  tyI } zt 	d| d| d| d t
|| W Y d}~dS d}~ww )zLinks the src bin file to the output folder.

        Args:
            model: The model instance.
            src_file: The src bin file path.
            output_dir: The target folder used in inference.
        zLink z to z error: z>, changing to copy the bin file, this may use more disk space.N)rB   r?   rM   joinrN   unlinkrF   OSErrorr   errorshutilcopyfile)r   r3   src_filer   r9   	dest_fileer   r   r   rF      s   
	
zCheckpointProcessor.linkc                 C   s,   t |||r|jnd|r|jnd|dd dS )at  Save the trainer state, including optimizer/lr_scheduler's state dict, random states etc.

        Args:
            trainer: The trainer instance.
            model: The model instance.
            train_state_file: The target file name for saving trainer states.
            meta: Some extra meta info.
            save_optimizers: Save optimizers state or not.
        NF)rH   
with_model)r   	optimizerlr_scheduler)r   r   r3   train_state_filerH   rI   r   r   r   rD      s   
z&CheckpointProcessor.save_trainer_statec                 C   s   t ||ddddd dS )zSave the model state.

        Args:
            model: The model instance.
            model_file: The target file name for saving model states.
        NF)rH   	with_meta)r   )r   r3   
model_filer   r   r   rE      s   
z$CheckpointProcessor.save_model_statec                 C   sP   |  |\}}i }tj|r| |||}ntd| d | ||| |S )al  Load checkpoint files of trainer state and model state.

        This is a strategic function which can be registered by other hook's function.

        Args:
            checkpoint_path_prefix(str): The checkpoint dir with prefix or a model state file.
                Example: '/tmp/test/epoch_0' or '/tmp/test/epoch_0.pth'
            trainer(`EpochBasedTrainer`): The trainer instance.
            load_all_state(`boolean`): Load all states (else load only module states).
            strict(`boolean`): If strict, any unmatched keys will cause an error.

        Returns:
            The meta info in json.
        zNo trainer state file z found, skip.)rC   r?   rM   rN   load_trainer_stateprintload_model_state)r   rG   r   load_all_statestrictrJ   rK   rH   r   r   r   load_checkpoints   s   z$CheckpointProcessor.load_checkpointsc                 C   s6   |rt | ddnd}|rt | ddnd}t|d||S )z!Load trainer state file.
        r\   Nr]   )r   r   )r   r^   rd   r\   r]   r   r   r   ra      s   z&CheckpointProcessor.load_trainer_statec                 C   s   t |||jddS )zLoad model state file.
        N)r   r2   r3   )r   r   r`   re   r   r   r   rc      s   z$CheckpointProcessor.load_model_statec                 C   sl   t j| \}}t|dkstd|dd r"| tj | tj fS | |tj	dd  d |dd  fS )a  Get the default file name for state files.

        If the input is a checkpoint dir with prefix, this function will append suffix for both checkpoint files.
        If the input is an absolute file name, this function will return it as the model file name, and append
            suffix for the trainer file name.

        NOTE: a best checkpoint filename with float or int metric value inside
            will not be judged as having a extension file name. like: '/tmp/test/epoch_0_accuracy0.85'

        Args:
            checkpoint_path_prefix(`str`): The checkpoint dir with prefix or a model state file
            with extension file name. like: '/tmp/test/epoch_0'

        Returns:
              A tuple of model state file name and trainer state file name.
        r   z^\d+$   N.)
r?   rM   splitextlenrematchr
   MODEL_STATE_SUFFIXTRAINER_STATE_SUFFIXsplit)rG   baseextr   r   r   rC      s    

z(CheckpointProcessor._get_state_file_name)NT)r$   r%   r&   rn   rm   r   staticmethodr   rB   rL   rP   rQ   rF   rD   rE   rf   ra   rc   rC   r   r   r   r   r
      s,    
1





r
   )r?   rk   rV   modelscope.metainfor   modelscope.utils.checkpointr   r   r   modelscope.utils.constantr   modelscope.utils.loggerr   modelscope.utils.torch_utilsr	   r
   r   r   r   r   <module>   s   