o
    Xεi_                    @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
 d dlmZmZ d dlmZ d dlmZmZmZmZmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	l m!Z! d d
l"m#Z# d dl$m%Z% d dl&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z. d dl/m0Z0m1Z1m2Z2m3Z3m4Z4 d dl5m6Z6m7Z7m8Z8 d dl9m:Z:m;Z;m<Z<m=Z=m>Z> d dl?m@Z@mAZA d dlBmCZCmDZDmEZEmFZF eGdZHe< rd dlImJZJ eG dd deZKeG dd deZLG dd dZMdS )    N)nullcontext)	dataclassfield)	signature)CallableDictListTupleUnion)Coqpit)nn)DistributedDataParallel)
DataLoader)ping_training_run)TrainerCallback)KeepAveragecount_parametersget_experiment_folder_pathget_git_branchisimplementedremove_experiment_folderset_partial_state_dictto_cuda)copy_model_filesget_last_checkpointload_fsspecsave_best_modelsave_checkpoint)ConsoleLoggerDummyLoggerlogger_factory)get_optimizerget_scheduleris_apex_availableprint_training_envsetup_torch_training_env)cuda_meminfoshould_reduce_batch_size)get_rankinit_distributedrank_zero_logger_inforank_zero_onlytrainer)ampc                   @   s,  e Zd ZU dZeddZeed< edddidZeed	< ed
ddidZ	eed< edddidZ
eed< edddidZeed< edddidZeed< edddidZeed< edddidZeed< edddidZeed< edddidZeed< ed dd!idZeed"< eddd#idZeed$< ed%dd&idZeed'< ed(dd)idZeed*< ed dd+idZeed,< eddd-idZeed.< ed/dd0idZeed1< eddd2idZeed3< eddd4idZeed5< ed/dd6idZeed7< ed dd8idZeed9< eddd:idZeed;< ed<dd=idZeed>< ed?dd@idZ eedA< edddBidZ!eedC< edDddEidZ"eedF< edGddHidZ#eedI< edJddKidZ$eedL< edMddNidZ%eedO< edPddQidZ&e'edR< ed ddSidZ(eedT< edUddVidZ)e*e'e+e' f edW< edddXidZ,e*ee+e f edY< ee-ddZid[Z.e*e/e+e/ f ed\< eddd]idZ0e*ee+e f ed^< ee-dd_id[Z1e/ed`< edddaidZ2eedb< edddcidZ3eedd< ed ddeidZ4eedf< edddgidZ5eedh< edddiidZ6eedj< edkddlidZ7eedm< dS )nTrainerConfigu  Config fields tweaking the Trainer for a model.
    A ````ModelConfig```, by inheriting ```TrainerConfig``` must be defined for using 👟.
    Inherit this by a new model config and override the fields as needed.
    All the fields can be overridden from comman-line as ```--coqpit.arg_name=value```.

    Example::

        Run the training code by overriding the ```lr``` and ```plot_step``` fields.

        >>> python train.py --coqpit.plot_step=22 --coqpit.lr=0.001

        Defining a model using ```TrainerConfig```.

        >>> from trainer import TrainerConfig
        >>> class MyModelConfig(TrainerConfig):
        ...     optimizer: str = "Adam"
        ...     lr: float = 0.001
        ...     epochs: int = 1
        ...     ...
        >>> class MyModel(nn.module):
        ...    def __init__(self, config):
        ...        ...
        >>> model = MyModel(MyModelConfig())

    output)defaultoutput_pathNhelpzqURI to save training artifacts by the logger. If not set, logs will be saved in the output_path. Defaults to Noner0   metadata
logger_urirunz"Name of the run. Defaults to 'run'run_namez%Name of the project. Defaults to Noneproject_nameu   🐸Coqui trainer run.uI   Notes and description about the run. Defaults to '🐸Coqui trainer run.'run_description   zKPrint training stats on the terminal every print_step steps. Defaults to 25
print_stepd   zHPlot training stats on the logger every plot_step steps. Defaults to 100	plot_stepFzELog model parameters stats on the logger dashboard. Defaults to Falsemodel_param_statsz-Wandb entity to log the run. Defaults to Nonewandb_entitytensorboardzCLogger to use for the tracking dashboard. Defaults to 'tensorboard'dashboard_loggerTz7Save checkpoint on interrupt (Ctrl+C). Defaults to Truesave_on_interruptzgSave checkpoint to the logger every log_model_step steps. If not defined `save_step == log_model_step`.log_model_stepi'  z>Save local checkpoint every save_step steps. Defaults to 10000	save_step   z'Keep n local checkpoints. Defaults to 5save_n_checkpointsz*Save checkpoints locally. Defaults to Truesave_checkpointszDSave all best checkpoints and keep the older ones. Defaults to Falsesave_all_bestr   z4Wait N steps to save best checkpoints. Defaults to 0save_best_afterz;Target loss name to select the best model. Defaults to Nonetarget_lossz3Print eval steps on the terminal. Defaults to False
print_evalz4Wait N epochs before running the test. Defaults to 0test_delay_epochsz<Run evalulation epoch after training epoch. Defaults to Truerun_evalzgRun evalulation epoch after N steps. If None, waits until training epoch is completed. Defaults to Nonerun_eval_stepsncclz.Distributed backend to use. Defaults to 'nccl'distributed_backendztcp://localhost:54321z;Distributed url to use. Defaults to 'tcp://localhost:54321'distributed_urlz/Use mixed precision training. Defaults to Falsemixed_precisionfp16zkPrecision to use in mixed precision training. `fp16` for float16 and `bf16` for bfloat16. Defaults to 'f16'	precisioni  z+Number of epochs to train. Defaults to 1000epochs    z!Batch size to use. Defaults to 32
batch_size   z*Batch size to use for eval. Defaults to 16eval_batch_size        z:Gradient clipping value. Disabled if <= 0. Defaults to 0.0	grad_clipzTStep the scheduler after each epoch else step after each iteration. Defaults to Truescheduler_after_epochgMbP?z3Learning rate for each optimizer. Defaults to 0.001lrz%Optimizer(s) to use. Defaults to None	optimizerz&Optimizer(s) arguments. Defaults to {})default_factoryr4   optimizer_paramsz3Learning rate scheduler(s) to use. Defaults to Nonelr_schedulerz4Learning rate scheduler(s) arguments. Defaults to {}lr_scheduler_paramszhEnable/disable gradient scaler explicitly. It is enabled by default with AMP training. Defaults to Falseuse_grad_scalerzA bool that controls whether TensorFloat-32 tensor cores may be used in matrix multiplications on Ampere or newer GPUs. Default to False.
allow_tf32z1Enable/disable cudnn explicitly. Defaults to Truecudnn_enablezEnable/disable deterministic cudnn operations. Set this True for reproducibility but it slows down training significantly.  Defaults to False.cudnn_deterministiczqEnable/disable cudnn benchmark explicitly. Set this False if your input size change constantly. Defaults to Falsecudnn_benchmarki1  zRGlobal seed for torch, random and numpy random number generator. Defaults to 54321training_seed)8__name__
__module____qualname____doc__r   r1   str__annotations__r5   r7   r8   r9   r;   intr=   r>   boolr?   rA   rB   rC   rD   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rP   rQ   rR   rT   rU   rW   rY   r[   floatr\   r]   r
   r   r^   dictr`   r   ra   rb   rc   rd   re   rf   rg   rh    rs   rs   C/home/ubuntu/.local/lib/python3.10/site-packages/trainer/trainer.pyr.   @   s   
 $r.   c                   @   sJ  e Zd ZU dZedddidZeed< edddidZeed< eddd	idZ	eed
< edddidZ
eed< edddidZeed< edddidZeed< edddidZeed< edddidZeed< edddidZeed< edddidZeed< edddidZeed< edddidZeed < eddd!idZeed"< dS )#TrainerArgszTrainer arguments that can be accessed from the command line.

    Examples::
        >>> python train.py --restore_path /path/to/checkpoint.pth
     r2   zPath to a training folder to continue training. Restore the model from the last checkpoint and continue training under the same folder.r3   continue_pathz`Path to a model checkpoit. Restore the model with the given checkpoint and start a new training.restore_pathzyBest model file to be used for extracting the best loss. If not specified, the latest best model in continue path is used	best_pathFzVUse DDP in distributed training. It is to set in `distribute.py`. Do not set manually.use_ddpz/Use HF Accelerate as the back end for training.use_accelerate   z`Number of gradient accumulation steps. It is used to accumulate gradients over multiple batches.grad_accum_stepsz%Overfit a single batch for debugging.overfit_batchz/Skip training and only run evaluation and test.skip_train_epochzStart with evaluation and test.start_with_evalNzdOnly use a subset of the samples for debugging. Set the number of samples to use. Defaults to None. 	small_runzIGPU ID to use if ```CUDA_VISIBLE_DEVICES``` is not set. Defaults to None.gpur   z;Process rank in a distributed training. Don't set manually.rankz?Process group id in a distributed training. Don't set manually.group_id)ri   rj   rk   rl   r   rw   rm   rn   rx   ry   rz   rp   r{   r}   ro   r~   r   r   r   r   r   r   rs   rs   rs   rt   ru      s`   
 ru   c                %   @   s  e Zd Zddddddddddi di dfdededededdd	ejd
e	de	de
de
de
dededededeee	f deddf$ddZedd Zedd Zedd Zdd Zd d! Zed"d# Zd$d% Zedeee
f fd&d'Zeddd(defd)d*Zdd+efd,d-Zedded.edefd/d0Zed1d2 Zeded
e	dejfd3d4Zedede	dejfd5d6Z 	dded7ed	ejd8e!j"j#d9e!j$j%j&de'eje!j"j#e!j$j%j&ef fd:d;Z(d<d= Z)d	ejded>ed?ed@e
dAedBedefdCdDZ*ded@e
dAedefdEdFZ+ded@e
dAedefdGdHZ,ded@e
dAedefdIdJZ-dKe
defdLdMZ.ed8e!j"j#fdNdOZ/e	ddKed	ejdPejdQede'eef f
dRdSZ0dTedUefdVdWZ1	ddXedYedQedZe2fd[d\Z3dKed	ejdPejdedQef
d]d^Z4ededQefd_d`Z5d8e!j"j#fdadbZ6dce2d8e!j"j#d9ddfdedfZ7			gddKed	ejd8e!j"j#d9dddPejdhee!j"j8j9e
ef dedQedYediede'eeef fdjdkZ:dKedledmedne2de'eef f
dodpZ;ddqdrZ<	ddKed	ejdPejdQede'eef f
dsdtZ=dKedmede'eef fdudvZ>ddwdxZ?ddydzZ@d{d| ZAddd}d~ZBdddZCddddZDdddZEdddZFeGdddZHeGdddZIeGdddZJed	ejdedee!j"j#e
f fddZKed	ejdedee2e
e2 f fddZLed	ejded8ee!j"j#e
ef dee!j"j8j9e
f fddZMedhede
ef dededededede
f fddZNed	ejdejfddZOedXedefddZPdeQdefddZRdeddfddZSedefddZTdS )TrainerNTargsconfigr1   c_loggerrA   Loggermodel	get_modelget_data_samplestrain_sampleseval_samplestest_samplestrain_loadereval_loadertraining_assetsparse_command_line_args	callbacksr   returnc                 C   s  |r|  |\}}| |||\}}n|js|jr#| |i |\}}ni }|jr,|j}n|du r3|jn|}t|j|j}tj|dd t	||| || _
|| _|| _|| _|j| _|j| _|j| _|j| _| jdksnJ dtj| jd|j d}| | | j|||d\| _| _| | j|||\| _| _| jjs| jj| j_| jjs| jrd	| _d| _d| _d| _ d| _!t"d
| jjrt"d
ndd| _#d| _$d| _%d| _&d| _'d| _(| jj)r| jj*dkr| jn| jj+| _,|	dur|	| _-|
| _.|| _/n|dur
| 0||\| _-| _.| _/n	d| _-d| _.d| _/|| _$|| _&| 1|j2 |du r-|du r-t3d|dur6|| _4n| 5| j| t6| j4drI| j47  | 8| j4| _9| j:rht;dt< t=|j| j|j>| jj?| jj@ | jr| j4A  tB| j9tCr| j9D ]}tB|tDjEjFr|A  q{ntB| j9tDjEjFr| j9A  | G| j4| j| _H| jdkrtB| jHtCrt6| j4dst3dtI | _J| jJK| | jJL|  | j,r| jMrd| _NtOjP| j4| jHdd\| _4| _HtDjAjOQ | _Nnd| _N| j
jr| R| j|j| j4| jH| jN\| _4| _H| _N| _ | _!tDjAjOQ | _N| S| j4| j| jH| _T| U| jT| j
| j| j!| j | _T| j:r=tV| j4|jg|jd| _4| W  tX| j4}t;d| dt< | jJY|  | jZ| | [  t\  dS )ui  Simple yet powerful 🐸💬 TTS trainer for PyTorch. It can train all the available `tts` and `vocoder` models
        or easily be customized.

        Notes:

            Supports Automatic Mixed Precision training. If `Apex` is availabe, it automatically picks that, else
            it uses PyTorch's native `amp` module. `Apex` may provide more stable training in some cases.

        Args:

            args (Union[Coqpit, Namespace]): Training arguments parsed either from console by `argparse` or `TrainerArgs`
                config object.

            config (Coqpit): Model config object. It includes all the values necessary for initializing, training, evaluating
                and testing the model.

            output_path (str): Path to the output training folder. All the files are saved under thi path.

            c_logger (ConsoleLogger, optional): Console logger for printing training status. If not provided, the default
                console logger is used. Defaults to None.

            dashboard_logger Union[TensorboardLogger, WandbLogger]: Dashboard logger. If not provided, the tensorboard logger is used.
                Defaults to None.

            model (nn.Module, optional): Initialized and ready-to-train model. If it is not defined, `Trainer`
                initializes a model from the provided config. Defaults to None.

            get_model (Callable):
                A function that returns a model. It is used to initialize the model when `model` is not provided.
                It either takes the config as the only argument or does not take any argument.
                Defaults to None

            get_data_samples (Callable):
                A function that returns a list of training and evaluation samples. Used if `train_samples` and
                `eval_samples` are None. Defaults to None.

            train_samples (List):
                A list of training samples used by the model's `get_train_data_loader` to init the `dataset` and the
                `data_loader`. Defaults to None.

            eval_samples (List):
                A list of evaluation samples used by the model's `get_eval_data_loader` to init the `dataset` and the
                `data_loader`. Defaults to None.

            train_loader (DataLoader):
                A pytorch data loader object for training epochs. Leave as None if you want it to be made during training. Defaults to None.

            eval_loader (DataLoader):
                A pytorch data loader object for evaluation epochs. Leave as None to be generated during training. Defaults to None.

            test_samples (List):
                A list of test samples used by the model's `get_test_data_loader` to init the `dataset` and the
                `data_loader`. If None, the ```model.test_run()``` is expected to load the data. Defaults to None.

            training_assets (Dict):
                A dictionary of assets to be used at training and passed to the model's ```train_log(), eval_log(), get_data_loader()```
                during training. It can include  `AudioProcessor` or/and `Tokenizer`. Defaults to {}.

            parse_command_line_args (bool):
                If true, parse command-line arguments and update `TrainerArgs` and model `config` values. Set it
                to false if you parse the arguments yourself. Defaults to True.

            callbacks (Dict[str, Callable]):
                A dictionary of callbacks to be used during training. The keys are the callback names and the values

            gpu (int):
                GPU ID to use for training If "CUDA_VISIBLE_DEVICES" is not set. Defaults to None.

        Example::

            Running trainer with a model.

            >>> args = TrainerArgs(...)
            >>> config = ModelConfig(...)
            >>> model = Model(config)
            >>> trainer = Trainer(args, config, output_path, model=model)
            >>> trainer.fit()

            TODO:
                - Wrap model for not calling .module in DDP.
                - Deepspeed integration
                - Profiler integration.
                - Overfitting to a batch.
                - TPU training
        NT)exist_okr   z- [!] grad_accum_steps must be greater than 0.trainer_z_log.txt)r   r   r   Finf
train_loss	eval_lossrS   z0[!] `model` and `get_model` cannot both be None.init_for_trainingz > Using PyTorch DDPr|   optimizeu    [!] Coqui Trainer does not support grad_accum_steps for multiple-optimizer setup, please set grad_accum_steps to 1 or implement in your model a custom method called ´optimize` that need to deal with dangling gradients in multiple-optimizer setup!O1)	opt_level)
device_idsoutput_devicez
 > Model has z parameters)]
parse_argvinit_trainingrw   rx   r1   r   r7   osmakedirsr   r   r   r   r}   r~   r   r   pathjoinr   _setup_logger_configsetup_training_environmentuse_cudanum_gpusinit_loggersrA   r   rC   rD   rM   total_steps_doneepochs_donerestore_steprestore_epochrq   	best_lossr   test_loaderr   keep_avg_trainkeep_avg_evalrR   rT   rc   use_amp_scalerr   r   r   run_get_data_samplessetup_small_runr   
ValueErrorr   run_get_modelr   r   get_criterion	criterion
use_pt_ddpr*   loggerr)   r   rP   rQ   cuda
isinstancelisttorchr   Moduler!   r^   r   r   parse_callbacks_dicton_init_startuse_apexscalerr-   
initialize
GradScalerrestore_modelr"   	schedulerrestore_schedulerDDP_thsetup_accelerater   on_init_end
add_configsave_training_scriptr   )selfr   r   r1   r   rA   r   r   r   r   r   r   r   r   r   r   r   r   coqpit_overrides
new_fieldslog_filer   
num_paramsrs   rs   rt   __init__"  s   i
 












zTrainer.__init__c                 C   s   | j j o|  S )zReturn True if using APEX.)r   r{   _is_apex_availabler   rs   rs   rt   r   I     zTrainer.use_apexc                 C   s   | j dko| j S )z!Return True if using PyTorch DDP.r|   )r   r{   r   rs   rs   rt   r   N  r   zTrainer.use_pt_ddpc                 C   s   | j jS )z#Return True if using HF Accelerate.)r   r{   r   rs   rs   rt   r{   S  s   zTrainer.use_acceleratec              	   C   sL   | j r$| j| j| j| j| j| j| jj| jj	d\| _| _| _| _| _
d S d S )N)r   r^   training_dataloaderr   r}   rR   rT   )r{   init_accelerater   r^   r   r   r}   r   rR   rT   acceleratorr   rs   rs   rt   r   X  s   zTrainer.setup_acceleratec                 C   s   | j r	| j|S |S )z)Prepare the accelerator for the training.)r{   r   prepare_data_loader)r   data_loaderrs   rs   rt   prepare_accelerate_loaderd  s   z!Trainer.prepare_accelerate_loaderc              
   C   s  zddl m} W n ty } ztd|d}~ww |dur |n|r$dnd}	|	dkr-d}	n|	dkr4d}	n|	d	kr:d
}	|||	d}
t| tjjrL|
| } t|trb|	 D ]\}}|

|||< qUnt|trxt|D ]\}}|

|||< qkn	|dur|

|}t|tjjjr|
|}t|tr|	 D ]\}}|
|||< qnt|trt|D ]\}}|
|||< qn	|dur|
|}| ||||
fS )z%Setup HF Accelerate for the training.r   )Acceleratorz.Please install accelerate to use this feature.Nf16float16float8f8bfloat16bf16)gradient_accumulation_stepsrR   )
accelerater   ImportErrorr   r   r   r   prepare_modelrr   itemsprepare_optimizerr   	enumerateutilsdatar   r   prepare_scheduler)r   r^   r   r   r}   rR   rT   r   e
_precisionr   keyoptimischedrs   rs   rt   r   j  sN   








zTrainer.init_acceleratec                 C   s   t jd }tj|rJtj|}| jj||dd t|ddd}| j	d|
  d W d   n1 s7w   Y  t|tj| j| dS dS )	z?Save the training script to tracking dashboard and output path.r   file)file_or_dirnameartifact_typerutf8)encodingztraining-scriptN)sysargvr   r   isfilebasenamerA   add_artifactopenadd_textreadshutilcopyfiler   r1   )r   	file_path	file_namefrs   rs   rt   r     s   
zTrainer.save_training_scriptc                 C   sH   t | tr| jdd}n	t }|jdd}| \}}| | | |fS )zAParse command line arguments to init or override `TrainerArgs()`.rv   )
arg_prefix)r   r   init_argparseru   parse_known_args
parse_args)r   parsertrain_configtraining_argsr   rs   rs   rt   r     s   

zTrainer.parse_argvr   c                 C   s@   |du rt  n|}t dkrt |fS |du rt| |}||fS )a  Init console and dashboard loggers.
        Use the given logger if passed externally else use config values to pick the right logger.
        Return a dashboard logger only for the rank 0 process in DDP
        Define a console logger for each process in DDP

        Args:
            config (Coqpit): Model config.
            output_path (str): Output path to save the training artifacts.
            dashboard_logger (DashboardLogger): Object passed to the trainer from outside.
            c_logger (ConsoleLogger): Object passed to the trained from outside.

        Returns:
            Initialized dashboard_logger and console_logger objects.
        Nr   )r   r(   r   r    )r   r1   rA   r   rs   rs   rt   r     s   


zTrainer.init_loggersr   c                 C   sv   |dur9t d| | jdu rdn| jd| | _| jdu r dn| jd| | _| jdu r/dn| jd| | _dS dS )z=Use a subset of samples for training, evaluation and testing.Nz%[!] Small Run, only using %i samples.)r   infor   r   r   )r   r   rs   rs   rt   r     s   "zTrainer.setup_small_runr   c                 C   s   | j r,tj| j d| _t| j \| _}| js|| _|r#|| j n	t	 }|| j t
|dkr9|j|dd i }| jdkrM| jrH| j|d< t |d< ||fS )a  Initialize training and update model configs from command line arguments.

        Args:
            args (argparse.Namespace or dict like): Parsed trainer arguments.
            config_overrides (argparse.Namespace or dict like): Parsed config overriding arguments.
            config (Coqpit): Model config. If none, it is generated from `args`. Defaults to None.

        Returns:
            config (Coqpit): Config paramaters.
        zconfig.jsonr   T)relaxed_parserrx   github_branch)rw   r   r   r   config_pathr   rx   ry   	load_jsonr   lenr  r   r   )r   r   r   
best_modelcoqpitr   rs   rs   rt   r     s"   


zTrainer.init_trainingc              
   C   s   t  dkrdd l}||j}||jd|d f t| |j|j|j	| j
|j|j| jd u r0|n| jd\}}t| | ||fS )NWindowsr   i   r|   )r   re   rf   rg   rz   rh   rd   r   )platformsystemresource	getrlimitRLIMIT_NOFILE	setrlimitr%   re   rf   rg   rz   rh   rd   r   r$   )r   r   r   r  rlimitr   r   rs   rs   rt   r     s    

z"Trainer.setup_training_environmentc                 C   s*   t t|jjdkr|| }|S | }|S )zRun the `get_model` function and return the model.

        Args:
            config (Coqpit): Model config.

        Returns:
            nn.Module: initialized model.
        r|   )r  r   sig
parameters)r   r   r   rs   rs   rt   r     s
   
zTrainer.run_get_modelc                 C   sF   t |r!tt|jjdkr|| \}}||fS | \}}||fS dS )Nr|   NN)callabler  r   r"  r#  )r   r   r   r   rs   rs   rt   r     s   
zTrainer.run_get_data_samplesrx   r^   r   c              
   C   sH  dd }t dtj| t|dd}zEt d ||d  t d z	||d	 |}W n ttt	fy@   t d
 Y nw d|v rX| j
rX|d rXt d ||d |}W n# tt	tfy|   t d | }t||d |}|| ~Y nw | || j||}t d|d  |d d }	|d }
tj  ||||	|
fS )a3  Restore training from an old run. It restores model, optimizer, AMP scaler and training stats.

        Args:
            config (Coqpit): Model config.
            restore_path (str): Path to the restored training run.
            model (nn.Module): Model to restored.
            optimizer (torch.optim.Optimizer): Optimizer to restore.
            scaler (torch.cuda.amp.GradScaler, optional): AMP scaler to restore. Defaults to None.

        Returns:
            Tuple[nn.Module, torch.optim.Optimizer, torch.cuda.amp.GradScaler, int]: [description]
        c                 S   sj   t |trt| D ]\}}|| | q	|S t |tr.|  D ]\}}|| | q |S ||  |S N)r   r   r   load_state_dictrr   r   )statesobjidxstater   rs   rs   rt   _restore_list_objs:  s   


z1Trainer.restore_model.<locals>._restore_list_objsz > Restoring from %s ...cpumap_locationz > Restoring Model...r   z > Restoring Optimizer...r^   z7 > Optimizer is not compatible with the restored model.r   z > Restoring Scaler...z" > Partial model initialization...z > Model restored from step %istepr|   epoch)r   r  r   r   r  r   r'  KeyError	TypeErrorRuntimeErrorr   r   
state_dictr   
restore_lrr   r   r   empty_cache)r   r   rx   r   r^   r   r,  
checkpoint
model_dictr   r   rs   rs   rt   r   %  s:   





zTrainer.restore_modelc           	      C   s   |j sQt|tr#t|D ]\}}|jD ]}| ||| |d< qq|S t|trC| D ]\}}|jD ]}| ||| |d< q3q,|S |jD ]
}| |||d< qF|S )Nr]   )rw   r   r   r   param_groupsget_lrrr   r   )	r   r   r   r   r^   r*  r   group
optim_namers   rs   rt   r6  b  s    

	


zTrainer.restore_lrassetsis_evalsamplesverboser   c           	   	   C   sj   |dkrt |jdr|j||||||| jj}nt |dr)|j||||||d}t|dks3J d|S )Nr|   get_data_loader)r   r>  r?  r@  rA  r   r   uY    ❗ len(DataLoader) returns 0. Make sure your dataset is not empty or len(dataset) > 0. )r   modulerB  r   r   r  )	r   r   r   r>  r?  r@  rA  r   loaderrs   rs   rt   _get_loaderv  s(   


zTrainer._get_loaderc              	   C      | j dkr t| jjdr| jj| j| j||| j | jj}|S nt| jdr5| j| j| j||| j }|S | 	| j| j|d||| j S )a  Initialize and return a training data loader.
        Call ```model.get_train_data_loader``` if it is implemented, else call ```model.get_data_loader```
        and set ```is_eval=False```.

        Args:
            ap (AudioProcessor): Audio processor.
            samples (List): Data samples used for training.
            verbose (bool): enable/disable printing loader stats at initialization.

        Returns:
            DataLoader: Initialized training data loader.
        r|   get_train_data_loaderF)
r   r   r   rC  rG  r   r   r   r   rE  r   r   r@  rA  rD  rs   rs   rt   get_train_dataloader  4   
zTrainer.get_train_dataloaderc              	   C   rF  )a  Initialize and return a evaluation data loader.
        Call ```model.get_eval_data_loader``` if it is implemented, else call ```model.get_data_loader```
        and set ```is_eval=True```.

        Args:
            ap (AudioProcessor): Audio processor.
            samples (List): Data samples used for training.
            verbose (bool): enable/disable printing loader stats at initialization.

        Returns:
            DataLoader: Initialized training data loader.
        r|   get_eval_data_loaderT)
r   r   r   rC  rK  r   r   r   r   rE  rH  rs   rs   rt   get_eval_dataloader  rJ  zTrainer.get_eval_dataloaderc              	   C   rF  )a  Initialize and return a evaluation data loader.
        Call ```model.get_test_data_loader``` if it is implemented, else call ```model.get_data_loader```
        and set ```is_eval=True```.

        Args:
            ap (AudioProcessor): Audio processor.
            samples (List): Data samples used for training.
            verbose (bool): enable/disable printing loader stats at initialization.

        Returns:
            DataLoader: Initialized training data loader.
        r|   get_test_data_loaderT)
r   r   r   rC  rM  r   r   r   r   rE  rH  rs   rs   rt   get_test_dataloader  rJ  zTrainer.get_test_dataloaderbatchc                 C   s   z| j dkr| jj|}n| j|}W n	 ty   Y nw t|tr4| D ]
\}}t|||< q(nt|t	r@dd |D }z| j dkrP| jj
|}W |S | j
|}W |S  tyb   Y |S w )aA  Format the dataloader output and return a batch.

        1. Call ```model.format_batch```.
        2. Pass the batch to the Device.
        3. Call ```model.format_batch_on_device```.

        Args:
            batch (List): Batch returned by the dataloader.

        Returns:
            Dict: Formatted batch.
        r|   c                 S   s   g | ]}t |qS rs   )r   ).0vrs   rs   rt   
<listcomp>*  s    z(Trainer.format_batch.<locals>.<listcomp>)r   r   rC  format_batchNotImplementedErrorr   rr   r   r   r   format_batch_on_device)r   rO  krQ  rs   rs   rt   rS    s0   



zTrainer.format_batchc                 c   s&    | j D ]}|d D ]}|V  q
qdS )zGenerator over parameters owned by the optimizer.

        Used to select parameters used by the optimizer for gradient clipping.

        Args:
            optimizer: Target optimizer.
        paramsN)r:  )r^   r<  prs   rs   rt   master_params9  s   
	zTrainer.master_paramsr   optimizer_idxc                 C   s:   | |g}|dur| | t|dr|jj| S |j| S )as  
        Perform a trainig forward step. Compute model outputs and losses.

        Args:
            batch (Dict): [description]
            model (nn.Module): [description]
            criterion (nn.Module): [description]
            optimizer_idx (int, optional): [description]. Defaults to None.

        Returns:
            Tuple[Dict, Dict]: [description]
        NrC  )appendhasattrrC  
train_step)rO  r   r   rZ  
input_argsrs   rs   rt   _model_train_stepF  s   


zTrainer._model_train_steprR   rT   c                 C   st   d}t  }| jr1d}t j}|r-|dkrt j}||fS |dkr&t j}||fS td| ||fS |r6t j}||fS )Nr-  r   rS   r   u    ❗ Unknown precision )r   get_autocast_cpu_dtyper   float32r   r   r   )r   rR   rT   devicedtypers   rs   rt   _get_autocast_args^  s"   zTrainer._get_autocast_args	loss_dictstep_optimizer	grad_normc                 C   s\   |  |}|d ur"|d|d| < |r |d ur ||d| < |S |r,|d ur,||d< |S )Nlossloss_
grad_norm_rg  )_detach_loss_dictpop)r   re  rf  rZ  rg  loss_dict_detachedrs   rs   rt   detach_loss_dicto  s   
zTrainer.detach_loss_dictc           
      C   s   |  |j|j\}}tj|||jd. |d ur$| j||||d\}}	n| |||\}}	W d    ||	fS W d    ||	fS 1 sDw   Y  ||	fS )Ndevice_typerc  enabled)rZ  )rd  rR   rT   r   autocastr_  )
r   rO  r   r   r   rZ  rb  rc  outputsre  rs   rs   rt   _compute_loss  s   

zTrainer._compute_lossc                 C   sX   d}d| v r*| j d ur*|d ur'z| j | }W |S  ty&   td Y |S w | j }|S )NrZ   r[   zE [!] You are using multiple optimizers but `grad_clip` is not a list.)r[   r3  r   r  )r   rZ  r[   rs   rs   rt   _set_grad_clip_per_optimizer  s   z$Trainer._set_grad_clip_per_optimizerc                 C   s(   t jt jdd | |D ddddS )Nc                 S   s   g | ]}|j d qS ))gradview)rP  paramrs   rs   rt   rR    s    z.Trainer._compute_grad_norm.<locals>.<listcomp>r   )dim   )rX  )r   normcatrY  )r   r^   rs   rs   rt   _compute_grad_norm  s   (zTrainer._compute_grad_normr[   	AMPScalerc                 C   sR   |dur"|dkr"|r| | | j|  tjj| ||}|S | |}|S )zPerform gradient clippingNr   )	unscale_r   before_gradient_clippingr   r   r   clip_grad_norm_rY  r~  )r   r[   r^   r   rg  rs   rs   rt   _grad_clipping  s   

zTrainer._grad_clippingr|   r   num_optimizersc              	   C   s  t   }| j|||||d\}}|st   | }|i |fS | j||d}d}d}| j| | |d t| j |d< | jr| j	|\ |j
rK| jjnt}| @ | j|d  | |}| jjrs|durs|dkrs| j| | |  | jjs| jjs|  |jdd W d   n1 sw   Y  W d   n1 sw   Y  n| jr| jrt|d |}|  W d   n1 sw   Y  |	r| j||dd}n\||d   |	r| j|||d}| }|| |du s|d	 |
kr|  | |d
< || k}n#|d   |	r2| j|  |dkr.t j!j"| #||}|  |durF|rF| jjsF|	rF|  |	rO|jdd t$|t j%rdt &|sbt '|rdd}t   | }| (||	||}|||fS )al  Perform a forward - backward pass and run the optimizer.

        Args:
            batch (Dict): Input batch. If
            model (nn.Module): Model for training. Defaults to None.
            optimizer (Union[nn.optim.Optimizer, List]): Model's optimizer. If it is a list then, `optimizer_idx` must be defined to indicate the optimizer in use.
            scaler (AMPScaler): AMP scaler.
            criterion (nn.Module): Model's criterion.
            scheduler (torch.optim.lr_scheduler._LRScheduler): LR scheduler used by the optimizer.
            config (Coqpit): Model config.
            optimizer_idx (int, optional): Target optimizer being used. Defaults to None.
            step_optimizer (bool, optional): Whether step the optimizer. If False, gradients are accumulated and
                model parameters are not updated. Defaults to True.
            num_optimizers (int, optional): Number of optimizers. Defaults to 1.

        Raises:
            RuntimeError: When the loss is NaN.

        Returns:
            Tuple[Dict, Dict, int, torch.Tensor]: model outputs, losses, step time and gradient norm.
        )rO  r   r   r   rZ  )r   rZ  r   Trh  Nset_to_none)r[   r^   r   r|   
amp_scaler))timert  ru  r   before_backward_passrq   r}   r{   r   
accumulaterR   rr  r   backwardr~  sync_gradientsr  r#  r0  r   r\   optimizer_step_was_skipped	zero_gradr   r   r-   
scale_lossr  scale	get_scaleupdater  r   r   r   rY  r   Tensorisnanisinfrn  )r   rO  r   r^   r   r   r   r   rZ  rf  r  step_start_timers  re  	step_timer[   rg  update_lr_schedulerctx_mgrscaled_loss
scale_prevrm  rs   rs   rt   r     s   #







&
zTrainer.optimizebatch_n_stepsr0  loader_start_timec                 C   s0  | j |  | |}t | }d}i }t| jdrht }| | jj| jj	\}	}
t
j|	|
| jjd | j|| \}}W d   n1 sIw   Y  t | }|du rZdS | |ddd}|| nd}|d | j dkr{|d |kr{d}t| jts| j|| j| j| j| j| j| j|dd		\}}}|| nsdgt| j }d}t| jD ]W\}}| j}| j}d}| jdur| j| }| j|| j||||| j||t| jd	
\}}}||7 }|||< |dur| D ]\}}||v r||| d
| < q|||< q|}q|}|r| jjdd i }||d< ||d< | j| i }| D ]\}}||d| < q'| j| | j| jj dkri }t| jtrlt| jD ]\}}| j| jd d }|d| |i qQn3t| jtr| j D ]\}}| j| jd d }|d| |i qxn| jjd d }d|i}|| |t |dt |dd | j!"||| j|| jj# | j$j%dkr| j| jj& dkr| j'(| j| | j| jj) dkr| jdkr| jj*r| +  | j| jj, dkr| j-||d | j'.  |  jd7  _| j /|  ||fS )a  Perform a training step on a batch of inputs and log the process.

        Args:
            batch (Dict): Input batch.
            batch_n_steps (int): Number of steps needed to complete an epoch. Needed for logging.
            step (int): Current step number in this epoch.
            loader_start_time (float): The time when the data loading is started. Needed for logging.

        Returns:
            Tuple[Dict, Dict]: Model outputs and losses.
        Nr   ro  r$  Tr|   r   F)rf  r  -r  avg_loader_timeavg_step_timeavg_r]   current_lr_
current_lr   )r  loader_time)rO  rs  )0r   on_train_step_startrS  r  r   r   rd  r   rR   rT   r   rr  r   rn  r  r}   r   r^   r   r   r   r   r  r   r   r  r   update_valuesr   r;   r:  rr   roundr   print_train_step
avg_valuesr   r   r=   rA   train_step_statsrD   rG   r   rC    update_training_dashboard_loggerflushon_train_step_end)r   rO  r  r0  r  r  outputs_per_optimizerre  r  rb  rc  rs  loss_dict_newrf  total_step_timer*  r^   r   r   r   rV  rQ  keep_avg_updateupdate_eval_valuesr   valuelrsr  rs   rs   rt   r]  !  s   






 

zTrainer.train_stepc                 C   s  | j du r| j| j| jdd| _ | | j | _ td | jdkr(| jj	
  n| j
  t }| j|  | j  t }t| j }t| j D ]F\}}| ||||\}}|du rbtd qJ~t }| jjdur| j| jj dkr|   | jdkr| jj	
  n| j
  td qJt | }| j|  | jdur| jjrt| jtr| jD ]
}	|	dur|	  qnt| jtr| j  D ]
}	|	dur|	  qn| j  | j!j"dkrd|i}
|
#| j$j% | j&'| j|
 | jj(r| j&)| j| j tj*+  dS )zQMain entry point for the training loop. Run training on the all training samples.NTrA  r|   zC [!] `train_step()` retuned `None` outputs. Skipping training step.r   
epoch_time),r   rI  r   r   r   r   set_grad_enabledr   r   rC  trainr  r   on_train_epoch_startr   print_train_startr  r   r]  r   r  r   rN   r   
eval_epochon_train_epoch_endr   r\   r   r   r0  rr   valuesr   r   r  r   r  rA   train_epoch_statsr>   model_weightsr   r7  )r   epoch_start_timer  batch_num_stepscur_steprO  rs  _r  r   epoch_statsrs   rs   rt   train_epoch  sj   











zTrainer.train_epochc                 C   sh   ||g}t |drt|dr|j|| S ||| S |dur$|| t|dr/|jj| S |j| S )a  
        Perform a evaluation forward pass. Compute model outputs and losses with no gradients.

        Args:
            batch (Dict): IBatch of inputs.
            model (nn.Module): Model to call evaluation.
            criterion (nn.Module): Model criterion.
            optimizer_idx (int, optional): Optimizer ID to define the closure in multi-optimizer training. Defaults to None.

        Returns:
            Tuple[Dict, Dict]: model outputs and losses.
        r   rC  N)r   r\  rC  	eval_stepr[  )r   rO  r   r   rZ  r^  rs   rs   rt   _model_eval_step  s   




zTrainer._model_eval_stepc                 C   st  t   g }i }t| jtrt| jdr.| || j| j\}}|du r-	 W d   dS nBdgt	| j }t
| jD ]4\}}| j}| || j||\}}	|du rZ W d   dS |||< |	ro|	d|	d| < ||	 q;| |}i }
| D ]
\}}||
d| < q{| j|
 | jjr| j||| jj W d   ||fS W d   ||fS 1 sw   Y  ||fS )a  Perform a evaluation step on a batch of inputs and log the process.

        Args:
            batch (Dict): Input batch.
            step (int): Current step number in this epoch.

        Returns:
            Tuple[Dict, Dict]: Model outputs and losses.
        r   Nr$  rh  ri  r  )r   no_gradr   r^   r   r   r   r  r   r  r   rl  r  rk  r   r   r  r   rK   r   print_eval_stepr  )r   rO  r0  rs  re  r*  r  r   outputs_r  r  r   r  rs   rs   rt   r  )  sF   





zTrainer.eval_stepc                 C   sp  | j du rt n| j | _ | jdu r"| jjr| j| j| jddnd| _t	d | j
  | j  t }d}d}t| jD ]/\}}| |}t | }| j d|i | ||\}}|du rgtd q>|}t }q>| jjdkr|durt| j
drt| j
jd	r| j
j||| j| j| j nt| j
d	r| j
||| j| j| j | j| j| j j tj   dS )
zWMain entry point for the evaluation loop. Run evaluation on the all validation samples.NTr  Fr  zD [!] `eval_step()` retuned `None` outputs. Skipping evaluation step.r   rC  eval_log)!r   r   r   r   rM   rL  r   r   r   r  r   evalr   print_eval_startr  r   rS  r  r  r   r  r   r   r\  r   rC  r  rA   r   
eval_statsr  r   r7  )r   r  rO  rs  r  r  r  r  rs   rs   rt   r  T  sZ   







zTrainer.eval_epochc                 C   sJ  | j   d}t| j ds| jdkr/t| j jdr/| jdkr'| j j| j}nF| j | j}n>t| j dsA| jdkrmt| j jdrm| j| j| jrK| jn| j	dd| _
| jdkrc| j j| j| j
d}n
| j | j| j
d}t| j ds| jdkrt| j jdr| jdkr| j j|| j| j| j dS | j || j| j| j dS dS dS )a  Run model test.

        Test run is expected to pass over test samples and produce logging artifacts.

        If ```model.test_run()``` is defined, it will be called and it is expected to set and execute everything
        in the model.

        Else if  ```mode.test()``` is defined, it will be called and it takes an test data loader as an argument
        and iterate over it.
        Ntest_runr|   testTr  test_log)r   r  r   r   rC  r  r   rN  r   r   r   r  r  rA   r   )r   test_outputsrs   rs   rt   r    s0   

$

zTrainer.test_runc                 C   s   | j jrW| jdks| j jrYtdtj| j j t	| j j
dd}d|v rNt|d tr2|d | _nt|d trN| jjrFd|d d| _n|d dd| _td| j dS dS dS )	zRestore the best loss from the args.best_path if provided else
        from the model (`args.continue_path`) used for resuming the trainingr   z" > Restoring best loss from %s ...r-  r.  
model_lossNr   z) > Starting with loaded last best loss %s)r   rw   r   ry   r   r  r   r   r  r   rx   r   rr   r   rq   r   rM   )r   chrs   rs   rt   _restore_best_loss  s   zTrainer._restore_best_lossc                 C   sd   t d t | _|dur|| _| j}|dur|| _n| j| _|   | j	| j
| jj || _dS )a  Run evaluation steps on the test data split. You can either provide the model and the test samples
        explicitly or the trainer use values from the initialization.

        Args:
            model (nn.Module, optional): Model to use for testing. If None, use the model given in the initialization.
                Defaults to None.

            test_samples (List[str], optional): List of test samples to use for testing. If None, use the test samples
                given in the initialization. Defaults to None.
        z > USING TEST SET...N)r   r  r   r   r   r   r   r  r   print_epoch_endr   r  )r   r   r   eval_samples_cachers   rs   rt   r    s   

zTrainer.testc                 C   s
  |    | j| _td| jjD ]s}| jdkrt  | j	
|  t | _| jjr+t nd| _|| _| j|| jj| j | jsF| jsF|   | jjrN|   || jjkr^| jjdkr^|   | j|| jjrj| jjn| jj | jjdv ry|   | j	|  d| _qdS )u8   🏃 train -> evaluate -> test for the number of epochs.r   r|   N)Nr   F)r  r   r   ranger   rU   r   distbarrierr   on_epoch_startr   r   rM   r   r   r   print_epoch_startr1   r   r   r  r  rL   r   r   r  r  r  r   on_epoch_end)r   r1  rs   rs   rt   _fit  s2   
zTrainer._fit   c              
   C   s   t   |}	 t  tj  zt  tj  || j_t	d| jj | 
  W d S  tyT } z|dkrIt|rI|d }t  tj  n W Y d }~n-d }~w ty| } z|dkrqt|rq|d }t  tj  n W Y d }~nd }~ww q)NTz > current batch size: %ir|   r{  )r&   gccollectr   r   r7  r   rW   r   r  r  r4  r'   	Exception)r   starting_batch_sizebs	exceptionrs   rs   rt   fit_with_largest_batch_size  s<   


z#Trainer.fit_with_largest_batch_sizec                 C   s  z|    | jjdkr| j  W dS W dS  tyk   td | jj	r2td | 
  |   | j|  t| j | jdkrFt  | jjdkrQ| j  z	td W Y dS  tyj   td Y Y dS w  ty   t| j t  td Y dS w )u&   Where the ✨️magic✨️ happens...r   z > Keyboard interrupt detected.z! > Saving model before exiting...r|   N)r  r   r   rA   finishKeyboardInterruptr   r  r   rB   r   r  r   on_keyboard_interruptr   r1   r   r  destroy_process_groupr   exit
SystemExitr   _exitBaseException	traceback	print_excr   rs   rs   rt   fit&  s6   





zTrainer.fitc                 C   sn   t  | _|r
|| j_|r| | d| j_d| j_|| j_dd g| _|| _	| j	
  |   | j	  | j	S )a4  Run training under the torch profiler.

        Example::
            Run torch profiler to profile CPU, GPU and memory usage with Tensorboard logging.

            >>> import torch
            >>> profiler = torch.profiler.profile(
            >>>        activities=[
            >>>         torch.profiler.ProfilerActivity.CPU,
            >>>         torch.profiler.ProfilerActivity.CUDA,
            >>>     ],
            >>>     schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=2),
            >>>     on_trace_ready=torch.profiler.tensorboard_trace_handler("./profiler/"),
            >>>     record_shapes=True,
            >>>     profile_memory=True,
            >>>     with_stack=True,
            >>> )
            >>> prof = trainer.profile_fit(profiler, epochs=1, small_run=64)
        Fi c                 S   s
   | j  S r&  )torch_profilerr0  )r,   rs   rs   rt   <lambda>i  s   
 z%Trainer.profile_fit.<locals>.<lambda>)r   rA   r   epocshsr   rM   rL   rU   callbacks_on_train_step_endr  startr  stop)r   r  rU   r   rs   rs   rt   profile_fitH  s   


zTrainer.profile_fitc                 C   sj   |  | j}|  | j}t||d| j| j| j| j| jr| j	nd| j
| j| j| jj| jj| jjd| _dS )z[Save the best model. It only saves if the current target loss is smaller then the previous.r   N)keep_all_best
keep_after	save_func)_pick_target_avg_lossr   r   r   r   r   r   r^   r   r   r   r   r1   rH   rI   rA   
save_modelr   r   r   rs   rs   rt   r   t  s    zTrainer.save_best_modelc                 C   s^   |  | j}|  | j}t| j| j| j| jr| jnd| j	| j
| j||d| jj| jjd
 dS )z"Save the current model checkpoint.Nr   )r  rF   r  )r  r   r   r   r   r   r^   r   r   r   r   r1   rF   rA   r  r  rs   rs   rt   r     s   
zTrainer.save_checkpointc                 C   s   d| j  d| j g}| jj| jdd|d |d urQ|d urSt| jdr<t| jjdr<| jj	||| j| j
| j d S t| jdrU| j	||| j| j
| j d S d S d S d S )Nzepoch-zstep-r8  r   )r   r   r   aliasesrC  	train_log)r   r   rA   r  r1   r\  r   r   rC  r  r   )r   rO  rs  r   rs   rs   rt   r    s2   


	z(Trainer.update_training_dashboard_loggerc                 C   sh   d}t | drz|  }W n ty   d}Y nw |du r2|j}|jdu r'i n|j}t|||j| S |S )a  Receive the optimizer from the model if model implements `get_optimizer()` else
        check the optimizer parameters in the config and try initiating the optimizer.

        Args:
            model (nn.Module): Training model.
            config (Coqpit): Training configuration.

        Returns:
            Union[torch.optim.Optimizer, List]: A optimizer or a list of optimizers. GAN models define a list.
        Nr!   )r   r!   rT  r^   r`   r]   )r   r   r^   optimizer_namer`   rs   rs   rt   r!     s   
zTrainer.get_optimizerc                 C   sD   d}t | drz|  }W n ty   d}Y nw |du r |j}|S )a  Set the initial learning rate by the model if model implements `get_lr()` else try setting the learning rate
        fromthe config.

        Args:
            model (nn.Module): Training model.
            config (Coqpit): Training configuration.

        Returns:
            Union[float, List[float]]: A single learning rate or a list of learning rates, one for each optimzier.
        Nr;  )r   r;  rT  r]   )r   r   r]   rs   rs   rt   r;    s   
zTrainer.get_lrc                 C   st   d}t | dr(z| |}W n ty   d}Y nw t|tr(t | ds(td|du r8|j}|j}t|||S |S )a  Receive the scheduler from the model if model implements `get_scheduler()` else
        check the config and try initiating the scheduler.

        Args:
            model (nn.Module): Training model.
            config (Coqpit): Training configuration.

        Returns:
            Union[torch.optim.Optimizer, List, Dict]: A scheduler or a list of schedulers, one for each optimizer.
        Nr"   r   za [!] Dictionary of schedulers are only supported with the manual optimization `model.optimize()`.)r   r"   rT  r   rr   r   ra   rb   )r   r   r^   r   ra   rb   rs   rs   rt   r"     s    
zTrainer.get_scheduler	Schedulerr   r   c                 C   s   | durH|j rHt| tr!| D ]}|dur|jr||_q||_q| S t| tr=|  D ]}|dur:|jr7||_q*||_q*| S |jrE|| _| S || _| S )z%Restore scheduler wrt restored model.N)rw   r   r   r\   
last_epochrr   r  )r   r   r   r   r   srs   rs   rt   r     s.   

zTrainer.restore_schedulerc                 C   s   d}|   }|S )zReceive the criterion from the model. Model must implement `get_criterion()`.

        Args:
            model (nn.Module): Training model.

        Returns:
            nn.Module: Criterion layer.
        N)r   )r   r   rs   rs   rt   r   '  s   
zTrainer.get_criterionc                 C   sF   i }|   D ]\}}t|ttfr|||< q|   ||< q|S )zDetach loss values from autograp.

        Args:
            loss_dict (Dict): losses.

        Returns:
            Dict: losses detached from autograph.
        )r   r   ro   rq   detachr-  item)re  rm  r   r  rs   rs   rt   rk  9  s   

zTrainer._detach_loss_dictkeep_avg_targetc                 C   s   |du st t|j dkrdS d}d| jv r5| jjr5d| jj |j v r1|d| jj  S tdt| jtrad}t	t | jD ]}d| |jv rW||d|  7 }qD|t | j }|S |j
dd}|S )z&Pick the target loss to compare modelsNr   rJ   r  z [!] Target loss not found in the keep_avg_target. You might be exiting the training loop before it is computed or set the target_loss in the model config incorrectly.	avg_loss_avg_loss)r  r   r  keysr   rJ   r   r   r^   r  get)r   r  target_avg_lossr*  rs   rs   rt   r  K  s&   zTrainer._pick_target_avg_lossr   c                 C   s\   t d}t j|dd}t d}|| || | jjdkr,dd |jD |_dS dS )	z3Set up the logger based on the process rank in DDP.r,   a)moderv   r   c                 S   s   g | ]
}t |tjs|qS rs   )r   loggingStreamHandler)rP  hrs   rs   rt   rR  s  s    z0Trainer._setup_logger_config.<locals>.<listcomp>N)	r  	getLoggerFileHandler	FormattersetFormatter
addHandlerr   r   handlers)r   r   
logger_newhandlerfmtrs   rs   rt   r   h  s   



zTrainer._setup_logger_configc                   C   s   t jdduS )z$Check if Nvidia's APEX is available.apexN)	importlibutil	find_specrs   rs   rs   rt   r   u  s   zTrainer._is_apex_availabler$  r&  )NTr|   )r   N)r  )Uri   rj   rk   ru   r   rm   r   r   r   r   r   r   r   rp   ro   r   propertyr   r   r{   r   r   staticmethodr   r   r
   r   r   r   r   r   r   r   r   r   	Optimizerr   r-   r   r	   r   r6  rE  rI  rL  rN  rS  rY  r_  rd  rq   rn  rt  ru  r~  r  ra   _LRSchedulerr   r]  r  r  r  r  r  r  r  r  r  r  r  r+   r   r   r  r!   r;  r"   r   r   rk  r   r  r   r   rs   rs   rs   rt   r   !  s   	


  )



+
%

=	
 )))(

"		

&} 
'F


+
:*
 

", &&
r   )Nr  r  r  r   r  r  r   r  r  
contextlibr   dataclassesr   r   inspectr   typingr   r   r   r	   r
   r   torch.distributeddistributedr  r  r   r   torch.nn.parallelr   r   torch.utils.datar   trainer.analyticsr   trainer.callbacksr   trainer.generic_utilsr   r   r   r   r   r   r   r   
trainer.ior   r   r   r   r   trainer.loggingr   r   r    trainer.trainer_utilsr!   r"   r#   r$   r%   trainer.utils.cuda_memoryr&   r'   trainer.utils.distributedr(   r)   r*   r+   r  r   r  r-   r.   ru   r   rs   rs   rs   rt   <module>   sH   (

 #=