o
    ix                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dlmZ	 d dl
mZmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ zd dlZW n   dZY edd ZG dd dZdS )    N)tqdm)datetime)autocast
GradScaler)nullcontextcontextmanager)Path)	to_device)recursive_average)average_checkpoints)ShardedGradScalerc                 c   sB    | rt   d V  W d    d S 1 sw   Y  d S d V  d S N)r   )enabled r   N/home/ubuntu/.local/lib/python3.10/site-packages/funasr/train_utils/trainer.pymaybe_autocast   s   "
r   c                	   @   s   e Zd ZdZ				ddedededefdd	Z	
	
	
	
	
	
dddZ	
	
	
	
dddZ	
	
	
	
	
	
	
	
dddZ		
	
	
	
dddZ
							
	
	
				
d ddZd!ddZd
S )"Trainera   
    A simple trainer class for training a PyTorch model, saving checkpoints at the end of each epoch,
    and optionally resuming from a saved checkpoint.

    Attributes:
        max_epoch (int): Maximum number of epochs for training.
        model (torch.nn.Module): The model to be trained.
        optim (torch.optim.Optimizer): The optimizer to use for training.
        scheduler (torch.optim.lr_scheduler._LRScheduler): The learning rate scheduler.
        dataloader_train (torch.utils.data.DataLoader): DataLoader for the training dataset.
        dataloader_val (torch.utils.data.DataLoader): DataLoader for the validation dataset.
        output_dir (str): Directory where model checkpoints will be saved.
        resume (str, optional): Path to a checkpoint to resume training from.
    F./use_ddpuse_fsdpuse_fp16
output_dirc           	   	   K   s  || _ tj| j stj| j dd |dd| _d| _|dd| _|| _	|| _
|| _|dd| _|d	d
| _d| _|| _|dd| _|dd| _| jdk rW| j| _| j| jksaJ d|dd| _|dd| _|dd| _|dd| _|dd| _|dd| _z
t }t }W n   d}d}td Y || _|| _d| _d| _ d| _!d| _"d| _#i | _$d| _%d| _&i | _'i | _(|dd | _)d| _*d| _+d| _,|d!d | _-| j-rt.j/|d"d# t.j0||d$d%|d&d'|d(d)|d*dd+ d,S d,S )-a  
        Initializes the Trainer class with the model, optimizer, scheduler, dataloaders, and other settings.

        Args:
            model (torch.nn.Module): The model to be trained.
            optim (torch.optim.Optimizer): The optimizer to use for training.
            scheduler (torch.optim.lr_scheduler._LRScheduler): The learning rate scheduler.
            dataloader_train (torch.utils.data.DataLoader): The DataLoader for the training dataset.
            dataloader_val (torch.utils.data.DataLoader): The DataLoader for the validation dataset.
            **kwargs: Additional keyword arguments:
                      max_epoch (int): The maximum number of epochs for training.
                      output_dir (str): The directory where model checkpoints will be saved. Default is './'.
                      resume (str, optional): The file path to a checkpoint to resume training from.
        Texist_okresumer   	max_epochd   devicecudalog_interval2   save_checkpoint_intervali  validate_intervalz8save_checkpoint_interval must equal to validate_intervalkeep_nbest_modelsi  avg_keep_nbest_models_typeaccavg_nbest_model
   
accum_grad   	grad_clipg      $@grad_clip_typeg       @z1distributed is not initialized, only single shard         reset_gpu_cacheF	use_wandbwandb_tokenkeywandb_project
my_project
wandb_teammy_teamwandb_exp_namemy_exptraining)configprojectentitynamedirjob_typereinitN)1r   ospathexistsmakedirsgetr   start_epochr   
local_rankr   r   r   r   batch_totalr   r!   r"   r$   r%   r'   r)   r+   r,   distget_rankget_world_sizeloggingwarningrank
world_sizetrain_acc_avgtrain_loss_avgval_acc_avgval_loss_avgbest_acc_idxsaved_ckptsstep_or_epochbest_step_or_epochval_acc_step_or_epochval_loss_step_or_epochr/   start_data_split_i
start_stepstep_in_epochr0   wandblogininit)	selfrH   r   r   r   r   kwargsrO   rP   r   r   r   __init__1   s|   




zTrainer.__init__Nc                 K   s  |du rdn|}| j dkrtd| d| j d i d|d|d| jd	| d
| d| d| jd| jd| jd| j	d| j
d|d|ddd|ddd| jd|ddd|dd}	|}t|dr{|j |	d	< |r| |	d< tj| jdd |du rd| }
nd| d| }
tj| j|
}t|	| td|  ttj| jd}t|	| | j	d kr|
| _	| j
d!kr | j|
 | j| j	 kr|
| _	ttj| jd"}t|	| td#| j| j	 d$d%|  nxtd&| j|
 d$d'| j| j	 d$d%tj| j| j	  nY| j
d(kru| j|
 | j| j	 krV|
| _	ttj| jd"}t|	| td)| j| j	 d$d%|  n#td*| j|
 d$d+| j| j	 d$d%tj| j| j	  ntd, t| d-| j
 d.|
 | j|
< | jdkrt| j| jkr| j
d!krt| j| jjd/}n	t| j| jjd/}|| jv r| j|= tj| j|}td0|  tj|rt| | js| jrt !  dS dS )1a`  
        Saves a checkpoint containing the model's state, the optimizer's state,
        and the scheduler's state at the end of the given epoch. This method is
        intended to be called at the end of each epoch to save the training progress.

        Args:
            epoch (int): The epoch number at which the checkpoint is being saved.
        Nr   zSave checkpoint: , rank: 
epochstep
total_step
state_dict	optimizer	schedulerrV   rY   rZ   rX   r%   r]   data_split_idata_split_numr*   rI   rR   rQ   modulescaler_stateTr   model.pt.ep.zCheckpoint saved to model.ptr.   r&   zmodel.pt.bestzUpdate best acc: z.4f, zNo improvement in acc: z < losszUpdate best loss: zNo improvement in loss: z > Undoval__step_or_epochr2   zDelete: )"rO   rM   inforH   rI   ri   rV   rY   rZ   rX   r%   rF   hasattrrn   rB   rE   r   rC   jointorchsaver   printgetattrr$   lenminmaxrD   remover   r   rJ   barrier)ra   rf   rg   modeloptimrk   scalerr]   rb   state	ckpt_namefilenamelatest	best_ckptr3   r   r   r   save_checkpoint   s   	



4
4

zTrainer.save_checkpointc                 C   sz  | j r-tj| jd}tj|r%tj|dd}|d | _|d }|	 }|
 D ]B}	|	ds?d|	 |
 v r?d|	 }
n|	drTd|	 |
 vrT|	ddd}
n|	}
|
|
 v rc||
 ||	< q+td	|	 d
|
  q+|| ||d  ||d  |durd|v r||d  |d | _d|v r|d ni | _d|v r|d ni | _d|v r|d nd| _d|v r|d nd| _d|v r|d nd| _d|v r|d nd| _| jdu rdn| j| _d|v r|d nd| _| jdu rdn| j| _t|d  d|v r|d nd| _d|v r|d nd| _|| j td| d ntd| d | js5| jr;t  dS dS )z
        Resumes training from a checkpoint at the given file path.
        Loads the model's state, the optimizer's state, and the scheduler's state.

        Args:
            resume_path (str): The file path to the checkpoint to resume from.
        rr   cpu)map_locationrf   ri   zmodule.r.   r*   zMiss key in ckpt: model: z, ckpt: rj   rk   Nro   rV   rY   rZ   rX   rl   r   rI   rg   r]   rQ   rR   z%Checkpoint loaded successfully from ''zNo checkpoint found at 'z', does not resume status!)r   rB   rC   rz   r   isfiler{   loadrG   ri   keys
startswithreplacer}   load_state_dictrV   rY   rZ   rX   r[   rI   r\   r]   rQ   rR   tor   r   r   rJ   r   )ra   r   r   rk   r   ckpt
checkpoint	src_state	dst_statekk_ddpr   r   r   resume_checkpoint   sf   



zTrainer.resume_checkpointc	                  K   s6  | j s| jr
t  td| d| j d |  | j}
|	  i }t
d| j}|j| t }|}t|D ]L\}}|  jd7  _|  jd7  _t }|| d|d< t|| j}t}| j sj| jru||
 dkrs|jn|}|  t }t| j |di |}W d   n1 sw   Y  |\}}}d	d
 | D }| j s| jr|||j  }| j s| jrtj|tjj d ||  }|| j!9 }||
 }t }|| d|d< | jr|"|#  n|#  t }|| d|d< | j$||	%dd  |& ' (  ||	%dd d  | _$d|v r?| j)||	%dd  |d & ' (  ||	%dd d  | _)W d   n	1 sJw   Y  |d |
 dkrI| j*dkrt
j+j,j-|. | j*| j/d}t
0|st1d| d |	  q=| j s| jrt  | jr|2| |3  n|2  |2  |j	dd | j s| jrt
j| j$t
j4d| j}t
j| j)t
j4d| j}tj|tjj d tj|tjj d |& ' ( | j! | _$|& ' ( | j! | _)t | |
 d}t }|| d|d< ||d< |5 d }d}t6|dr t7|}| j8||||	%dd | j|||
|& ' (  |||d|	%dd|	%ddd | j| j9 dkr`| j:|||||d | jd | j| j; dkr| j<||||||d | j|	%dd|	%dd| j$| j)d t }q=| j s| jrt  dS dS ) z
        Defines the training process for a single epoch with gradient accumulation.
        Args:
            epoch (int): The current epoch number.
        zTrain epoch: rd   re   r   r*   0.3f	data_loadNc                 S      i | ]\}}|d ur||qS r   r   .0r   vr   r   r   
<dictcomp>      z'Trainer.train_epoch.<locals>.<dictcomp>opforward_timebackward_and_AllReaduce_timer\   r&   )max_norm	norm_typezThe grad norm is z. Skipping updating the model.T)set_to_nonedtype
optim_time
total_time__len__trainrl   rm   )log_stepr]   batch_num_epochlrrt   speed_statsstatswritertagrl   rm   )r   dataloader_valrf   r   rg   r]   )
r   r   rk   r   rg   r]   rl   rm   rR   rQ   r   )=r   r   rJ   r   rM   rx   rO   r   r)   	zero_gradr{   tensorr   r   batch_sampler	set_epochtimeperf_counter	enumeraterI   r]   r	   r   no_syncr   r   itemstyper   sum
all_reduceReduceOpSUMrP   scalebackwardrR   rF   detachr   itemrQ   r+   nnutilsclip_grad_norm_
parametersr,   isfiniterN   rg   updatefloat32get_last_lrry   r   logr"   validate_epochr!   r   ) ra   r   r   rk   r   dataloader_trainr   rf   r   rb   r)   r   iterator_stoptime_begtime5	batch_idxbatchtime1
my_contexttime2retvalrt   r   weighttime3time4	grad_normrR   rQ   r   r   r   r   r   r   train_epochO  s  
	

1




	


zTrainer.train_epochc                 K   s  | j s| jr
t  td| d| j d |  t	 W i }t
 }td| j}|j| t|D ]\}	}
| j sE| jrTt|tjj |dkrT nt
 }|| d|d< t|
| j}
t
 }|di |
}t
 }|| d|d< |\}}}dd	 | D }| j s| jr|||j  }| j s| jrtj|tjjd
 ||  }|| j9 }|}t
 }t|r6| j|	 |    |	d  | _d|v r| j |	 |d     |	d  | _ | j s| jr6tj| jtj!d| j}tj| j tj!d| j}tj|tjjd
 tj|tjjd
 |   | j | _|   | j | _ t
 }d}t"|drFt#|}| j$||	|d|   |||dd	 q:| j sc| jrp|%d t|tjj W d   n	1 s{w   Y  |&dddu rd| }nd| d|&d }| j | j'|< | j| j(|< |)  | j s| jrt  td| j}dS dS )z
        Defines the validation process for a single epoch.
        Should be implemented with the actual model validation steps.

        Args:
            epoch (int): The current epoch number.
        zValidate epoch: rd   re   r   r   r   r   c                 S   r   r   r   r   r   r   r   r   <  r   z*Trainer.validate_epoch.<locals>.<dictcomp>r   r*   r&   r   r   r-   val)r   r   rt   r   r   r   r   Nr]   rp   rq   r   )*r   r   rJ   r   rM   rx   rO   evalr{   no_gradr   r   r   r   r   r   r   r   r   r   r   r	   r   r   r   r   rP   r   rT   r   r   r   rS   r   ry   r   r   fill_rF   rY   rZ   r   )ra   r   r   rf   r   rb   r   r   r   r   r   r   r   r   r   rt   r   r   r   rT   rS   r   r   r   r   r   r     s   


RzTrainer.validate_epochr   r#   r-   r   r*   c              	   K   s  |d | j  dkri|d ur|n|}dtj d d d tj d d d tj d d d tj d d d }t| |
 d}t| |
 d}d	g |
 d| j
 d	| d
| j d| d
| d|d  d
| d| d| j d|dd|ddt|dd|dd|dddd | D  d| d| }t| d| j
 d|
 |d| j
 d|
 |i}|	d urV|	d| j
 d|
 || j |	d| j
 d|
 || j | D ](\}}|	d| j
 d| d
|
 | | j | |d| j
 d| d
|
 < q| D ])\}}|	d| j
 d| d
|
 t|| j t||d| j
 d| d
|
 < q,| jrktd urmtj|| jd d S d S d S d S ) Nr*   r   zWGPU, memory: usage: {:.3f} GB, peak: {:.3f} GB, cache: {:.3f} GB, cache_peak: {:.3f} GBi   	_loss_avg_acc_avgr.   rd   z	, epoch: /z, data_slice: z, step_in_slice: z, step_in_epoch: z, total step: z, (loss_avg_rank: z.3fz), (loss_avg_slice: z), (ppl_avg_slice: z.3ez), (acc_avg_slice: z), (lr: z), c                 S   s*   g | ]\}}|t |   d fqS )   )roundr   r   r   r   r   r   r   
<listcomp>  s   * zTrainer.log.<locals>.<listcomp>rs   rO   z_loss/z_lr/
stats_rank_)rg   )r   formatr{   r   memory_allocatedmax_memory_allocatedmemory_reservedmax_memory_reservedr~   rz   rO   r   rI   mathexpr   rM   rx   
add_scalarr   r   r0   r^   r   )ra   rf   r   r]   r   r   rt   r   r   r   r   rl   rm   r   rb   gpu_infoloss_avg_epochacc_avg_epochdescriptiondescription_dictr3   varr   r   r   r     s   
	


 " $
3zTrainer.logc                 C   sB   | j s| jr
t  |d ur|  | j s| jrtj  d S d S r   )r   r   rJ   r   closer{   distributeddestroy_process_group)ra   r   r   r   r   r     s   zTrainer.close)FFFr   )NNNNNN)NNNN)NNNNNNNN)r   r   r   r#   r-   r-   NNNr   r   r*   Nr   )__name__
__module____qualname____doc__boolstrrc   r   r   r   r   r   r   r   r   r   r   r   !   sr    
\
w
R
 H
t
Kr   )r   rB   r   r{   rM   r   r   torch.distributedr  rJ   torch.cuda.ampr   r   
contextlibr   r   pathlibr   funasr.train_utils.device_funcsr	   funasr.train_utils.recursive_opr
   'funasr.train_utils.average_nbest_modelsr   *torch.distributed.fsdp.sharded_grad_scalerr   r^   r   r   r   r   r   r   <module>   s,    
