o
    i"                     @   s  d Z ddlZddlZddlZddlZddlmZ ddlmZ ddlm	Z	 ddl
mZmZmZmZmZmZmZ ddlZddlZddlZddlZddlZddlmZ ddlmZ dd	lmZ dd
lm Z  ddl!m"Z" ddl#m$Z$m%Z%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/ ddl0m1Z1 ddl2m3Z3 ddl4m5Z5m6Z6 ddl7m8Z8 ddl9m:Z: ej;< rddl=m>Z> eej?edkrddl@mAZAmBZB n	ed ddZBdZAzddlCZCW n eDy   dZCY nw ejEG dd dZFG dd dZGdS )!zTrainer module.    N)contextmanager)is_dataclass)Path)DictIterableListOptionalSequenceTupleUnion)parse)check_argument_types)AbsIterFactory)average_nbest_models)calculate_all_attentions)AbsBatchStepSchedulerAbsEpochStepSchedulerAbsSchedulerAbsValEpochStepScheduler)add_gradient_noise)	to_device)recursive_average)set_all_random_seed)AbsESPnetModel)DistributedOption)ReporterSubReporter)build_dataclass)kwargs2args)ReduceOp1.6.0)
GradScalerautocastTc                 c   s    d V  d S N )enabledr$   r$   I/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/train/trainer.pyr"   -   s   
r"   c                   @   s
  e Zd ZU eed< eed< eed< eed< eed< eed< eed< eed< ee ed	< eed
< eed< eed< eed< e	e
ef ed< eed< eed< eed< ee ed< e	eee f ed< eed< ee ed< eee  ed< ee ed< eed< eed< eed< dS )TrainerOptionsngpuresumeuse_amptrain_dtype
grad_noise
accum_grad	grad_clipgrad_clip_typelog_intervalno_forward_runuse_matplotlibuse_tensorboard	use_wandb
output_dir	max_epochseedsharded_ddppatiencekeep_nbest_modelsnbest_averaging_intervalearly_stopping_criterionbest_model_criterionval_scheduler_criterionunused_parameterswandb_model_log_intervalcreate_graph_in_tensorboardN)__name__
__module____qualname__int__annotations__boolstrfloatr   r   r   r   r	   r$   r$   r$   r&   r'   9   s6   
 r'   c                   @   s  e Zd ZdZdd ZedejdefddZ	edej
fd	d
Ze	d%deeef dejjdedeejj deee  dee defddZededeejj deee  dededee deddfddZedejjde e!e"e e#eej$f f  deejj deee  dee de%dedede&fddZ'ee( dejjde e#eej$f  de%dededdfd d!Z)ee( dejjd"ee de e!e"e e#eej$f f  de%deddfd#d$Z*dS )&Trainera~  Trainer having a optimizer.

    If you'd like to use multiple optimizers, then inherit this class
    and override the methods if necessary - at least "train_one_epoch()"

    >>> class TwoOptimizerTrainer(Trainer):
    ...     @classmethod
    ...     def add_arguments(cls, parser):
    ...         ...
    ...
    ...     @classmethod
    ...     def train_one_epoch(cls, model, optimizers, ...):
    ...         loss1 = model.model1(...)
    ...         loss1.backward()
    ...         optimizers[0].step()
    ...
    ...         loss2 = model.model2(...)
    ...         loss2.backward()
    ...         optimizers[1].step()

    c                 C   s   t d)Nz!This class can't be instantiated.)RuntimeError)selfr$   r$   r&   __init__n   s   zTrainer.__init__argsreturnc                 C   s   t  sJ tt|S )z?Build options consumed by train(), eval(), and plot_attention())r   r   r'   )clsrN   r$   r$   r&   build_optionsq   s   

zTrainer.build_optionsparserc                 C   s   dS )z2Reserved for future development of another TrainerNr$   )rP   rR   r$   r$   r&   add_argumentsw   s   zTrainer.add_argumentsr   
checkpointmodelreporter
optimizers
schedulersscalerr(   c                 C   s   t j| |dkrdt j  ndd}||d  ||d  t||d D ]	\}}	||	 q(t||d D ]\}
}	|
d urF|
|	 q9|d ur^|d	 d u rWtd
 n||d	  td|   d S )Nr   zcuda:cpu)map_locationrU   rV   rW   rX   rY   zscaler state is not foundzThe training was resumed using )	torchloadcudacurrent_deviceload_state_dictziploggingwarninginfo)rT   rU   rV   rW   rX   rY   r(   states	optimizerstate	schedulerr$   r$   r&   r)   |   s"   

zTrainer.resumetrain_iter_factoryvalid_iter_factoryplot_attention_iter_factorydistributed_optionNc	           #         s  t  sJ t|sJ t|t|t|ks!J t|t|ft|jtr,|jg nt|jdkr<td dg|_|j t	|j
}	t |jrmttjtdk rWtd|jritdu rbtdtjj }
nt }
nd}
|jr|	d  r| j|	d ||||
|jd	  d }||jd krtd
|  |jr|jrtjjj||d}n8tjj j!||jdkrtj"# gnd|jdkrtj"# nd|j$d}n|jdkrtjj j%|t&t'|jd}n|}|j(r|jr|j)dkrddl*m+} |t,|	d d }|t,|	d d }nd}t-. }t'||jd D ]}||kr@t/d0||jt12t-. | ||  |j| d   nt/| d|j d t3|j4|  5| 6d}| j7||||8|||
|||d	}W d   n	1 szw   Y  6d}| j9||8||||d W d   n	1 sw   Y  |jr|j)dkr|dur׈6d}| j:||	d ||8|||d W d   n	1 sw   Y  |D ]}t|t;r|<j=|j>  qt|t?r|<  q|jr|D ]}t|tjj@jAr|B  q|jr|j)dkrt/C  |jDr+E|	d  |dur>jF|dd jF|dd |jGrFH  tI|J J dd |D dd |D |
durc|
J ndd|	d  tI|J |	| d   |	d! }|K s| r|L  |M| d  g }|jND ]C\}}}O||rوP|||}||kr|	| d"| d# }|K s| r|L  |M| d  |Q| d"|  qt|dkrt/d$ n
t/d%d&R|  |jSdko||jS dk}|r?|jGr?ddlT}t/d' |jUd(|jVjW d)d*|id+}|Xt,|	| d   d,| ||kr5d-nd.g}|jY||d/ g } tZ j[ fd0d|jND  }!|j\dkrl||j\ dkrlt]|	|jN d1| d2d3 t'd|D ]}"|	|" d  }| r|"|!vr|L  | Qt,| qqt| dkrt/d4d&R|   |rtd5| d2  n |j^durĈj_|j^g|j`R  r nqt/d6|j d7 |jr|j)dkrt]|	|jN d8 dS dS )9zDPerform training. This method performs the main process of training.r   z,No keep_nbest_models is given. Change to [1]   r    z3Require torch>=1.6.0 for  Automatic Mixed PrecisionNz/Requiring fairscale. Do 'pip install fairscale'zcheckpoint.pth)rT   rU   rW   rX   rV   rY   r(   z/The training has already reached at max_epoch: )modulesharded_optimizer)
device_idsoutput_devicefind_unused_parameters)rp   )SummaryWritertensorboardtrainvalidz0{}/{}epoch started. Estimated time to finish: {}/zepoch started)	rU   rW   rX   iteratorrV   rY   summary_writeroptionsrl   )rU   rx   rV   rz   rl   att_plotatt_ws)rU   r5   ry   rx   rV   rz   images)key1c                 S   s   g | ]}|  qS r$   
state_dict).0or$   r$   r&   
<listcomp>X  s    zTrainer.run.<locals>.<listcomp>c                 S   s    g | ]}|d ur|  nd qS r#   r   )r   sr$   r$   r&   r   Y  s    )rU   rV   rW   rX   rY   z	epoch.pthz
latest.pth.z	.best.pthz'There are no improvements in this epochz!The best model has been updated: z, z!Logging Model on this epoch :::::model_rU   improved)nametypemetadatazepoch-best )aliasesc                    s<   g | ]\}}} ||rt|||d t  qS r#   )hassetsort_epochsmax)r   phkmr:   rV   r$   r&   r     s    
tillepoch)rV   r5   r=   nbestsuffixzThe model files were removed: zjThe gradients at all steps are invalid in this epoch. Something seems wrong. This training was stopped at zThe training was finished at z epochs )rV   r5   r=   r   )ar   r   r   len
isinstancer:   rE   rb   rc   r   r5   r   r*   Vr\   __version__rK   r8   	fairscaleoptimgrad_scalerShardedGradScalerr!   r)   existsr(   	get_epochr6   distributednndata_parallelShardedDataParallelparallelDistributedDataParallelr^   r_   r?   DataParallellistranger3   	dist_ranktorch.utils.tensorboardrs   rH   timeperf_counterrd   formathumanfriendlyformat_timespanr   r7   	set_epochobservetrain_one_epoch
build_itervalidate_one_epochplot_attentionr   step	get_valuer>   r   ossOSSconsolidate_state_dictlog_messager2   matplotlib_plottensorboard_add_scalarr4   	wandb_logsaver   
is_symlinkunlink
symlink_tor=   r   get_best_epochappendjoinr@   wandbArtifactrunidadd_filelog_artifactr   unionr;   r   r9   check_early_stoppingr<   )#rP   rU   rW   rX   ri   rj   rk   trainer_optionsrl   r5   rY   start_epochdp_modelrs   train_summary_writervalid_summary_writer
start_timeiepochsub_reporterall_steps_are_invalidrh   rf   p	_improved_phaser   _mode
best_epoch	log_modelr   artifactr   _removednbestser$   r   r&   r      s  
$




















zTrainer.runrx   rz   c
           &   
   C   sR  t  sJ |j}
|j}|j}|j}|j}|j}|j}|j}|j	}|	j
}|d u r>ztt|d d}W n ty=   d}Y nw |  d}td|dkrOdnd}t }t||dd	D ]\}\}}t|tsqJ t||rtj
|tj |dkr |S ||d
< t||dkrdnd}|rd}q_|r|d	kr|d ur|rt|d}n?|}|d urzt|j|}W n t tfy   t!"d Y n!w z
|j#||dd W n t$y   t!"d Y nw ~nt!"d ~t%|d u |&d} |d&i |}t|trc|d }|d }|d }|'d} | d urbt| t(sbt| tj)s.t*dt|  | + dkr?t*d| +  d| + d	kr^| D ]}!|!| d krUt*dqH| d , } n| , } n|\}}}d } W d    n	1 suw   Y  dd |- D }|d	ks|r|||j. / }t0|||\}}|| }|r|tj
1 9 }|| }W d    n	1 sw   Y  |2|| |&d |d ur|3|4  n|4  W d    n	1 sw   Y  || dkr|d urt|D ]\}"}#| d ur|"| krq|5|# q|
r!t6||7 dddd tj8j9j:|; ||d }$t|$tj)s9t|$}$t<|$smt!"d!|$ d" |d urlt|D ]\}"}#| d ura|"| kraqQ|=|# |>  qQnPd}|&d#@ tt?||D ]0\}"\}#}%| d ur|"| krq||d ur|=|# |>  n|#=  t|%t@r|%=  q|W d    n	1 sw   Y  t|D ]\}"}#| d ur|"| krѐq|#A  q|2td$d t|D t | d% t }|B  || dkrt!C|D|  |d ur|E||  |r|F  q_|r'|Gd	 tj
|tj |S )'N   
   d   Tr   r^   rZ   	iter_timerm   utt_idFrn   zUinpect.signature() is failed for the model. The graph can't be added for tensorboard.)use_strict_tracez]summary_writer.add_graph() is failed for the model. The graph can't be added for tensorboard.z1model.module is not found (This should be a bug.)forward_timelossstatsweight	optim_idxz4optim_idx must be int or 1dim torch.Tensor, but got    z
dim tensorz@optim_idx must be 1dim tensor having same values for all entriesc                 S   s   i | ]\}}|d ur||qS r#   r$   )r   r   vr$   r$   r&   
<dictcomp>Q  s    z+Trainer.train_one_epoch.<locals>.<dictcomp>backward_time      ?g?)durationetascale_factor)max_norm	norm_typezThe grad norm is z. Skipping updating the model.optim_step_timec                 S   sB   i | ]\}}t |jD ]\}}d |v rd| d| |d  qqS )lrr   _lr)	enumerateparam_groups)r   irf   jpgr$   r$   r&   r     s    )
train_timer$   )Hr   r,   r-   r.   r/   r0   r1   r(   r4   rA   r   r   r   	TypeErrorru   r\   tensortor   r   r   measure_iter_timer   dictr   
all_reducer   SUMr   getattrr   forward
ValueErrorrb   rc   	add_graph	Exceptionr"   measure_timegetrE   TensorrK   dimitemitemsdtypesumr   get_world_sizeregisterscalebackwardunscale_r   get_total_countr   utilsclip_grad_norm_
parametersisfiniter   updatera   r   	zero_gradnextrd   r   r   r   fill_)&rP   rU   rx   rW   rX   rY   rV   ry   rz   rl   r,   r-   r.   r/   r0   r1   r(   r4   rA   r   r   iterator_stopr   iiterr   batch_model_argsretvalr   r   r   r   r   ioptrf   	grad_normrh   r$   r$   r&   r     s|  
 S 


&
8

	










zTrainer.train_one_epochc                 C   s4  t  sJ |j}|j}|j}|  td|dkrdnd}	|D ]d\}
}t|t	s1J t
||rBtj|	tj |	dkrB d S |
|d< t||dkrNdnd}|rTq"|di |}t|t	ri|d }|d }n|\}}}|dkst|r|t|||\}}||| |  q"|r|	d tj|	tj d S d S )	Nr   r^   rZ   r   r   r   rm   r$   )r   r(   r1   r   evalr\   r  r  r   r  r   r  r   r	  r   r   r  r#  r$  )rP   rU   rx   rV   rz   rl   r(   r1   r   r%  r   r'  r*  r   r   _r$   r$   r&   r     s:   






zTrainer.validate_one_epochr5   c              	   C   s  t  sJ dd l}|j}|j}	|d dd lm}
 ddlm} |	  |D ]E\}}t
|ts6J t|ttt| t|ksTJ ttt| t|f||d< t||dkr`dnd}|	rfq&t||}| D ]\}}t|t|ksJ t|t|ft||D ]\}}t
|tjr|   }|jdkr|d  }n|jdks|jd	krtd
|j |
dt| \}}|
j|d |d fd}|d	t|}t|d	kr|g}t||D ]5\}}|j| t!j"dd |#| d|  |$d |%d |j&'|dd |j('|dd q|d ur;|| | d|)  d }|j*j+ddd |,| |d urN|-| d| ||)  |j.rfdd l/}|0d| d| |1|i qqo|  q&d S )Nr   Agg)MaxNLocatorr   r^   rZ   r      rm   zMust be 2 or 3 dimension: r   g?)figsizeauto)aspectr.  InputOutputT)integerr   zep.png)parentsexist_okzattention plot/)2r   
matplotlibr(   r1   usematplotlib.pyplotpyplotmatplotlib.tickerr0  r-  r   r  r   r   r#  itervaluesr   r   r  ra   r\   r  detachrZ   numpyndimrK   	figaspectFiguresubplotsimshowastypenpfloat32	set_title
set_xlabel
set_ylabelxaxisset_major_locatoryaxisr   parentmkdirsavefig
add_figurer4   r   logImage)rP   rU   r5   ry   rx   rV   rz   r:  r(   r1   pltr0  idsr'  att_dictr   att_listid_att_wwhfigaxesaxawr   r   r$   r$   r&   r     sn   


$






 
&zTrainer.plot_attention)r   )+rB   rC   rD   __doc__rM   classmethodargparse	Namespacer'   rQ   ArgumentParserrS   staticmethodr   rH   r   r\   r   Moduler   r	   r   	Optimizerr   r   r!   rE   r)   r   r   r   r   r   r
   r   r   r  r   rG   r   no_gradr   r   r$   r$   r$   r&   rJ   W   s    




	
  7

	
  2rJ   )T)Hrc  re  dataclassesrb   r   
contextlibr   r   pathlibr   typingr   r   r   r   r	   r
   r   r   rB  rI  r\   torch.nntorch.optimpackaging.versionr   r   	typeguardr   "espnet2.iterators.abs_iter_factoryr   'espnet2.main_funcs.average_nbest_modelsr   +espnet2.main_funcs.calculate_all_attentionsr    espnet2.schedulers.abs_schedulerr   r   r   r   &espnet2.torch_utils.add_gradient_noiser    espnet2.torch_utils.device_funcsr    espnet2.torch_utils.recursive_opr   'espnet2.torch_utils.set_all_random_seedr   espnet2.train.abs_espnet_modelr   espnet2.train.distributed_utilsr   espnet2.train.reporterr   r   espnet2.utils.build_dataclassr   espnet2.utils.kwargs2argsr   r   is_availabletorch.distributedr   r   torch.cuda.ampr!   r"   r   ImportError	dataclassr'   rJ   r$   r$   r$   r&   <module>   sX    $
