o
    iW                     @   s   d Z ddlZddlZddlZddlZddlZddlZddlZddl	m
Z
 ddlmZ ddlmZ dddZd	d
 Zdd Zdd ZedkrPeejdd  dS dS )z3Automatic speech recognition model training script.    N)__version__)	strtobool)BATCH_COUNT_CHOICESTc                 C   sB  | du rt jdt jt jd} | jdddd | jddd	d | jd
ddd | jddtdd | jddddd | jddg ddd | jddtddgdd | jdt|dd  | jd!d"td#d | jd$|d%d& | jd'd"td(d | jd)td*d+ | jd,d-d.d/d0d1 | jd2d3td4d5d6 | jd7d8d9td:d | jd;dtd/d<d= | jd>d?td@d | jdAd9tdBd | jdCtddDd6 | jdEtddFd6 | jdGtddHd6 | jdId"tdJd | jdKdLtg dMdNd | jdOdPtdQd | jdRdStdTd | jdUdddVd | jdWdddXd | jdYtd"dZd6 | jd[td\d]d6 | jd^dStd_d | jd`dStdad | jdbdStdcd | jdddetdfd | jdgtddhd6 | jditddjd6 | jdkdltdmd | jdndotdpd | jdqdrtdsd | jdtd9td/dud= | jdvdwt	dxd | jdydzd{d9td|d | jd}d9td~d | jdd9tdd | jdd9tdd | jdd9tdd | jdddtddd | jdddtddd | jdd9tdd | jdtdd/dd | jddtg ddd | jdd"tdd | jddtdd | jddtdd | jddStdd | jddtg ddd | jddtdd | jdddtdd | jddtd/dd= | jddtd/dd= | jddtdd | jddtdd | jddtdd | jdt
ddd6 | jdd"td"dgdd | jddt
d/dd= | jddtdd | jddddƄ dd | jddtdd | jddddƄ dd | jddddƄ dd | jdt
ddd6 | jdt
ddd6 | jddtg dעdd | jdtdd.d6 | jdtdd.d6 | jdtdd.d6 | jdtdSd.d6 | jdtdd.d6 | jdtdd.d6 | jdt
ddd6 | jdt
dd.d6 | jddtg dעdd | jdtdd.d6 | jdtdd.d6 | jdtdd.d6 | jdtdd.d6 | jdtddd6 | jdtddd6 | jdtdSd.d6 | jdtddd6 | jdt
ddd6 | jdt
dd.d6 | jdt
dd.d6 | jdtddd6 | jdtddd6 | jdtdSd.d6 | jdtdd.d6 | S )zGet default arguments.NzRTrain an automatic speech recognition (ASR) model on one CPU, one or multiple GPUs)descriptionconfig_file_parser_classformatter_classz--configTzconfig file path)is_config_filehelpz	--config2zCsecond config file path that overwrites the settings in `--config`.z	--config3zRthird config file path that overwrites the settings in `--config` and `--config2`.z--ngpuz5Number of GPUs. If not given, use all visible devices)defaulttyper	   z	--use-ddpF
store_truezEnable process-based data parallel. --ngpu's GPUs will be used. If --ngpu is not given, this tries to identify how many GPUs can be used. But, if it fails, the application will abort. And, currently, single node multi GPUs job is only supported.)r
   actionr	   z--train-dtypefloat32)float16r   float64O0O1O2O3zData type for training (only pytorch backend). O0,O1,.. flags require apex. See https://nvidia.github.io/apex/amp.html#opt-levels)r
   choicesr	   z	--backendchainerpytorchzBackend library)r
   r   r   r	   z--outdirzOutput directory)r   requiredr	   z--debugmode   	Debugmodez--dict
Dictionary)r   r	   z--seedzRandom seedz
--debugdirzOutput directory for debugging)r   r	   z--resumez-r ?z!Resume the training from snapshot)r
   nargsr	   z--minibatchesz-Nz-1z&Process only N minibatches (for debug))r   r
   r	   z	--verbosez-Vr   zVerbose optionz--tensorboard-dirzTensorboard log dir path)r
   r   r   r	   z--report-interval-itersd   zReport interval iterationsz--save-interval-itersz!Save snapshot interval iterationsz--train-jsonz#Filename of train label data (json)z--valid-jsonz(Filename of validation label data (json)z--model-modulezCmodel defined module (default: espnet.nets.xxx_backend.e2e_asr:E2E)z
--num-encsz Number of encoders in the model.z
--ctc_typebuiltin)r    gtnctccudnnctcz-Type of CTC implementation to calculate loss.z
--mtlalphag      ?zKMultitask learning coefficient, alpha: alpha*ctc_loss + (1-alpha)*att_loss z--lsm-weightg        zLabel smoothing weightz--report-cerzCompute CER on development setz--report-werzCompute WER on development setz--nbestzOutput N-best hypothesesz--beam-size   z	Beam sizez	--penaltyzIncertion penaltyz--maxlenratiozInput length ratio to obtain max output length.
                        If maxlenratio=0.0 (default), it uses a end-detect function
                        to automatically find maximum hypothesis lengthsz--minlenratioz.Input length ratio to obtain min output lengthz--ctc-weightg333333?zCTC weight in joint decodingz--rnnlmzRNNLM model file to readz--rnnlm-confzRNNLM model config file to readz--lm-weightg?zRNNLM weight.z--sym-spacez<space>zSpace symbolz--sym-blank<blank>zBlank symbolz--sortagradzFHow many epochs to use sortagrad for. 0 = deactivated, -1 = all epochsz--batch-countautozKHow to count batch_size. The default (auto) will find how to count by args.z--batch-sizez--batch-seqsz-bz*Maximum seqs in a minibatch (0 to disable)z--batch-binsz*Maximum bins in a minibatch (0 to disable)z--batch-frames-inz2Maximum input frames in a minibatch (0 to disable)z--batch-frames-outz3Maximum output frames in a minibatch (0 to disable)z--batch-frames-inoutz9Maximum input+output frames in a minibatch (0 to disable)z--maxlen-inz--batch-seq-maxlen-ini   MLzPWhen --batch-count=seq, batch size is reduced if the input sequence length > ML.)r
   r   metavarr	   z--maxlen-outz--batch-seq-maxlen-out   zPWhen --batch-count=seq, batch size is reduced if the output sequence length > MLz--n-iter-processeszNumber of processes of iteratorz--preprocess-confz-The configuration file for the pre-processing)r   r
   r   r	   z--optadadelta)r)   adamnoam	Optimizerz--accum-gradzNumber of gradient accumurationz--epsg:0yE>zEpsilon constant for optimizerz--eps-decayg{Gz?zDecaying ratio of epsilonz--weight-decayzWeight decay ratioz--criterionacc)lossloss_eps_decay_onlyr-   z"Criterion to perform epsilon decayz--thresholdg-C6?zThreshold to stop iterationz--epochsz-e   zMaximum number of epochsz--early-stop-criterionzvalidation/main/accz=Value to monitor to trigger an early stopping of the trainingz
--patience   zINumber of epochs to wait without improvement before stopping the trainingz--grad-clip   zGradient norm threshold to clipz--num-save-attentionz*Number of samples of attention to be savedz--num-save-ctcz0Number of samples of CTC probability to be savedz--grad-noisezFThe flag to switch to use noise injection to gradients during trainingz--num-spkrs   z!Number of speakers in the speech.z--context-residualzHThe flag to switch to use context vector residual in the decoder networkz
--enc-initz,Pre-trained ASR model to initialize encoder.z--enc-init-modszenc.enc.c                        fdd  dD S )Nc                       g | ]
} d krt |qS r   str.0mods H/home/ubuntu/.local/lib/python3.10/site-packages/espnet/bin/asr_train.py
<listcomp>y      0get_parser.<locals>.<lambda>.<locals>.<listcomp>,splitr<   r>   r<   r?   <lambda>y      zget_parser.<locals>.<lambda>z<List of encoder modules to initialize, separated by a comma.z
--dec-initz6Pre-trained ASR, MT or LM model to initialize decoder.z--dec-init-modsz	att.,dec.c                    r4   )Nc                    r5   r6   r7   r9   r<   r>   r?   r@     rA   rB   rC   rD   r<   r>   r<   r?   rF     rG   z<List of decoder modules to initialize, separated by a comma.z--freeze-modsc                    r4   )Nc                    r5   r6   r7   r9   r<   r>   r?   r@     rA   rB   rC   rD   r<   r>   r<   r?   rF     rG   z0List of modules to freeze, separated by a comma.z--use-frontendz*The flag to switch to use frontend system.z	--use-wpezApply Weighted Prediction Errorz--wtypeblstmp)lstmblstmlstmprH   vgglstmp	vggblstmpvgglstmvggblstmgrubgrugrupbgrupvgggrupvggbgrupvgggruvggbgruzDType of encoder network architecture of the mask estimator for WPE. z	--wlayersz--wunitsi,  z--wprojsz--wdropout-ratez
--wpe-tapsz--wpe-delayz--use-dnn-mask-for-wpezGUse DNN to estimate the power spectrogram. This option is experimental.z--use-beamformerz--btypezJType of encoder network architecture of the mask estimator for Beamformer.z	--blayersz--bunitsz--bprojsz--badimi@  z--bnmaskz>Number of beamforming masks, default is 2 for [speech, noise].z--ref-channelzWThe reference channel used for beamformer. By default, the channel is estimated by DNN.z--bdropout-ratez--stats-filez,The stats file for the feature normalizationz--apply-uttmvnz2Apply utterance level mean variance normalization.z--uttmvn-norm-meansz--uttmvn-norm-varsz
--fbank-fsi>  z5The sample frequency used for the mel-fbank creation.z--n-melsP   z!The number of mel-frequency bins.z--fbank-fminz--fbank-fmax)configargparseArgumentParserYAMLConfigFileParserArgumentDefaultsHelpFormatteraddadd_argumentintr8   floatr   r   )parserr   r>   r>   r?   
get_parser   sT  		
rc   c                 C   s:   | dkrt jt jdd dS t jt jdd t d dS )z*Make logging setup with a given log level.r   z>%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s)levelformatzSkip DEBUG/INFO messagesN)loggingbasicConfigINFOWARNwarning)verboser>   r>   r?   setup_logging  s   
rl   c              	   C   s  t  }|| \}}|jdkr|jdkrtd|j d|jdkr0|jdv r0td|j ddd	lm} |j	d
u rP|j
dkrHd|j d }nd|j d }n|j	}||}|| || }||_	d|j	v rld|_d|j	v rtd|_t|_t|j |jd
u rtjd}|d
urt|d}n<td ztjddgtjtjd}	W n tjtfy   d}Y nw t|	j dd }n|jdkrtd |j}|j r|dkrtdt!d|  t!dtjdd  t!d|j"  t#"|j" t$j#"|j" |j%d
urEt&|j%d }
|
' }W d
   n	1 sw   Y  d!d" |D }|(dd# |)d$ d%|j	v rA|)d& ||_*nd
|_*t!d'|j  |j r}|j
dkry|jdkrdtd(dd)l+m,}m-} |d* |t.||j d
S td+|j
dkr|jdkrdd,l/m0} || d
S |jdkrdd,l1m0} || d
S td-|jdkrdd,l2m0} || d
S td.)/zRun the main training function.r   r   z/chainer backend does not support --train-dtype z.Use --dtype float32.r   )r   r   r   r   r   z--train-dtype z" does not support the CPU backend.)dynamic_importNr   zespnet.nets.z_backend.e2e_asr:E2Ez_backend.e2e_asr_mix:E2Echainer_backendpytorch_backendr   CUDA_VISIBLE_DEVICESrC   z CUDA_VISIBLE_DEVICES is not set.z
nvidia-smiz-L)stdoutstderr
zsThere are some bugs with multi-GPU processing in PyTorch 1.2+ (see https://github.com/pytorch/pytorch/issues/21108)zDDP requires at least 1 GPU.zngpu: zpython path = 
PYTHONPATHz(None)zrandom seed = %drbc                 S   s    g | ]}| d dd qS )zutf-8 r   )decoderE   )r:   entryr>   r>   r?   r@   n  s     zmain.<locals>.<listcomp>r$   z<eos>maskctcz<mask>z
backend = z"Chainer with DDP is not supported.)launchset_start_methodspawnz0Single speaker is only supported when using DDP.trainz'Only chainer and pytorch are supported.zOnly pytorch is supported.)3rc   parse_known_argsbackendtrain_dtypeNotImplementedErrorngpu
ValueErrorespnet.utils.dynamic_importrm   model_module	num_spkrsadd_arguments
parse_argsr   versionrl   rk   osenvirongetlenrE   rf   rj   
subprocessrunPIPECalledProcessErrorFileNotFoundErrorrr   rw   debuguse_ddpinfoseedrandomnpdictopen	readlinesinsertappend	char_list)espnet.distributed.pytorch_backend.launchrz   r{   $_reinitialize_logging_and_call_trainespnet.asr.chainer_backend.asrr~   espnet.asr.pytorch_backend.asr"espnet.asr.pytorch_backend.asr_mix)cmd_argsrb   args_rm   r   model_classcvdr   pf
dictionaryr   rz   r{   r~   r>   r>   r?   main  s   













r   c                 C   s"   ddl m} t| j ||  d S )Nr   r}   )r   r~   rl   rk   )r   r~   r>   r>   r?   r     s   
r   __main__r   )NT)__doc__rf   r   r   r   sysrZ   numpyr   espnetr   espnet.utils.cli_utilsr   espnet.utils.training.batchfyr   rc   rl   r   r   __name__argvr>   r>   r>   r?   <module>   s,   
   z 