"""argparser configuration"""

import argparse
import json
import os

import deepspeed
import torch

from .utils import get_hostname


def add_model_config_args(parser):
    """Model arguments"""

    group = parser.add_argument_group('model', 'model configuration')

    group.add_argument('--transformer-xl', action='store_true',
                       help='use transformer-xl for training')
    group.add_argument('--pretrained-bert', action='store_true',
                       help='use a pretrained bert-large-uncased model instead '
                       'of initializing from scratch. See '
                       '--tokenizer-model-type to specify which pretrained '
                       'BERT model to use')
    group.add_argument('--encoder-decoder', action='store_true',
                       help='use the encoder-decoder architecture for blocklm')
    group.add_argument('--attention-dropout', type=float, default=0.1,
                       help='dropout probability for attention weights')
    group.add_argument('--num-attention-heads', type=int, default=16,
                       help='num of transformer attention heads')
    group.add_argument('--hidden-size', type=int, default=1024,
                       help='transformer hidden size')
    group.add_argument('--intermediate-size', type=int, default=None,
                       help='transformer embedding dimension for FFN; '
                       'set to 4*`--hidden-size` if it is None')
    group.add_argument('--num-layers', type=int, default=24,
                       help='num decoder layers')
    group.add_argument('--layernorm-epsilon', type=float, default=1e-5,
                       help='layer norm epsilon')
    group.add_argument('--hidden-dropout', type=float, default=0.1,
                       help='dropout probability for hidden state transformer')
    group.add_argument('--output-dropout', type=float, default=0.1,
                       help='dropout probability for pooled output')
    group.add_argument('--max-position-embeddings', type=int, default=512,
                       help='maximum number of position embeddings to use')
    group.add_argument('--vocab-size', type=int, default=30522,
                       help='vocab size to use for non-character-level '
                       'tokenization. This value will only be used when '
                       'creating a tokenizer')
    group.add_argument('--deep-init', action='store_true',
                       help='initialize bert model similar to gpt2 model. '
                       'Scales initialization of projection layers by a '
                       'factor of 1/sqrt(2N). Necessary to train bert models '
                       'larger than BERT-Large.')
    group.add_argument('--make-vocab-size-divisible-by', type=int, default=128,
                       help='Pad the vocab size to be divisible by this value. '
                       'This is added for computational efficiency reasons.')
    group.add_argument('--cpu-optimizer', action='store_true',
                       help='Run optimizer on CPU')
    group.add_argument('--cpu_torch_adam', action='store_true',
                       help='Use Torch Adam as optimizer on CPU.')

    return parser
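
# Usage sketch (illustrative; `2048` is an arbitrary value, not a default):
# every add_*_args helper in this module attaches one argument group to an
# existing parser and returns that parser, so the helpers can be chained:
#
#     parser = argparse.ArgumentParser(description='PyTorch BERT Model')
#     parser = add_model_config_args(parser)
#     args = parser.parse_args(['--hidden-size', '2048'])
#     assert args.hidden_size == 2048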
d |jdddd |jdddd |jdtddd |jdtddd |jdtddd |jdtddd |jdtdd | S )zMixed precision arguments.fp16zfp16 configurationsz--fp16r   zRun model in fp16 moder   z--fp32-embeddingzembedding in fp32z--fp32-layernormzlayer norm in fp32z--fp32-tokentypeszembedding token types in fp32z--fp32-allreducezall-reduce in fp32z--hysteresis   z#hysteresis for dynamic loss scalingr
   z--loss-scaleNzsStatic loss scaling, positive power of 2 values can improve fp16 convergence. If None, dynamicloss scaling is used.z--loss-scale-window  z.Window over which to raise/lower dynamic scalez--min-scaler   z)Minimum loss scale for dynamic loss scalez--attention-scale      ?r   r   )r   r   r   r   r   r   r   r   add_fp16_config_argst   sX   r!   c                 C   s  |  dd}|jdtddd |jdtdd	d |jd
tdd	d |jdtddd |jdddd |jdtddd |jdddd |jdtddd |jdtddd |jdtddd |jd td!d" |jd#td$d%d |jd&td'd(d |jd)td*d+d |jd,dd-d |jd.dd/d |jd0tdd1d |jd2td3g d4d5d6 |jd7td8d" |jd9td:d;d |jd<tdd=d |jd>dd?d |jd@tddAd |jdBddC |jdDtddEd |jdFtdGdHd |jdIddJd |jdKddLd |jdMtddNd |jdOddPd |jdQddRd |jdSddTd |jdUddVd |jdWddXd |jdYddZd |jd[d\d]d\d^gd_ |jd`dag dbdcdd |jdetddfd |jdgddhd |jdiddjd |jdktdld" |jdmtdld" |jdntdld" |jdotd!d" |jdptdqd" |jdrtdsd" |jdttd!d" |jdutd!d" |jdvddwd |jdxddyd |jdzdd{d |jd|dd}d |jd~td!d" |jdtd!d" |jdddd | S )zTraining arguments.trainztraining configurationsz--experiment-namezgpt-345Mz.The experiment name for summary and checkpointr
   z--batch-size   zData Loader batch sizez--gradient-accumulation-stepsr   z--weight-decayg{Gz?z.weight decay coefficient for L2 regularizationz--checkpoint-activationsr   zLcheckpoint activation to allow for training with larger models and sequencesr   z--checkpoint-num-layersz/chunk size (number of layers) for checkpointingz$--deepspeed-activation-checkpointingz,uses activation checkpointing from deepspeedz--epochsNz>Number of finetunning epochs. Zero results in evaluation only.z--clip-gradr   zgradient clippingz--train-itersr   z:total number of iterations to train over all training runsz--label-smoothing        r    z--log-intervald   zreport intervalz--summary-dir z"The directory to store the summaryz--seedi  zrandom seedz--reset-position-idsz0Reset posistion ids after end-of-document token.z--reset-attention-maskz7Reset self attention maske after end-of-document token.z--lr-decay-iterszUnumber of iterations to decay LR over, If None defaults to `--train-iters`*`--epochs`z--lr-decay-stylelinear)constantr'   cosineexponentialzlearning rate decay functionr   r   choicesr   z--lr-decay-ratior	   z--lrg-C6?zinitial learning ratez--warmupzNpercentage of data to warmup on (.01 = 1% of all training iters). Default 0.01z--switch-linearz'Switch to linear decay for cosine decayz--savez(Output directory to save checkpoints to.z--new-save-directoryr   z--save-epochznumber of epochs between savesz--save-intervali  z"number of iterations between savesz--no-save-optimzDo not save current optimizer.z--no-save-rngzDo not save current rng state.z--loadz2Path to a directory containing a model checkpoint.z--no-load-optimz.Do not load optimizer when loading checkpoint.z--no-load-rngz.Do not load rng state when loading checkpoint.z--no-load-lr-schedulerz1Do not load lr scheduler when loading checkpoint.z--no-deepspeed-loadz)Not use deepspeed when loading checkpointz
--finetunezLoad model for finetuning. Do not load optimizer or rng state from checkpoint and set iteration to 0. Assumed when loading a release checkpoint.z--resume-dataloaderzResume the dataloader when resuming training. Does not apply to tfrecords dataloader, try resumingwith a different seed in this case.z--distributed-backendncclzBwhich backend to use for distributed training. One of [gloo, nccl]gloo)r   r   r,   z
--DDP-impltorch)localr0   nonez4which DistributedDataParallel implementation to use.)r   r,   r   z--local_rankz+local rank passed from distributed launcherz
--block-lmz$whether use the BlockLM pre-trainingz--masked-lmz whether to use the mlm objectivez--bert-probg      ?z--gpt-infill-probz--gpt-min-ratioz--gap-sentence-probz--gap-sentence-ratiog333333?z--avg-block-length   z--short-seq-probz--single-span-probz--task-maskz3Use different mask for generation and blank fillingz--no-shuffle-blockz-not shuffle the blocks when filling the blankz--no-block-positionz9Use (rough) absolute positions instead of block positionsz--sentinel-tokenz:Use sentinel (mask) tokens to replace 2d position encodingz--block-mask-probz--context-mask-ratioz--random-positionz>Use random start position to cover all the position embeddingsr   r   strr   r   r   r   r   r   add_training_args   s  r6   c                 C   s   |  dd}|jdtddd |jdtdd	d |jd
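
# Example invocation (hypothetical): a BlockLM pre-training run would combine
# flags from this group, e.g.
#
#     python pretrain_glm.py --block-lm --task-mask --bert-prob 0.5 \
#         --experiment-name blocklm-large --batch-size 4 --lr 1e-4 \
#         --lr-decay-style cosine --warmup 0.04 --train-iters 250000
#
# `pretrain_glm.py` and all values are illustrative; only the flags themselves
# are defined in this module.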
tddd |jdtddd |jdtddd |jdtddd |jdtdd | S )zEvaluation arguments.
validationzvalidation configurationsz--eval-batch-sizeNzIData Loader batch size for evaluation datasets.Defaults to `--batch-size`r
   z--eval-itersr%   z=number of iterations to run for evaluationvalidation/test forz--eval-intervalr   z5interval between running evaluation on validation setz--eval-epochr   z2epoch between running evaluation on validation setz--eval-seq-lengthzMMaximum sequence length to process for evaluation. Defaults to `--seq-length`z--eval-max-preds-per-seqziMaximum number of predictions to use for evaluation. Defaults to math.ceil(`--eval-seq-length`*.15/10)*10z--overlapping-eval    r    )r   r   r   r   r   r   r   add_evaluation_argss  sR   r9   c                 C   s   |  dd}|jdtdd |jdtdd |jdtd	d |jd
tdd |jdtdd |jdtdd |jdtd	d |jdtd	d |jddd |jdtdd | S )zText generate arguments.zText generationconfigurationsz--temperaturer   r    z--top_pr$   z--top_kr   z--out-seq-length   z--num-beamsr   z--length-penaltyz--no-repeat-ngram-sizez--min-tgt-lengthz--select-topkr   r-   z--blank-maskratior	   r   r   r   r   r   add_text_generate_args  s   r<   c                 C   sx  |  dd}|jdtddd |jddd	d
 |jddd |jddddd |jddddd |jddddd |jdtddd |jdtddd |jdddd |jd d!d"d |jd#dd$d |jd%d&d'd |jd(dd)d
 |jd*dd |jd+tdd,d |jd-dd.d
 |jd/dd0d
 |jd1td2d3d |jd4tdd5d |jd6td7d8d |jd9td:g d;d<d= |jd>dd |jd?dtd@dA |jdBddCd
 |jdDtdEdFd |jdGtdHdId |jdJtddKd |jdLtdMdN |jdOddPd
 |jdQtddRd |jdStddTd |jdUtddVd |jdWdddXd |jdYtdMdZd |jd[tddN |jd\tddN | S )]z Train/valid/test data arguments.datazdata configurationsz--model-parallel-sizer   zsize of the model parallel.r
   z	--shuffler   zIShuffle data. Shuffling is deterministic based on seed and current epoch.r   z--filter-englishr-   z--train-data+Nz=Whitespace separated filenames or corpora names for training.)nargsr   r   z--valid-data*zFilename for validation data.z--test-datazFilename for testingz
--data-dirz#The data path to all the data filesz--input-data-sizes-filez	sizes.txtz,the filename containing all the shards sizesz--delim,z&delimiter used to parse csv data files)r   r   z
--text-keysentencez(key to use to extract text from json/csvz--eval-text-keyz<key to use to extract text from json/csv evaluation datasetsz--splitz1000,1,1zLcomma-separated list of proportions for training, validation, and test splitz--no-lazy-loaderz!whether to lazy read the data setz--half-lazy-loaderz--loader-scatterz)Number of scatters to use for dataloadersz--loose-jsonzlUse loose json (one json-formatted string per newline), instead of tight json (data file is one json string)z--presplit-sentenceszaDataset content consists of documents where each document consists of newline separated sentencesz--num-workersr   z(Number of workers to use for dataloadingz--tokenizer-model-typea  Model type to use for sentencepiece tokenization                        (one of ['bpe', 'char', 'unigram', 'word']) or                        bert vocab to use for BertWordPieceTokenizer (one of                        ['bert-large-uncased', 'bert-large-cased', etc.])z--tokenizer-pathztokenizer.modelz8path used to save/load sentencepiece tokenization modelsz--tokenizer-typeBertWordPieceTokenizer)CharacterLevelTokenizerSentencePieceTokenizerrC   GPT2BPETokenizerChineseSPTokenizerzwhat type of tokenizer to user+   z--no-pre-tokenizez--cache-dirz)Where to store pre-trained BERT downloads)r   r   r   z--use-tfrecordszgload `--train-data`, `--valid-data`, `--test-data` from BERT tf records instead of normal data pipelinez--seq-lengthr   z"Maximum sequence length to processz--mem-lengthr   zThe memory length to preservez--max-preds-per-seqzMaximum number of predictions to use per sequence.Defaults to math.ceil(`--seq-length`*.15/10)*10.MUST BE SPECIFIED IF `--use-tfrecords` is True.z--non-sentence-startr$   r    z--sample-one-documentz&only sample one document in one samplez--load-splitsz#The path to load split indices fromz--save-splitsz!The path to save split indices toz--save-test-datazThe path to save the test dataz--multi-task-dataz0Downsteam task names for multi-task pre-trainingz--multi-task-ratioz!Ratio for multi-task pre-trainingz--multi-seq-lengthz--multi-batch-size)r   r   r   r5   r   r   r   r   r   add_data_args  sb  	rH   c                 C   s  |  dd}|jdtdd |jdtdd d |jd	tg d
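
# Example invocation (hypothetical): selecting the data pipeline and tokenizer
# from this group, e.g.
#
#     --train-data corpus.json --split 949,50,1 --seq-length 512 \
#         --tokenizer-type BertWordPieceTokenizer \
#         --tokenizer-model-type bert-large-uncased
#
# `corpus.json` is an illustrative filename; --train-data accepts whitespace
# separated filenames or registered corpora names.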
ddd |jdddd |jdddd |jdtddd |jdtg ddd |jdtdd |jddd d |jd!tdd |jd"dd#d |jd$dd% |jd&dd'd |jd(td d |jd)dd*d |jd+td d |jd,td d |jd-td.d |jd/td0d |jd1td2d |jd3td4d5gd4d |jd6dd% |jd7dd% |jd8dd% |jd9dd:d |jd;tdd |jd<d=g d>d? |jd@ddAdB |jdCtd d |jdDtdd |jdEddAdB | S )FNfinetunezfinetune configurationsz--taskz
Task name.)r   r   z--load-pretrainedzLoad pretrained model)r   r   r   z--pool-token)startpadclsz-The token to pool the sequence representationrL   )r   r,   r   r   z--cloze-evalr   z"Evaluation dataset with cloze taskr   z--multi-tokenz$Use multi token for cloze evaluationz--segment-lengthr   z/The maximum segment length for cloze evaluationr
   z--loss-func)cross_entropyhinge
generativemixrM   )r   r,   r   z--block-lm-ratior$   r    z--adapetz.Use the decoupled cross entropy loss in AdaPETz--pattern-idz--fast-decodezRFast decode for multi-token cloze. Can only be used without checkpoint activation.z--few-supergluer-   z--eval-validz!Whether evaluate on the valid setz--validation-metricz--unidirectionalz$Use the left to right language modelz--src-seq-lengthz--tgt-seq-lengthz--adam-beta1g?z--adam-beta2g+?z
--adam-epsg:0yE>z--optimizeradam	adafactorz--wsc-negativez--overwritez--no-validationz--continuous-promptzUse continuous prompt for PETz--num-prompt-tokensz--prompt-funclstm)rS   mlpr2   )r   r,   z--freeze-transformerF)r   r   z--tune-prefix-layersz--prefix-promptz--prompt-initr4   r   r   r   r   add_finetune_config_args_  s   


def get_args():
    """Parse all the args."""

    parser = argparse.ArgumentParser(description='PyTorch BERT Model')
    parser = add_model_config_args(parser)
    parser = add_fp16_config_args(parser)
    parser = add_training_args(parser)
    parser = add_evaluation_args(parser)
    parser = add_text_generate_args(parser)
    parser = add_data_args(parser)
    parser = add_finetune_config_args(parser)

    # Include DeepSpeed configuration arguments
    parser = deepspeed.add_config_arguments(parser)

    # Parse an empty argument list: only the defaults defined above are
    # materialized; callers override fields on the returned namespace.
    args = parser.parse_args(args=[])

    if not args.train_data and not args.data_dir:
        print('WARNING: No training data specified')

    args.cuda = torch.cuda.is_available()

    args.rank = int(os.getenv('RANK', '0'))
    args.world_size = int(os.getenv('WORLD_SIZE', '1'))
    if hasattr(args, 'deepspeed_mpi') and args.deepspeed_mpi:
        mpi_define_env(args)
    elif os.getenv('OMPI_COMM_WORLD_LOCAL_RANK'):
        # We are using (OpenMPI) mpirun for launching distributed data
        # parallel processes.
        local_rank = int(os.getenv('OMPI_COMM_WORLD_LOCAL_RANK'))
        local_size = int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE'))

        # Possibly running with Slurm
        num_nodes = int(os.getenv('SLURM_JOB_NUM_NODES', '1'))
        nodeid = int(os.getenv('SLURM_NODEID', '0'))

        args.local_rank = local_rank
        args.rank = nodeid * local_size + local_rank
        args.world_size = num_nodes * local_size

    args.model_parallel_size = min(args.model_parallel_size, args.world_size)
    if args.rank == 0:
        print('using world size: {} and model-parallel size: {} '.format(
            args.world_size, args.model_parallel_size))

    args.dynamic_loss_scale = False
    if args.loss_scale is None:
        args.dynamic_loss_scale = True
        if args.rank == 0:
            print(' > using dynamic loss scaling')

    # The fp32_* flags only matter when --fp16 is set, so default them off.
    if not args.fp16:
        args.fp32_embedding = False
        args.fp32_tokentypes = False
        args.fp32_layernorm = False

    if hasattr(args, 'deepspeed') and args.deepspeed and \
            args.deepspeed_config is not None:
        with open(args.deepspeed_config, encoding='utf-8') as file:
            deepspeed_config = json.load(file)
        if 'train_micro_batch_size_per_gpu' in deepspeed_config:
            args.batch_size = deepspeed_config[
                'train_micro_batch_size_per_gpu']
        if 'gradient_accumulation_steps' in deepspeed_config:
            args.gradient_accumulation_steps = deepspeed_config[
                'gradient_accumulation_steps']
        else:
            args.gradient_accumulation_steps = 1
        if 'optimizer' in deepspeed_config:
            optimizer_params_config = deepspeed_config['optimizer'].get(
                'params', {})
            args.lr = optimizer_params_config.get('lr', args.lr)
            args.weight_decay = optimizer_params_config.get(
                'weight_decay', args.weight_decay)
    return args
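
# A minimal sketch (values illustrative) of the DeepSpeed JSON config that
# get_args() reads through `args.deepspeed_config`; only the keys shown below
# are consulted by get_args():
#
#     {
#         "train_micro_batch_size_per_gpu": 4,
#         "gradient_accumulation_steps": 1,
#         "optimizer": {
#             "type": "Adam",
#             "params": {"lr": 1e-4, "weight_decay": 0.01}
#         }
#     }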


def mpi_define_env(args):
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    world_size = comm.Get_size()

    master_addr = None
    if rank == 0:
        master_addr = get_hostname()
    master_addr = comm.bcast(master_addr, root=0)

    # Determine local rank by counting how many earlier ranks share this
    # processor name (assumes processor names are unique per node).
    proc_name = comm.Get_processor_name()
    all_procs = comm.allgather(proc_name)
    local_rank = sum([i == proc_name for i in all_procs[:rank]])

    os.environ['RANK'] = str(rank)
    os.environ['WORLD_SIZE'] = str(world_size)
    args.local_rank = local_rank
    args.world_size = world_size
    args.rank = rank
    os.environ['MASTER_ADDR'] = master_addr
    os.environ['MASTER_PORT'] = '29500'  # default torch.distributed port

    print(
        'Discovered MPI settings of world_rank={}, local_rank={}, '
        'world_size={}, master_addr={}, master_port={}'.format(
            os.environ['RANK'], args.local_rank, os.environ['WORLD_SIZE'],
            os.environ['MASTER_ADDR'], os.environ['MASTER_PORT']))
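
# Quick check (assumes the installed modelscope package, since this module
# relies on a relative import):
#
#     from modelscope.models.nlp.mglm.arguments import get_args
#     args = get_args()        # parse_args(args=[]) -> defaults only;
#                              # command-line flags are ignored here
#     print(args.hidden_size)  # -> 1024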