o
    ॵiE                     @   s  d dl Z d dlmZ d dlmZmZmZ d dlmZm	Z	 d dl
Z
d dlZd dlmZ d dlmZ d dlmZ de	eee f fd	d
ZeG dd dZeG dd dZeG dd dZeddG dd deeeZdd Zdede	eeedf fddZdS )    N)deepcopy)	dataclassfieldfields)ListUnion)CliArgumentParser)Config)DEFAULT_DATASET_NAMESPACEvaluesc                 C   sZ   t | tr
| dn| }i }|pg D ]}t| dkrq|d\}}t|||< q|S )N,r   =)
isinstancestrsplitlenstripparse_value)r   pairs_paramskvkeyvalue r   U/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/trainers/training_args.pyset_flatten_value   s   r   c                   @   s   e Zd ZU edddidZeed< edddidZeed< edddidZeed	< eddd
idZ	eed< edddidZ
eed< edddidZeed< eeddidZeed< eeddidZeed< edddidZeed< dS )DatasetArgsNhelpzNThe dataset name used for training, can be an id in the datahub or a local dirdefaultmetadatatrain_dataset_namezOThe subset name used for evaluating, can be an id in the datahub or a local dirval_dataset_namez.The subset name used for training, can be Nonetrain_subset_namez0The subset name used for evaluating, can be Noneval_subset_namezThe split of train datasettrain_splitzThe split of val dataset	val_splitz'The dataset namespace used for trainingtrain_dataset_namespacez)The dataset namespace used for evaluatingval_dataset_namespacea  The json file to parse all datasets from, used in a complex dataset scenario,the json format should be like:
                    [
                        {
                            "dataset": {
                                # All args used in the MsDataset.load function
                                "dataset_name": "xxx",
                                ...
                            },
                            # All columns used, mapping the column names in each dataset in same names.
                            "column_mapping": {
                                "text1": "sequence1",
                                "text2": "sequence2",
                                "label": "label",
                            },
                            # float or str, float means to split the dataset into train/val,
                            # or just str(train/val)
                            "split": 0.8,
                        }
                    ]
                    dataset_json_file)__name__
__module____qualname__r   r!   r   __annotations__r"   r#   r$   r%   r&   r
   r'   r(   r)   r   r   r   r   r      sb   
 r   c                   @   sr   e Zd ZU edddddZeed< edddidZeed< eddd	idZeed
< edddddZ	eed< dS )	ModelArgsNzThe task code to be usedtaskr   cfg_noder   r   zA model id or model dirmodelzthe revision of modelmodel_revisionzJThe mode type, if load_model_config is False, user need to fill this fieldz
model.type
model_type)
r*   r+   r,   r   r/   r   r-   r2   r3   r4   r   r   r   r   r.   j   s.   
 r.   c                   @   s  e Zd ZU edddidZeed< edddd	dZeed
< edddd	dZeed< edddd	dZ	e
ed< edddd	dZe
ed< edddd	dZeed< edddd	dZeed< edddd	dZe
ed< eddd d	dZe
ed!< ed"d#d$d	dZeed%< ed&d'd(d	dZeed)< ed*d+d,d	dZeed-< ed.d/d0d	dZeed1< ed2d3d4d	dZeed5< ed6d7d8ed9dZeed:< ed6d;d<ed9dZeed=< ed>d?d@g dAdBdZeedC< edddDidZeedE< ed"dFdGdHdZeedI< ed>dJdKg dAdLdZeedM< edNdOdPdHdZeedQ< ed6dRdSdHdZeedT< ed>dUdVg dAdLdZ eedW< edNdXdYdHdZ!eedZ< edd[d\dHdZ"e
ed]< ed6d^d_dHdZ#eed`< edadbdcdHdZ$eedd< ed6dedfdHdZ%eedg< edNdhdidHdZ&eedj< eddkdldHdZ'e
edm< ed6dndodHdZ(eedp< ed6dqdrdHdZ)eeds< edtdudvdHdZ*e
edw< edxdydzdHdZ+eed{< eddkd|dHdZ,e
ed}< ed6dnd~dHdZ-eed< ed6dqddHdZ.eed< edtduddHdZ/e
ed< edxdyddHdZ0eed< d6S )	TrainArgs*   r   zThe random seedr   seed   z#train.dataloader.batch_size_per_gpuz:The `batch_size_per_gpu` argument for the train dataloader)r1   r   per_device_train_batch_sizer   z train.dataloader.workers_per_gpuz7The `workers_per_gpu` argument for the train dataloadertrain_data_workerFztrain.dataloader.shufflez/The `shuffle` argument for the train dataloadertrain_shuffleztrain.dataloader.drop_lastz1The `drop_last` argument for the train dataloadertrain_drop_lastz(evaluation.dataloader.batch_size_per_gpuz9The `batch_size_per_gpu` argument for the eval dataloaderper_device_eval_batch_sizez%evaluation.dataloader.workers_per_gpuz6The `workers_per_gpu` argument for the eval dataloadereval_data_workerzevaluation.dataloader.shufflez.The `shuffle` argument for the eval dataloadereval_shufflezevaluation.dataloader.drop_lastz0The `drop_last` argument for the eval dataloadereval_drop_last   ztrain.max_epochszThe training epochs
max_epochsz./train_targetztrain.work_dirz%The directory to save models and logswork_dirg-C6
?ztrain.optimizer.lrz"The learning rate of the optimizerlrLinearLRztrain.lr_scheduler.typezThe lr_scheduler type in torchlr_schedulerAdamWztrain.optimizer.typez+The optimizer type in PyTorch, like `AdamW`	optimizerNztrain.optimizerzThe optimizer params)r1   r   
cfg_setteroptimizer_paramsztrain.lr_schedulerzThe lr scheduler paramslr_scheduler_paramsby_epochz&train.lr_scheduler.options.lr_strategyzThe lr decay strategy)rL   by_stepno)r1   r   choiceslr_strategyzThe local rank
local_rankz+The interval of iter of logging informationztrain.logging.intervalr0   logging_intervalz5Eval strategy, can be `by_epoch` or `by_step` or `no`zevaluation.period.eval_strategy)r   r1   rO   eval_strategy   zEval intervalzevaluation.period.intervaleval_intervalzThe metric name for evaluationzevaluation.metricseval_metricsz>Checkpointing strategy, can be `by_epoch` or `by_step` or `no`z%train.checkpoint.period.save_strategysave_strategyz9The interval of epoch or iter of saving checkpoint periodz train.checkpoint.period.intervalsave_intervalz;Save the checkpoint(if it's the best) after the evaluation.ztrain.checkpoint.best.save_bestsave_best_checkpointz%The metric used to measure the model.z train.checkpoint.best.metric_keymetric_for_best_modelmaxzDThe rule to measure the model with the metric, can be `max` or `min`ztrain.checkpoint.best.rulemetric_rule_for_best_modelzBThe max number of checkpoints to keep, older ones will be deleted.z*train.checkpoint.period.max_checkpoint_nummax_checkpoint_numzGThe max number of best checkpoints to keep, worse ones will be deleted.z(train.checkpoint.best.max_checkpoint_nummax_checkpoint_num_bestz$Push to hub after each checkpointingz#train.checkpoint.period.push_to_hubpush_to_hubz<The repo id in modelhub, usually the format is "group/model"z#train.checkpoint.period.hub_repo_idrepo_idzYThe modelhub token, you can also set the token to the env variable `MODELSCOPE_API_TOKEN`z!train.checkpoint.period.hub_token	hub_tokenTzUpload to a private hubz#train.checkpoint.period.private_hubprivate_hubmasterzWhich branch to commit toz$train.checkpoint.period.hub_revisionhub_revisionz!train.checkpoint.best.push_to_hubpush_to_hub_bestz!train.checkpoint.best.hub_repo_idrepo_id_bestztrain.checkpoint.best.hub_tokenhub_token_bestz!train.checkpoint.best.private_hubprivate_hub_bestz"train.checkpoint.best.hub_revisionhub_revision_best)1r*   r+   r,   r   r7   intr-   r9   r:   r;   boolr<   r=   r>   r?   r@   rB   rC   r   rD   floatrF   rH   r   rJ   rK   rP   rQ   rR   rS   rU   rV   rW   rX   rY   rZ   r\   r]   r^   r_   r`   ra   rb   rd   re   rf   rg   rh   ri   r   r   r   r   r5      s  
 	r5   F)initc                   @   sJ   e Zd ZU edddidZeed< dd Zdd	d
ZdddZ	dd Z
dS )TrainingArgsFr   zeUse the configuration of the model, default will only use the parameters in the CLI and the dataclassr   use_model_configc                 K   sD   t | | _t| D ]}|j|v rt| |j||j  qi | _d S N)listkeysmanual_argsr   namesetattr_unknown_args)selfkwargsfr   r   r   __init__  s   

zTrainingArgs.__init__Nc           
      C   s   t | }||\}}dd |D }i }tdt|dD ]}t||d  ||| dd< qt|}|  j|j7  _| j	| t
| D ]\}}	|durZt| |rZt| ||	 qG| S )	zcConstruct a TrainingArg class by the parameters of CLI.

        Returns:
            Self
        c                 S   s    g | ]}|d vrd|vr|qS ))\
z--local-rank=r   ).0itemr   r   r   
<listcomp>  s
    z*TrainingArgs.parse_cli.<locals>.<listcomp>r      rT   - N)r   parse_known_argsranger   r   replacevarsrs   rv   updater   itemshasattrru   )
rw   parser_argsparserargsunknown_unknowni	args_dictr   r   r   r   r   	parse_cli  s    "zTrainingArgs.parse_clic              	   C   s   t  }t }|du r| j}t| D ]B}|jd}|jdp#dd }|durK|j| jv s0|sJt	|t
r8|g}|D ]}|||t| |ji q:qt| |j||j< q|| j ||fS )zyConvert the TrainingArgs to the `Config`

        Returns:
            The Config, and extra parameters in dict.
        Nr1   rI   c                 S      | S rp   r   xr   r   r   <lambda>      z(TrainingArgs.to_config.<locals>.<lambda>)r	   addictDictro   r   r    getrt   rs   r   r   merge_from_dictgetattrrv   )rw   ignore_default_configcfgr   ry   r1   rI   _noder   r   r   	to_config  s&   
zTrainingArgs.to_configc                 C   s(   t | }|D ]}|j|kr|  S qd S rp   )r   rt   )rw   r   _fieldsry   r   r   r   get_metadata  s   
zTrainingArgs.get_metadatarp   )r*   r+   r,   r   ro   rk   r-   rz   r   r   r   r   r   r   r   rn     s   
 

rn   c                    s  ddl m} g }g }t| d}t|}|D ]}|jdi |d  }|j}|d    fdd|D }	ddlm	}
 dd	lm
} dd
lm}  fdd|j D }i }|D ]}t|d |rn||d j||d < qY|d ||d < qY|
|}|jdd |	|d|d }|d }t|tr|dv sJ |dkr|| q|| qt|trd|  k rdk sJ  J |j|d}||d  ||d  qW d   n1 sw   Y  ddlm} ||||fS )aW  
    The filename format:
    [
        {
            "dataset": {
                "dataset_name": "xxx",
                ...
            },
            "column_mapping": {
                "text1": "sequence1",
                "text2": "sequence2",
                "label": "label",
            }
            "usage": 0.8,
        }
    ]
    r   )	MsDatasetrdatasetcolumn_mappingc                    s   g | ]}| vr|qS r   r   )r}   columnkeep_columnsr   r   r     s    z+build_dataset_from_file.<locals>.<listcomp>)Features)Value)
ClassLabelc                    s   g | ]
}|d   v r|qS )r   r   )r}   ry   r   r   r   r     s    rT   c                 S   r   rp   r   r   r   r   r   r   "  r   z)build_dataset_from_file.<locals>.<lambda>)remove_columnsfeaturesusage)trainvalr   )
train_sizetestN)concatenate_datasetsr   )
modelscoper   openjsonloadto_hf_datasetcolumn_namesrr   datasetsr   r   r   r   r   r   dtypemaprename_columnsr   appendrl   train_test_splitr   )filenamer   	train_seteval_setry   ds_jsondsr   all_columnsr   r   r   r   r   new_featuresr   ds_dictr   r   r   r   build_dataset_from_file  sZ   




&'r   r   returnc                 C   st   ddddd d d d}| |v r||  S d| v sd| v r$|  dd ddS td| r.t| S td| r8t| S | S )	NTF)TruetrueFalsefalseNonenonenull"'r   z^\d+$z4[+-]?(?=\d*[.eE])(?=\.?\d)\d*\.?\d*(?:[eE][+-]?\d+)?)r   rematchrj   rl   )r   	const_mapr   r   r   r   6  s&   	r   )r   copyr   dataclassesr   r   r   typingr   r   r   r   'modelscope.trainers.cli_argument_parserr   modelscope.utils.configr	   modelscope.utils.constantr
   r   r   r   r.   r5   rn   r   rl   rk   r   r   r   r   r   <module>   s,   O  #L"A