o
    TÃi?Ô  ã                
   @   s.  d dl Z d dlZd dlZd dlZd dlZd dlZddlmZ ddlT ddl	m
Z
mZ ddlmZ ddlmZ ddlT dd	lmZ dd
lmZmZmZ ddlT d dlmZ zd dlmZ W n eyg   dZY nw zd dlZdZW n ey„ Z zdZW Y dZ[ndZ[ww dZdZ dZ!ej"Z#G dd„ dƒZ$dS )é    Né   )Ú"dict_raise_error_on_duplicate_keys)Ú*)ÚZERO_OPTIMIZATIONÚZeroStageEnum)Úloggeré   )ÚDeepSpeedAutotuningConfig)ÚResourceManager)ÚGridSearchTunerÚRandomTunerÚModelBasedTuner)Úget_accelerator)ÚtabulateTFÚstageÚoffload_optimizerÚoffload_paramc                   @   s  e Zd ZdZdd„ Zdd„ Zdd„ Zdd	„ Zd
d„ Zdd„ Z	dd„ Z
dd„ Zdd„ Zdd„ Zdd„ Zdd„ Zdd„ Zdd„ Zdd„ Zd d!„ Zd"d#„ Zd$d%„ ZdFd'd(„Zd)d*„ Zd+d,„ Zd-d.„ Zd/d0„ Zd1d2„ Zd3d4„ Zd5d6„ Zd7d8„ Zd9d:„ Zd;d<„ Z d=d>„ Z!d?d@„ Z"dAdB„ Z#dCdD„ Z$dES )GÚ	Autotunerax  The DeepSpeed Autotuner automatically discovers the optimal DeepSpeed configuration that delivers good training speed. The Autotuner uses model information, system information, and heuristics to efficiently tune system knobs that affect compute and memory efficiencies, such as ZeRO optimization stages, micro-batch sizes, and many other ZeRO optimization configurations. It not only reduces the time and resources user spend on tuning, but also can discover configurations better than hand-tuned methods.
    Autotuning with DeepSpeed requires no code change from DeepSpeed users. Please refer to the README for usage details.
    c                 C   s  || _ d | _td usJ dƒ‚t d|› ¡ |  |j¡| _| jd us&J dƒ‚t| jƒ| _	| jt
 rOt| jt
  ¡ v r@| jt
 t= t| jt
  ¡ v rO| jt
 t= | j	j| _| j	jrgtj | j¡rgtj| jdd tj | j¡s”ztj| jdd t d| j› ¡ W n   t d| j› d	¡ td
ƒ Y | j	j| _| j	jr¬tj | j¡r¬tj| jdd tj | j¡sÙztj| jdd t d| j› ¡ W n   t d| j› d¡ td
ƒ Y |  |¡| _|  |¡\| _| _| j| jjksóJ dƒ‚| jt | jj!ƒksJ dƒ‚i | _"d | _#d | _$d | _%d S )Nz]Missing required package `tabulate`, please install with `pip install deepspeed[autotuning]`.zautotuning args=z'DeepSpeed configuration is not providedT)Úignore_errors)Úexist_okz*Created autotuning experiments directory: zFailed to create z`, please check exps_dir in the autotuning config file is accessible by all the nodes in the job.éÿÿÿÿz&Created autotuning results directory: zc, please check results_dir in the autotuning config file is accessible by all the nodes in the job.znnum_gpus in the autotuning configuration must not be less than the --num_gpus value in the train script if anyzpnum_nodes in the autotuning configuration must not be less than the --num_nodes value in the train script if any)&ÚargsÚselected_exp_dirr   r   ÚdebugÚ_get_user_configÚ	user_argsÚuser_configr	   Úautotuning_configÚ
AUTOTUNINGÚAUTOTUNING_EXPS_DIRÚkeysÚAUTOTUNING_RESULTS_DIRÚexps_dirÚ	overwriteÚosÚpathÚexistsÚshutilÚrmtreeÚmakedirsÚinfoÚerrorÚexitÚresults_dirÚ_get_resource_managerÚrmÚ_get_exp_resourcesÚexp_num_nodesÚexp_num_gpusÚnum_gpus_per_nodeÚlenÚnodesÚrecordsÚoptimal_cmdÚoptimal_ds_configÚmlflow_parent_id)Úselfr   Úactive_resources© r<   úR/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/autotuning/autotuner.pyÚ__init__/   s`   

ÿ

ÿ

ÿþ
zAutotuner.__init__c                 C   sð  |   ¡ }g }|r| ¡ D ]J\}}|sqg }| |¡ d}|tkr8d}| ¡ D ]\}}	|tkr4||	d 7 }q&|}n|d }| |¡ | |d ¡ | |d d ¡ | |¡ qt|g d¢dd}
t|
ƒ ttj 	| j
d¡d	td
}| |
¡ | ¡  t |¡ W d  ƒ n1 sŠw   Y  t|v rö|t \}}}|r¬t |d › d|d › d¡ nt d¡ tjt ¡ | j d}t d|› ¡ ttj 	| j
d¡dƒ}| d|› d| jjd › d¡ | ¡  W d  ƒ dS 1 sïw   Y  dS dS )z8Print the autotuning results in tabular format.
        r   r   r   Úname)Útuning_spaceÚnum_experimentsÚbest_metric_valÚbest_exp_nameÚpipe)ÚheadersÚtablefmtzsummary.txtÚw©Ú	bufferingNz9 is the optimal setup after tuning. The exp result is at Ú
result_dirÚ.zONo optimal setup is found. Please check that experiments were run successfully.)ÚsecondszTuning completed in Úaz

Tuning completed in z. Total number of experiments: )Úget_best_space_recordsÚitemsÚappendÚGLOBAL_TUNING_SPACEr   ÚprintÚopenr$   r%   Újoinr-   ÚBUFSIZEÚwriteÚflushÚfsyncr   r*   ÚdatetimeÚ	timedeltaÚtimeÚ
start_timer/   Úexperiment_count)r:   Úbest_space_recordsÚtabÚkeyÚvalÚrowÚnum_expsÚcntÚkÚvÚsummaryÚfdÚbest_exprB   Útotal_num_expsÚtuning_durationÚfr<   r<   r=   Úprint_tuning_resultsl   s^   
€
þ
ýÿ
ÿ
"üõzAutotuner.print_tuning_resultsc                 C   sÀ   d}d|v r|  d¡}d||d  v sJ dƒ‚||d  }nd|v r5|  d¡}d||d  v r5||d  }t d|› ¡ |dur^tj |¡sNJ d |¡ƒ‚tj |¡r^tj	t
|d	ƒtd
S dS )züGet DeepSpeed configuration from the user arguments passed to the launcher.

        Args:
            user_args ([list]): user arguments passed to the DeepSpeed launcher

        Returns:
            [dict]: DeepSpeed configuration dictionary
        Nz--deepspeed_configú.jsonr   zNDeepSpeed --deepspeed_config requires a json file to specify the configurationz--deepspeedzuser_config_file = z8DeepSpeed configuration file: {} is not an existing fileÚr)Úobject_pairs_hook)Úindexr   r   r$   r%   ÚisfileÚformatr&   ÚjsonÚloadrS   r   )r:   r   Úuser_config_fileÚidxr<   r<   r=   r   ž   s*   	
ÿÿ
ÿzAutotuner._get_user_configc                 C   sp   t  d|› ¡ g }d}| ¡ D ]\}}| |¡ tt|ƒ|ƒ}q|dks)J dƒ‚t| j||| j| j	| j
jdS )a„  Initialize and return a resource manager

        Args:
            active_resources ([dict]): A dictionary of hostname and its slots (GPUs), e.g. {"worker-0": "0,1,2,3,4,5,6,7,8"}

        Raises:
            RuntimeError: raises the error if no GPU is available

        Returns:
            [ResourceManager]: A resource manager that schedules and runs autotuning experiments.
        zactive_resources = éd   r   zno gpu is available)r   Úhostsr3   r-   r"   Úarg_mappings)r   r*   rO   rP   Úminr4   r
   r   r-   r"   r   rz   )r:   r;   ry   Úngpus_per_nodeÚhostnameÚslotsr<   r<   r=   r.   ¼   s   
ûzAutotuner._get_resource_managerc                 C   sF   |j dkr	|j }nt| jjƒ}|jdkr|j}||fS | jj}||fS )zïGet resource requirement for each autotuning experiment

        Args:
            args (dict): user args

        Returns:
            num_nodes, num_gpus: the number of gpus and number of nodes used in the autotuning experiments
        r   )Ú	num_nodesr4   r/   r5   Únum_gpusr3   )r:   r   r   r€   r<   r<   r=   r0   Ù   s   
	
þzAutotuner._get_exp_resourcesc                 C   ó   | j jS ©N)r   Úmetric©r:   r<   r<   r=   rƒ   î   ó   zAutotuner.metricc                 C   r   r‚   )r   Úfastr„   r<   r<   r=   Úfast_enabledñ   r…   zAutotuner.fast_enabledc                 C   r   r‚   )r   Úmax_train_batch_sizer„   r<   r<   r=   rˆ   ô   r…   zAutotuner.max_train_batch_sizec                 C   r   r‚   )r   Úmp_sizer„   r<   r<   r=   r‰   ÷   r…   zAutotuner.mp_sizec                 C   sF   |   ¡ r|   ¡ dkr|   ¡ |  ¡  | j| j  }t| jj|ƒS | jjS ©Nr   )rˆ   r‰   r2   r1   r{   r   Ú"max_train_micro_batch_size_per_gpu)r:   Úmax_train_micro_batch_sizer<   r<   r=   r‹   ú   s   ÿ
ÿz,Autotuner.max_train_micro_batch_size_per_gpuc                 C   r   r‚   )r   Ú"min_train_micro_batch_size_per_gpur„   r<   r<   r=   r     r…   z,Autotuner.min_train_micro_batch_size_per_gpuc                 C   r   r‚   )r   Únum_tuning_micro_batch_sizesr„   r<   r<   r=   rŽ     r…   z&Autotuner.num_tuning_micro_batch_sizesc                 C   s$   t | j ¡ v r| jt   tt¡S dS )NF)ÚFP16r   r    ÚgetÚFP16_ENABLEDÚFP16_ENABLED_DEFAULTr„   r<   r<   r=   Úfp16_enabled	  s   zAutotuner.fp16_enabledc                 C   s
   t ƒ  ¡ S r‚   )r   Útotal_memoryr„   r<   r<   r=   Úget_gpu_memory_info  s   
zAutotuner.get_gpu_memory_infoc                 C   ó"   | j rd| j v r| j d S d S d S )NÚactivation_mem_per_gpu©Ú
model_infor„   r<   r<   r=   Úget_activation_memory_per_gpu  ó   
ÿz'Autotuner.get_activation_memory_per_gpuc           	      C   s¢   |   ¡ }| j| j }|  ¡ }|sdS ||rdnd }||rdnd }||r'dnd }|tjkr3|| }|tjkr<|| }|tjkrE|| }|| | |  ¡  }|S )Nr   r   é   é   é   )	Úget_model_num_paramsr1   r2   r“   r   Úoptimizer_statesÚ	gradientsÚweightsr‰   )	r:   Ú
zero_stageÚ
num_paramsÚ
total_gpusr“   Ú
params_memÚgradients_memÚoptimizer_memÚmem_per_gpur<   r<   r=   Ú)get_instantiation_memory_required_per_gpu  s    


z3Autotuner.get_instantiation_memory_required_per_gpuc                 C   s\  g }|  ti ¡}|  tt¡}i }|dkr!t}t t|dƒ¡}d}n\|dkr2t}t t|dƒ¡}d}nK|dkrCt	}t t|dƒ¡}d}n:|dkr{t
}t t|dƒ¡}| j}	|	rxd	|	v rx|	d	 }
|
|
 |t d
< d|
 |
 |t d< d|
 |t d< d}n|S t|| jttgƒ t dt |¡› ¡ t|dgd}t|ƒ}t d|› ¡ t dt|ƒ› ¡ t|ƒ}t dt|ƒ› ¡ |D ]m}t |¡}t||ƒ |  td¡}|rðt|vrát|t v rá|t t= t|vrðt|t v rð|t t= |t }|| }||t< || | j | j |  ¡  |t< i }t |||ƒ}||d< ||t!< | j|d< | j|d< | "|¡ q¾|S )a´  Generates a list of autotuning experiments given a tuning_space.
            The corresponding parameter values are replaced by user-defined values in the DeepSpeed configuration file.
        Args:
            tuning_space ([dict]): A DeepSpeed configuration dictionary where a value can be a list (called a tuning parameter). For example,
                {
                    "zero_optimization": {
                        "stage": 1,
                        "reduce_bucket_size": [5e7,
                                            5e8,
                                            1e9],
                        "allgather_bucket_size": [5e7,
                                                5e8,
                                                1e9],
                    }
                }
                reduce_bucket_size and allgather_bucket_size are the tuning parameters in this tuning space.
        Returns:
            [list]: a list of experiments generated by taking combinations of values of the tuning space. The above tuning space generates 3*3 = 9 experiments if the user DeepSpeed configuration file does not overwrite the two tuning parameters or define more tuning parameters.
        r   ro   Úz0_r   Úz1_r   Úz2_é   Úhidden_sizeÚreduce_bucket_sizegÍÌÌÌÌÌì?Ústage3_prefetch_bucket_sizeé
   Ú"stage3_param_persistence_thresholdÚz3_ztuning_space = Ú	optimizer)Úignore_keysztuning_keys = zbefore pruning total configs = zafter pruning total configs = Nr?   r€   r   )#r   r   ÚZERO_OPTIMIZATION_STAGEÚZERO_OPTIMIZATION_STAGE_DEFAULTÚDEFAULT_TEMPLATE_PATH_ZERO_0Úhjsonru   rS   ÚDEFAULT_TEMPLATE_PATH_ZERO_1ÚDEFAULT_TEMPLATE_PATH_ZERO_2ÚDEFAULT_TEMPLATE_PATH_ZERO_3r™   Úreplace_dictr   ÚTRAIN_MICRO_BATCH_SIZE_PER_GPUr   r   rt   ÚdumpsÚget_all_configsÚget_tuning_keysr4   Úprune_configsÚcopyÚdeepcopyÚOFFLOAD_OPTIMIZERÚOFFLOAD_PARAMÚGRADIENT_ACCUMULATION_STEPSr2   r1   r‰   ÚTRAIN_BATCH_SIZEÚcanonical_nameÚ	DS_CONFIGrP   )r:   r@   Úmax_train_batch_size_per_gpuÚexpsÚconfig_zeror   Útemplate_configÚtemplate_pathÚprefixr™   ÚhsÚall_configsÚtuning_keysÚpruned_listÚconfigÚ
exp_configÚmbsÚgasÚexpÚexp_namer<   r<   r=   Ú_generate_experiments0  s|   



ÿÿÿ

zAutotuner._generate_experimentsc                 C   sö  t rtjd | _tj| jd t ¡ | _|  ¡ rt	 
d¡ |  ¡ }|r'|| _ndS t	 
dt|  ¡ ƒ› d¡ |  ¡ | _t	 
dt| jdd	› d
¡ |  ¡ | _t	 
dt| jdd	› d¡ | j ti ¡ td¡}t|tƒso|gn|}t	 
d|› d
¡ d}d}d}|  tj¡| j }| j|krÄd|v s—tj|v rÃt	 
dt|dd	› d¡ |  t¡\}}	}
|	|kr¶|	}|}|
}t rÃt d|   ¡ › |
¡ nt	 
dtj› dt|dd	› d¡ |  tj!¡| j }| j|kr!d|v sîtj!|v r t	 
dt|dd	› d¡ | jt"|||d\}}	}
|	|kr|	}|}|
}t r t d|   ¡ › |
¡ nt	 
dtj!› dt|dd	› d¡ |  tj#¡| j }| j|krd|v sLtj#|v r~t	 
dt|dd	› d¡ | jt$|||d\}}	}
|	|krp|	}|}|
}t r~t d|   ¡ › |
¡ nt	 
dtj#› dt|dd	› d¡ |  tj%¡| j }| j|krÒd|v sªtj%|v rÑt	 
dt|dd	› d¡ | jt&|||d\}}}
t rÑt d|   ¡ › |
¡ nt	 
d|  ¡ › d t|dd	› d!tj%› d"t| jƒ› d#	¡ dS t rùt '¡  dS dS )$z¢ Tunes Zero stages, micro batch size per GPU, and other Zero configurations. Performance metrics of different tuning spaces are recorded in self.records.
        ÚMLFLOW_RUN_ID)Úrun_idz3Fast mode is enabled. Tuning micro batch size only.NzThe model has z parameters.z Memory per GPU in the system is ÚB)ÚpostfixrK   zThe model requires at least z* activation memory for micro batch size 1.r   zUser-defined zero stages are Úallz@The model might be runable with ZERO 0 (which requires at least zT memory with mbs = 1), adding DEFAULT_TUNING_SPACE_ZERO_0 to the global tuning spaceÚz0z)The model is not runable with ZERO stage z (which requires at least z memory with mbs = 1)z@The model might be runable with ZERO 1 (which requires at least zG memory), adding DEFAULT_TUNING_SPACE_ZERO_1 to the global tuning space)Úprev_max_mbsÚprev_best_mbsÚprev_best_metric_valÚz1z@The model might be runable with ZERO 2 (which requires at least zG memory), adding DEFAULT_TUNING_SPACE_ZERO_2 to the global tuning spaceÚz2z@The model might be runable with ZERO 3 (which requires at least zG memory), adding DEFAULT_TUNING_SPACE_ZERO_3 to the global tuning spaceÚz3z" parameters and requires at least z* memory per GPU with DeepSpeed Zero stage z+ optimization. Memory per GPU in system is z. No tuning is performed.)(Ú
has_mlflowr$   Úenvironr9   ÚmlflowÚ	start_runr[   r\   r‡   r   r*   Úmodel_info_profile_runr™   Únumber_to_stringrŸ   r•   Úgpu_memÚmemory_to_stringrš   Úactivation_memr   r   r   r·   Ú
isinstanceÚlistrª   r   ÚdisabledÚ
tune_spaceÚDEFAULT_TUNING_SPACE_ZERO_0Ú
log_metricrƒ   r    ÚDEFAULT_TUNING_SPACE_ZERO_1r¡   ÚDEFAULT_TUNING_SPACE_ZERO_2r¢   ÚDEFAULT_TUNING_SPACE_ZERO_3Úend_run)r:   r™   r   Úuser_zero_stagesrØ   Úmax_mbsÚ
metric_valÚrequired_gpu_memÚnext_max_mbsÚnext_mbsÚnext_metric_valÚ_r<   r<   r=   Útune”  sÞ   



ÿ
ÿ€ÿÿÿÿý
€ÿÿÿÿý
€ÿÿý€0ÿÿzAutotuner.tuner   c                 C   s–  |  ti ¡}|  td ¡}tt|ƒ }g }d}	d}
t| j|  |¡ ƒ| j }t	 
d|› d|› ¡ ||k rAt	 
d|› d|› ¡ dS t| jv rkt| jt tƒrkdd	„ | jt D ƒ}|  ¡ }t|ƒ}t|ƒ}|| }	d
}
n&|  |||¡\}}||k r…t	 
d|› d|› ¡ dS | j|||  ¡ d\}}	t	 
d|› d|	› ¡ |s¨t	 
d|› ¡ dS |  ||	|||
¡}|  |¡}|r¼|d nd}|rÈ|d t t nd}t	 
d|› d|d d › ¡ |  ¡ sá|dkrît	 
d|› ¡ |||fS |dkr
||ksü||k r
t	 
d|› d¡ |||fS ||t< t|t|ƒdt|ƒ d d
d}t	 
d|› ¡ t	 
d|› ¡ |  ||	¡}t	 
d| jj› ¡ | jjtkrPt|| j |  !¡ |ƒ}n| jjt"krat#|| j |  !¡ ƒ}n	t$|| j |  !¡ ƒ}t%| j j&ƒ| j j' | j(| j)  }|j*|| jj+| jj,d}|j-}|j.}|r—|  /||||¡ |  |¡}|r£|d nd}||krº|}|r·|d t t nd}n|}|}t	 
d|› ¡ |||fS )Nr   FzStart tuning for space z$, calculated_max_micro_batch_size = zNo need to tune Zero stage z. End tuning for space ©r   r   r   c                 S   s   g | ]	}t |tƒr|‘qS r<   )rò   Úint)Ú.0Úsr<   r<   r=   Ú
<listcomp>"  s
    
ÿÿz(Autotuner.tune_space.<locals>.<listcomp>T)rŽ   ztuning_micro_batch_sizes = z!, max_train_batch_size_per_gpu = zEnd tuning for space r   zfast_best_mbs = z	, name = r?   zEnd tuning for space: z6. No need to tune other Zero configuration parameters.Úzr  )rÔ   rÑ   Úomit_valzTuning space is zTuning space name is zTuner type is )Úsample_sizeÚn_trialsÚearly_stoppingr   )0r   r   r·   ÚTUNING_MICRO_BATCH_SIZE_PREFIXÚstrr  rï   rª   rñ   r   r*   r¿   r   rò   ró   Úget_gas_from_user_configr{   ÚmaxÚget_min_max_micro_batch_sizeÚ get_tuning_micro_batch_size_listrŽ   Úrun_tuning_micro_batch_sizesÚget_best_space_recordrË   r‡   rÊ   rÂ   rÜ   r   Ú
tuner_typeÚAUTOTUNING_TUNER_MODELBASEDr   r/   rƒ   ÚAUTOTUNING_TUNER_RANDOMr   r   r4   r5   r3   r2   r1   r  Útuner_num_trialsÚtuner_early_stoppingri   rB   Úupdate_records)r:   r@   rã   rä   rå   rÎ   r   Útuning_space_nameÚtuning_micro_batch_sizesrÌ   Ú$tuning_micro_batch_sizes_overwrittenÚcalculated_max_micro_batch_sizerÙ   Úmin_micro_batch_sizeÚmax_micro_batch_sizeÚfast_best_recordÚfast_best_metric_valÚfast_best_mbsrÍ   Útr  rc   rÚ   rþ   Úfull_best_recordÚfull_best_metric_valrB   Úbest_mbsr<   r<   r=   rõ     sÀ   ÿÿÿ
ÿÿÿ
ýÿý



ÿ
ý þ


zAutotuner.tune_spacec           
      C   s   || j vrdS | j | }t|dd„ d}d }d}|D ])\}}}|r;||k r* |}	|	S ||kr;|| | tk r; |}	|	S |}|t t }q|}	|	S )Nr   c                 S   s   | d t  t S rŠ   )rË   r¿   )Úxr<   r<   r=   Ú<lambda>„  s    z+Autotuner.get_plateau_mbs.<locals>.<lambda>©r`   )r6   ÚsortedÚMETRIC_PERCENT_DIFF_CONSTrË   r¿   )
r:   r  Úspace_recordsÚsorted_space_recordsÚprev_metric_valÚprev_micro_batch_sizerÚ   rþ   r  Úplateau_mbsr<   r<   r=   Úget_plateau_mbs€  s*   

úýzAutotuner.get_plateau_mbsc                 C   r–   )Nr¤   r˜   r„   r<   r<   r=   rŸ   “  r›   zAutotuner.get_model_num_paramsc                 C   sš  t  d¡ | jj}|rt|v r|S t | j¡}t|t	ƒ t
j | jdd¡}d|ddidœ|t< i }d}||d< ||t< | j|d< | j|d	< | jj|d
< t
j | j|› d¡}t|dtd}t ||¡ | ¡  t
 |¡ W d  ƒ n1 sww   Y  | j |g¡ | j ¡  | jj ¡ D ]\}\}	}
| j  ¡  |
r¦t  !d|
› ¡  dS qŽt
j "|¡rËt|dƒ}t# $|¡}|W  d  ƒ S 1 sÄw   Y  dS dS )a‹  Does a model information profiling experiment that collects the number of model parameters and activation memory.            The experiment produces a "profile_model_info" folder under self.results_dir.
        Returns:
            [dict]: a model information dictionary, e.g., {"num_params": 335144976, "trainable_num_params": 335144976, "activation_mem_per_gpu": 324358144, "rank": 0}
        z Starting model info profile run.Úprofile_model_infozmodel_info.jsonTÚprofile)ÚenabledÚmodel_info_pathr™   r?   r€   r   Úhostfilern   rG   rH   Nz6The model is not runnable with DeepSpeed with error = ro   )%r   r*   r   r™   ÚMODEL_INFO_NUM_PARAMSrÄ   rÅ   r   r¾   ÚDEFAULT_MIN_MEM_CONFIGr$   r%   rT   r-   r   rË   r2   r1   r   r9  r"   rS   rU   rt   ÚdumprW   rX   r/   Úschedule_experimentsÚrunÚfinished_experimentsrO   Úclearr+   r&   rº   ru   )r:   r™   Ú	ds_configr8  r×   rÛ   Úexp_pathrh   Úexp_idÚexp_jsonÚerrrl   r<   r<   r=   rí   —  sF   



ý

þ
$þÿz Autotuner.model_info_profile_runc                 C   s:   || j vr|||fg| j |< d S | j |  |||f¡ d S r‚   )r6   rP   )r:   Ú
space_namerÚ   rþ   rc   r<   r<   r=   r  Ä  s   
zAutotuner.update_recordsc                 C   sf   || j vrd S | j | }d }d}|D ]\}}}||7 }|d u s%||d kr)||f}q|r1||f }|S )Nr   r   )r6   )r:   rF  r/  Úbest_space_recordÚspace_num_expsrÚ   rþ   rc   r<   r<   r=   r  Ê  s   

€
zAutotuner.get_best_space_recordc                 C   sZ   i }d }| j  ¡ D ]\}}|  |¡}|r$|||< |r"|d |d kr$|}q	|r+||t< |S ©Nr   )r6   rO   r  rQ   )r:   r^   Úglobal_best_recordrF  r/  rG  r<   r<   r=   rN   Ø  s   
€z Autotuner.get_best_space_recordsc              	   C   sB  |sJ dƒ‚|  ¡  |d }d}t| jƒ}t|i|t< tt|ƒ }	g }
|D ]|}||t< || }||t< || | j	 | j
 |  ¡  |t< |	d t|ƒ d t|ƒ }i }||d< ||t< | j	|d< | j
|d< | jj|d	< tj | j|› d
¡}t|dtd}t ||¡ | ¡  t |¡ W d   ƒ n1 s—w   Y  |
 |¡ q%| j |
¡ | j ¡  | jj ¡ D ]Ž\}\}}|r2|t t  t! }tj "|¡r)t|dƒO}t# $|¡}||  %¡  }|  &|	||d¡ ||t t krî|}t'rtj( )d¡ t*j+d|d d |D ]}t* ,||| ¡ qt* -¡  | j.tj(d< W d   ƒ n	1 s#w   Y  q³|  &|	|dd¡ q³|t t }t/ 0d|› d¡ q³| j 1¡  |rL|S t2|ƒdkrY|d d n|}|}|}|| d }|dkrld}t3t4|||ƒƒD ]¡}||t< || }||t< || | j	 | j
 |  ¡  |t< |	d t|ƒ d t|ƒ }|  5||¡\}}|rt|dƒ;}t# $|¡}||  %¡  }t'rätj( )d¡ t*j+d|d |D ]}t* ,||| ¡ qÎt* -¡  | j.tj(d< W d   ƒ n	1 sïw   Y  |  &|	||d¡ ||dt6  kr|}|}qt n	|  &|	|dd¡  ||kr||d< |S )Nz)the tuning micro batch size list is emptyr   r   Ú_gasÚ_tmbspgr?   r€   r   r9  rn   rG   rH   ro   r   rÝ   T)ÚnestedÚrun_namezmicro batch size = z was not run successfullyéþÿÿÿr®   )7ÚsortÚget_first_configr   r·   r   r  r  r¿   rÈ   r2   r1   r‰   rÉ   rË   r   r9  r$   r%   rT   r"   rS   rU   rt   r<  rW   rX   rP   r/   r=  r>  r?  rO   r   ÚAUTOTUNING_METRIC_PATHr&   rº   ru   rƒ   r  ré   rê   Úpoprë   rì   r÷   rû   r9   r   r*   r@  r4   ÚreversedÚrangeÚrun_ds_configr.  )r:   r  rÌ   r!  r   r  r"  Úmax_micro_batch_size_metric_valrA  r  Ú	exp_pathsrØ   rÙ   rÛ   r×   rB  rh   rC  rÚ   rE  Úmetric_filerl   Úresultsrþ   rƒ   Ú"min_micro_batch_size_with_same_gasrå   rä   Ústrider<   r<   r=   r  å  sØ   
ÿÿÿ

ý

€ô€
ÿÿÿ
ÿÿÿ
€÷

z&Autotuner.run_tuning_micro_batch_sizesc                    s(  ˆ |krdS g }t t|ƒ }t| jƒ}t|i|t< |  ¡ }||t< ˆ dk rt| jv r;t	| jt t
ƒr;t
| jt ƒ}n|  t¡}	|	rGt
|	ƒ}nd}|dksQJ dƒ‚||t< ||t< || | j | j |  ¡  |t< |d t|ƒ d t|ƒ }
|  ||
¡\}}|r‘|  |||d¡ | |¡ |‰ n¹|  ||dd¡ t d|› d¡ |  ¡ |krªdS |  ¡ }||t< ||t< || | j | j |  ¡  |t< |d t|ƒ d t|ƒ }
|  ||
¡\}}|sñ|  ||dd¡ t d	|› d
¡ dS |  |||d¡ |‰ | |¡ nIˆ |t< ||t< ˆ | | j | j |  ¡  |t< |d t|ƒ d tˆ ƒ }
|  ||
¡\}}|r@|  |||d¡ | ˆ ¡ n
|  ||dd¡ dS t||  ¡ ƒ}t d| ¡|t
d| ƒfD ]h}||  ¡ krjq_||v ruˆ |f  S ||t< || | j | j |  ¡  |t< |d t|ƒ d t|ƒ }
|  ||
¡\}}|r¾t d|› d¡ |  |||d¡ | |¡ ˆ |f  S |  ||dd¡ q_|| jv rÓ| j| ng ‰ˆrîtttˆƒƒ‡ ‡fdd„d}ˆ| d }nd }ˆ }|}||kr€t
|| d ƒ}t d|› d|› d|› ¡ ||vrw||t< || | j | j |  ¡  |t< |d t|ƒ d t|ƒ }
|  ||
¡\}}|rj|d }|  |||d¡ | |¡ |rg|| | tk rgt d|› ¡ n|}n|  ||dd¡ |d }n|d }||ksù|d }t dˆ › d|› d¡ ˆ |fS )N)r   r   r   r   z4The micro batch size per GPU must be greater than 0.rK  rL  z(User-specified micro batch size per GPU z does not runz#min_train_micro_batch_size_per_gpu z is not runnable.gÍÌÌÌÌÌð?gffffffî?zmbs = z is found as max mbsc                    s   t ˆ|  d t t ˆ  ƒS rŠ   )ÚabsrË   r¿   )Úi©r!  r/  r<   r=   r+  ±  s    ÿz8Autotuner.get_min_max_micro_batch_size.<locals>.<lambda>r,  r   ztrying mbs = z, low = z	, high = zperformance plateaus at mbs = úmin_micro_batch_size = ú, max_micro_batch_size = rK   )r  r  rQ  r   r·   r   r  rÈ   r¿   rò   r  Úget_val_from_user_argsr2   r1   r‰   rÉ   rV  r  rP   r   r*   r   r{   r‹   ÚmathÚceilr6   rU  r4   r   r.  )r:   r   r!  r   Úused_micro_batch_sizesr  rA  rÙ   rØ   ra   rÛ   rÚ   rþ   r"  Úprev_idxr1  ÚlowÚhighÚmidr<   r_  r=   r  S  s  


ÿ

ÿÿÿ
ÿÿÿÿÿÿ 
ÿÿÿ
ÿ

ÿÿÿ
ÿÿ

êz&Autotuner.get_min_max_micro_batch_sizec                 C   sx   d}t | jv r2| jt  }t|tƒr|}n|dkr$|  t ¡}|r#t|ƒ}nt|tƒr2t dt › d¡ |dks:J dƒ‚|S )Nr   ÚautozSpecifying a list of z+ to tune is not supported. 1 would be used.r   z-Gradient accumulation steps must be positive.)rÈ   r   rò   r  rb  ró   r   r*   )r:   rÙ   Úgas_in_configra   r<   r<   r=   r  ×  s    



€

ÿz"Autotuner.get_gas_from_user_configc                 C   sV   | j j}| jj}|r)||v r)|| }||v r)| |¡}||d   ¡ r)||d  S d S rI  )r   rz   r   r   rq   Ú	isnumeric)r:   Úds_namerz   r   Úarg_namerw   r<   r<   r=   rb  ç  s   
z Autotuner.get_val_from_user_argsc           
      C   sú   |dks|dkrt  d|› d|› ¡ g dfS |  ¡ r0|  ¡ dkr0|  ¡ |  ¡  | j| j  }n|  ¡ }|| |  ¡  }t  d|› ¡ ||d k rN|d }|| | }|dkrZd}g }|| }t|||ƒD ]}	||	 |krs| |	¡ qf| |¡ ||fS )a®  Get a list of micro batch sizes to tune based on min and max values, as well as the size of the list.
        Args:
            min_micro_batch_size ([int]): min micro batch size per GPU
            max_micro_batch_size ([int]): max micro batch size per GPU
            num_tuning_micro_batch_sizes (int): the number of items in the returned list

        Returns:
            [list]: a list of micro batch sizes to tune.
        r   r`  ra  zmax_train_batch_size_per_gpu = r   r   )	r   r*   rˆ   r‰   r2   r1   r  rU  rP   )
r:   r!  r"  rŽ   rÌ   rÙ   r\  ÚlsÚmin_gasrØ   r<   r<   r=   r  ò  s6   ÿÿÿ
€
z*Autotuner.get_tuning_micro_batch_size_listc                 C   sä   i }||d< ||t < | j|d< | j|d< | jj|d< tj | j|› d¡}t	 
d|› ¡ t|dtd}t ||¡ | ¡  t |¡ W d   ƒ n1 sNw   Y  | j |g¡ | j ¡  | j |  ¡ ¡\}}| j ¡  ||fS )	Nr?   r€   r   r9  rn   zrun_ds_config exp_name = rG   rH   )rË   r2   r1   r   r9  r$   r%   rT   r"   r   r   rS   rU   rt   r<  rW   rX   r/   r=  r>  Úparse_resultsrƒ   r@  )r:   rA  rÛ   r×   rB  rh   rÚ   rþ   r<   r<   r=   rV    s$   

ý

zAutotuner.run_ds_configc                 C   sH  |   ¡ }t|vr
d S |t \}}}|r¢|d }d }ttj |d¡dƒ}dd„ | ¡  ¡ D ƒ}W d   ƒ n1 s9w   Y  t 	ttj |d¡dƒ¡}| 
t¡ tj | jd¡}	t |t|	dƒ¡ tj | jd	¡}
t|
dƒ}| d
 |¡¡ | d¡ | ¡  W d   ƒ n1 sŠw   Y  || _|| _t d|	› d|
› ¡ d S d S )NrJ   zcmd.txtro   c                 S   s   g | ]}t |ƒ‘qS r<   )r  )r  r^  r<   r<   r=   r	  <  s    z2Autotuner.write_optimal_config.<locals>.<listcomp>zds_config.jsonzds_config_optimal.jsonrG   zcmd_optimal.txtú Ú
zAWrote the optimal DeepSpeed configuration found by autotuning to z-, and the corresponding DeepSpeed command to )rN   rQ   rS   r$   r%   rT   ÚreadÚsplitrº   ru   rS  r   r-   rt   r<  rV   rW   r7   r8   r   r*   )r:   r^   ri   rB   r  Úexp_dirÚcmdrl   rA  Úds_config_pathÚcmd_pathrh   r<   r<   r=   Úwrite_optimal_config3  s4   ÿ


ýÿízAutotuner.write_optimal_configc                 C   s>   | j rt | j ¡}| ¡  t d| j › ¡ dS t d¡ dS )a   Launches the training with the optimal DeepSpeed configuration found through the autotuning process.
            "ds_config_optimal.json" describing the optimal DeepSpeed configuration as well the command used to launch training "cmd_optimal.txt" are saved to self.results_dir.
        z<Done running with the optimal DeepSpeed configuration using z7No optimal DeepSpeed configuration found by autotuning.N)r7   Ú
subprocessÚPopenÚwaitr   r*   )r:   Úresultr<   r<   r=   Úrun_after_tuningO  s
   zAutotuner.run_after_tuningNr  )%Ú__name__Ú
__module__Ú__qualname__Ú__doc__r>   rm   r   r.   r0   rƒ   r‡   rˆ   r‰   r‹   r   rŽ   r“   r•   rš   rª   rÜ   r  rõ   r4  rŸ   rí   r  r  rN   r  r  r  rb  r  rV  rz  r  r<   r<   r<   r=   r   *   sH    =2	d
wu-n ,r   )%r'   r{  r[   rY   rc  rº   Úruntime.config_utilsr   Úruntime.constantsÚruntime.zero.configr   r   Úutilsr   rÖ   r	   Ú	constantsÚ	schedulerr
   Útunerr   r   r   Údeepspeed.acceleratorr   r   ÚImportErrorrë   ré   Ú	ExceptionÚer·   rÆ   rÇ   rô   r¸   r   r<   r<   r<   r=   Ú<module>   sB   ÿ€ÿ