o
    `۷i                     @   s   d dl Z d dlmZmZmZmZmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ erG	 e eZeddG dd deZdS )    N)TYPE_CHECKINGCallableDictOptionalUnion)*ensure_only_allowed_dataclass_keys_updated)
DataConfig)
GenDataset)	RunConfigScalingConfig)DataParallelTrainer)ValidationConfig)	JaxConfig)	PublicAPIalpha)	stabilityc                       s   e Zd ZdZdddddddddeeg df eegdf f dee dee dee	 deee
ef  d	ee d
eee
ef  dee f fddZede	de	fddZ  ZS )
JaxTrainera  A Trainer for Single-Program Multi-Data (SPMD) JAX training.

    At a high level, this Trainer does the following:

    1. Launches multiple workers as defined by the ``scaling_config``.
    2. Sets up a distributed JAX environment for TPUs or GPUs
       on these workers as defined by the ``jax_config``.
    3. Ingests the input ``datasets`` based on the ``dataset_config``.
    4. Runs the input ``train_loop_per_worker(train_loop_config)``
       on all workers.

    For more details, see:

    * :ref:`Jax Guide <train-jax>`

    .. testcode::
        :skipif: True

        import os
        from absl import app
        import logging
        from typing import Sequence

        import ray
        from ray.train import ScalingConfig, RunConfig
        from ray.train.v2.jax import JaxTrainer
        from MaxText.train import main as maxtext_main

        def train_loop_per_worker(config):
            argv = config["argv"]
            maxtext_main(argv)

        def main(argv: Sequence[str]):
            ray.init()

            # If you want to use TPUs, specify the TPU topology and accelerator type.
            tpu_scaling_config = ScalingConfig(
                use_tpu=True,
                num_workers=4,
                topology="4x4",
                accelerator_type="TPU-V6E",
                placement_strategy="SPREAD",
                resources_per_worker={"TPU": 4},
            )

            # If you want to use GPUs, specify the GPU scaling config like below.
            # gpu_scaling_config = ScalingConfig(
            #     use_gpu=True,
            #     num_workers=4,
            #     resources_per_worker={"GPU": 1},
            # )


            trainer = JaxTrainer(
                train_loop_per_worker=train_loop_per_worker,
                train_loop_config={"argv": absolute_argv},
                scaling_config=tpu_scaling_config,
                run_config=RunConfig(
                    name="maxtext_jaxtrainer",
                    worker_runtime_env={
                        "env_vars": {
                            "JAX_PLATFORMS": "tpu",
                            # If you want to use GPUs, set the JAX_PLATFORMS to "cuda".
                            # "JAX_PLATFORMS": "cuda",
                        }
                    },
                ),
            )

            result = trainer.fit()

    If the ``datasets`` dict contains datasets (e.g. "train" and "val"), then it will be split into multiple dataset
    shards that can then be accessed by ``ray.train.get_dataset_shard("train")`` and ``ray.train.get_dataset_shard("val")``.

    Note:
        * If you are using TPUs, importing `jax` should occur within `train_loop_per_worker` to
          avoid driver-side TPU lock issues.

    Args:
        train_loop_per_worker: The training function to execute on each worker.
            This function can either take in zero arguments or a single ``Dict``
            argument which is set by defining ``train_loop_config``.
            Within this function you can use any of the
            :ref:`Ray Train Loop utilities <train-loop-api>`.
        train_loop_config: A configuration ``Dict`` to pass in as an argument to
            ``train_loop_per_worker``.
            This is typically used for specifying hyperparameters. Passing large
            datasets via `train_loop_config` is not recommended and may introduce
            large overhead and unknown issues with serialization and deserialization.
        jax_config: The configuration for setting up the JAX backend.
            If set to None, a default configuration will be used based on the ``scaling_config`` and ``JAX_PLATFORMS`` environment variable.
        scaling_config: Configuration for how to scale data parallel training
            with SPMD. ``num_workers`` should be set to the number of TPU hosts or GPU workers.
            If using TPUs, ``topology`` should be set to the TPU topology.
            See :class:`~ray.train.ScalingConfig` for more info.
        dataset_config: The configuration for ingesting the input ``datasets``.
            By default, all the Ray Dataset are split equally across workers.
            See :class:`~ray.train.DataConfig` for more details.
        run_config: The configuration for the execution of the training run.
            See :class:`~ray.train.RunConfig` for more info.
        datasets: The Ray Datasets to ingest for training.
            Datasets are keyed by name (``{name: dataset}``).
            Each dataset can be accessed from within the ``train_loop_per_worker``
            by calling ``ray.train.get_dataset_shard(name)``.
            Sharding and additional configuration can be done by
            passing in a ``dataset_config``.
        validation_config: [Alpha] Configuration for checkpoint validation.
            If provided and ``ray.train.report`` is called with the ``validation``
            argument, Ray Train will validate the reported checkpoint using
            the validation function specified in this config.
    N)train_loop_config
jax_configscaling_configdataset_config
run_configdatasetsvalidation_configtrain_loop_per_workerr   r   r   r   r   r   r   c          	   
      s8   |s
t |j|jd}tt| j||||||||d d S )N)use_tpuuse_gpu)r   r   backend_configr   r   r   r   r   )r   r   r   superr   __init__)	selfr   r   r   r   r   r   r   r   	__class__ R/home/ubuntu/vllm_env/lib/python3.10/site-packages/ray/train/v2/jax/jax_trainer.pyr      s   

zJaxTrainer.__init__returnc                 C   s   t || jd |S )z>Return scaling config dataclass after validating updated keys.)	dataclassallowed_keys)r   _scaling_config_allowed_keys)clsr   r#   r#   r$   _validate_scaling_config   s
   z#JaxTrainer._validate_scaling_config)__name__
__module____qualname____doc__r   r   r   r   r   r   strr   r
   r	   r   r   classmethodr*   __classcell__r#   r#   r!   r$   r      s8    t	
r   )loggingtypingr   r   r   r   r   ray.air._internal.configr   	ray.trainr   ray.train.trainerr	   ray.train.v2.api.configr
   r   &ray.train.v2.api.data_parallel_trainerr   "ray.train.v2.api.validation_configr   ray.train.v2.jax.configr   ray.utilr   	getLoggerr+   loggerr   r#   r#   r#   r$   <module>   s    
