o
    $i:Q                     @   s,  d dl mZmZmZmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZmZ d dlmZmZmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ d dlmZ d dl m!Z!m"Z" d dl#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+ d dl,m-Z-m.Z.m/Z/ d dl0m1Z1 G dd deZ2G dd de
Z3dS )    )CallableOptionalTypeUnion)Self)deprecation_warning)	Algorithm)AlgorithmConfigNotProvided)+AddNextObservationsFromEpisodesToTrainBatch"AddObservationsFromEpisodesToBatchAddOneTsToEpisodesAndTruncateGeneralAdvantageEstimation)Learner)TrainingData)RLModuleSpec)synchronous_parallel_sample)multi_gpu_train_one_steptrain_one_step)Policy)OldAPIStackoverride)LEARNER_RESULTSLEARNER_UPDATE_TIMERNUM_AGENT_STEPS_SAMPLEDNUM_ENV_STEPS_SAMPLEDOFFLINE_SAMPLING_TIMERSAMPLE_TIMERSYNCH_WORKER_WEIGHTS_TIMERTIMERS)EnvType
ResultDictRLModuleSpecType)Loggerc                       sf  e Zd ZdZd# fdd	Zeeeeeeeeddee	 dee	 dee	 d	ee	 d
ee	 dee	 de
f fddZeedefddZeedeed ef fddZeede
f fddZeede
f fddZee		d$deeeef  deeg ef  ddf fddZee	d# fdd	Zeed% fdd Ze fd!d"Z  ZS )&MARWILConfiga  Defines a configuration class from which a MARWIL Algorithm can be built.

    .. testcode::

        import gymnasium as gym
        import numpy as np

        from pathlib import Path
        from ray.rllib.algorithms.marwil import MARWILConfig

        # Get the base path (to ray/rllib)
        base_path = Path(__file__).parents[2]
        # Get the path to the data in rllib folder.
        data_path = base_path / "offline/tests/data/cartpole/cartpole-v1_large"

        config = MARWILConfig()
        # Enable the new API stack.
        config.api_stack(
            enable_rl_module_and_learner=True,
            enable_env_runner_and_connector_v2=True,
        )
        # Define the environment for which to learn a policy
        # from offline data.
        config.environment(
            observation_space=gym.spaces.Box(
                np.array([-4.8, -np.inf, -0.41887903, -np.inf]),
                np.array([4.8, np.inf, 0.41887903, np.inf]),
                shape=(4,),
                dtype=np.float32,
            ),
            action_space=gym.spaces.Discrete(2),
        )
        # Set the training parameters.
        config.training(
            beta=1.0,
            lr=1e-5,
            gamma=0.99,
            # We must define a train batch size for each
            # learner (here 1 local learner).
            train_batch_size_per_learner=2000,
        )
        # Define the data source for offline data.
        config.offline_data(
            input_=[data_path.as_posix()],
            # Run exactly one update per training iteration.
            dataset_num_iters_per_learner=1,
        )

        # Build an `Algorithm` object from the config and run 1 training
        # iteration.
        algo = config.build()
        algo.train()

    .. testcode::

        import gymnasium as gym
        import numpy as np

        from pathlib import Path
        from ray.rllib.algorithms.marwil import MARWILConfig
        from ray import tune

        # Get the base path (to ray/rllib)
        base_path = Path(__file__).parents[2]
        # Get the path to the data in rllib folder.
        data_path = base_path / "offline/tests/data/cartpole/cartpole-v1_large"

        config = MARWILConfig()
        # Enable the new API stack.
        config.api_stack(
            enable_rl_module_and_learner=True,
            enable_env_runner_and_connector_v2=True,
        )
        # Print out some default values
        print(f"beta: {config.beta}")
        # Update the config object.
        config.training(
            lr=tune.grid_search([1e-3, 1e-4]),
            beta=0.75,
            # We must define a train batch size for each
            # learner (here 1 local learner).
            train_batch_size_per_learner=2000,
        )
        # Set the config's data path.
        config.offline_data(
            input_=[data_path.as_posix()],
            # Set the number of updates to be run per learner
            # per training step.
            dataset_num_iters_per_learner=1,
        )
        # Set the config's environment for evalaution.
        config.environment(
            observation_space=gym.spaces.Box(
                np.array([-4.8, -np.inf, -0.41887903, -np.inf]),
                np.array([4.8, np.inf, 0.41887903, np.inf]),
                shape=(4,),
                dtype=np.float32,
            ),
            action_space=gym.spaces.Discrete(2),
        )
        # Set up a tuner to run the experiment.
        tuner = tune.Tuner(
            "MARWIL",
            param_space=config,
            run_config=tune.RunConfig(
                stop={"training_iteration": 1},
            ),
        )
        # Run the experiment.
        tuner.fit()
    Nc                    s   ddi| _ t j|ptd d| _d| _d| _d| _d| _d| _	d| j
d	< d
| _d| _d| _d| _d| _d| _d| _d| _d| _d
S )z$Initializes a MARWILConfig instance.typeStochasticSampling)
algo_classF      ?        g:0yE>g      Y@vf_share_layersNsamplerTg-C6?i  )exploration_configsuper__init__MARWIL
_is_onlinebetabc_logstd_coeff'moving_average_sqd_adv_norm_update_rate!moving_average_sqd_adv_norm_startvf_coeffmodel	grad_clipinput_postprocess_inputslrlambda_train_batch_sizematerialize_datamaterialize_mapped_data"_set_off_policy_estimation_methods)selfr'   	__class__ _/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/ray/rllib/algorithms/marwil/marwil.pyr.      s&   


zMARWILConfig.__init__)r1   r2   r3   r4   r5   r7   r1   r2   r3   r4   r5   r7   returnc                   sj   t  jdi | |tur|| _|tur|| _|tur|| _|tur%|| _|tur,|| _|tur3|| _| S )a  Sets the training related configuration.

        Args:
            beta: Scaling  of advantages in exponential terms. When beta is 0.0,
                MARWIL is reduced to behavior cloning (imitation learning);
                see bc.py algorithm in this same directory.
            bc_logstd_coeff: A coefficient to encourage higher action distribution
                entropy for exploration.
            moving_average_sqd_adv_norm_update_rate: The rate for updating the
                squared moving average advantage norm (c^2). A higher rate leads
                to faster updates of this moving avergage.
            moving_average_sqd_adv_norm_start: Starting value for the
                squared moving average advantage norm (c^2).
            vf_coeff: Balancing value estimation loss and policy optimization loss.
            grad_clip: If specified, clip the global norm of gradients by this amount.

        Returns:
            This updated AlgorithmConfig object.
        NrC   )	r-   trainingr
   r1   r2   r3   r4   r5   r7   )r@   r1   r2   r3   r4   r5   r7   kwargsrA   rC   rD   rF      s    zMARWILConfig.trainingc                 C   s2   | j dkrddlm} t|dS td| j  d)Ntorchr   )DefaultPPOTorchRLModule)module_classThe framework ' is not supported. Use 'torch' instead.)framework_str:ray.rllib.algorithms.ppo.torch.default_ppo_torch_rl_modulerI   r   
ValueError)r@   rI   rC   rC   rD   get_default_rl_module_spec  s   

z'MARWILConfig.get_default_rl_module_specr   c                 C   s,   | j dkrddlm} |S td| j  d)NrH   r   )MARWILTorchLearnerrK   rL   )rM   6ray.rllib.algorithms.marwil.torch.marwil_torch_learnerrQ   rO   )r@   rQ   rC   rC   rD   get_default_learner_class  s   
z&MARWILConfig.get_default_learner_classc                    s$   t  jdi | d|v rd| _| S )zuSets the evaluation related configuration.
        Returns:
            This updated AlgorithmConfig object.
        off_policy_estimation_methodsTNrC   )r-   
evaluationr?   )r@   rG   rA   rC   rD   rU   "  s   
zMARWILConfig.evaluationc                    sP   t  jdi | d|v r&ddlm} t|d|s&td|d d| S )Nprelearner_classr   )OfflinePreLearnerz`prelearner_class` z is not a subclass of `OfflinePreLearner`. Any class passed to `prelearner_class` needs to implement the interface given by `OfflinePreLearner`.rC   )r-   offline_dataray.rllib.offline.offline_datarW   
issubclassgetrO   )r@   rG   rW   rA   rC   rD   rX   4  s   zMARWILConfig.offline_dataenvlogger_creatorr   c                    s    | j s	tddd t ||S )NzMARWIL used to have off_policy_estimation_methods is and wis by default. This haschanged to off_policy_estimation_methods: \{\}.If you want to use an off-policy estimator, specify it in.evaluation(off_policy_estimation_methods=...)F)olderror)r?   r   r-   build)r@   r\   r]   rA   rC   rD   r`   H  s   zMARWILConfig.buildc                    sF   t  j|||d}|t  |tt  |t| j	| j
d |S )N)input_observation_spaceinput_action_spacedevice)gammar;   )r-   build_learner_connectorprependr   insert_afterr   r   appendr   rd   r;   )r@   ra   rb   rc   pipelinerA   rC   rD   re   Y  s   	
z$MARWILConfig.build_learner_connectorc                    sv   t    | jdk s| jdkr| d | jdu r#| jdkr#| d | jdkr5| js7| jr9| d d S d S d S d S )Nr)   r(   z"`beta` must be within 0.0 and 1.0!Fz`postprocess_inputs` must be True for MARWIL (to calculate accum., discounted returns)! Try setting `config.offline_data(postprocess_inputs=True)`.r   zWhen using a local Learner (`config.num_learners=0`), the number of iterations per learner (`dataset_num_iters_per_learner`) has to be defined! Set this hyperparameter through `config.offline_data(dataset_num_iters_per_learner=...)`.)r-   validater1   _value_errorr9   num_learnersdataset_num_iters_per_learnerenable_rl_module_and_learnerr@   rA   rC   rD   rj   }  s    



zMARWILConfig.validatec                    s   t  j| jddB S )NF)r1   r*   )r-   _model_auto_keysr1   ro   rA   rC   rD   rp     s   zMARWILConfig._model_auto_keysN)NNrE   N)__name__
__module____qualname____doc__r.   r   r	   r
   r   floatr   rF   r"   rP   r   r   strrS   rU   rX   r    r   r#   r`   re   rj   propertyrp   __classcell__rC   rC   rA   rD   r$   ,   sh    p8
0#r$   c                   @   sp   e Zd ZeeedefddZeeedede	e
e  fddZeeddd	Zedefd
dZdS )r/   rE   c                 C   s   t  S rq   )r$   )clsrC   rC   rD   get_default_config  s   zMARWIL.get_default_configconfigc                 C   sH   |d dkrddl m} |S |d dkrddlm} |S ddlm} |S )N	frameworkrH   r   )MARWILTorchPolicytf)MARWILTF1Policy)MARWILTF2Policy)/ray.rllib.algorithms.marwil.marwil_torch_policyr   ,ray.rllib.algorithms.marwil.marwil_tf_policyr   r   )r{   r}   r   r   r   rC   rC   rD   get_default_policy_class  s   zMARWIL.get_default_policy_classNc                 C   s   | j js|  S | jttf. | j jdkp| j jdk}| j	j
| j j| j j|d}|r2t|d}nt|d}W d   n1 sAw   Y  | jttf$ | jjd	|| j j| j jd| j	j}| jj|td W d   dS 1 suw   Y  dS )
a-  Implements training logic for the new stack

        Note, this includes so far training with the `OfflineData`
        class (multi-/single-learner setup) and evaluation on
        `EnvRunner`s. Note further, evaluation on the dataset itself
        using estimators is not implemented, yet.
        r      )num_samples
num_shardsreturn_iterator)data_iterators)batchN)training_dataminibatch_size	num_iters)keyrC   )r}   "enable_env_runner_and_connector_v2_training_step_old_api_stackmetricslog_timer   r   rl   rm   rX   sampletrain_batch_size_per_learnerr   r   learner_groupupdateiter_batches_kwargs	aggregater   )r@   r   batch_or_iteratorr   learner_resultsrC   rC   rD   training_step  s4   


"zMARWIL.training_stepc                 C   s  | j t  t| jd}W d   n1 sw   Y  |jt| jjd d}| jt	  |
 7  < | jt  | 7  < | jjrGt| |}nt| |}d| jt	 i}| j dkr{| j t  | jjt| |d W d   n1 svw   Y  | j| |S )zImplements training step for the old stack.

        Note, there is no hybrid stack anymore. If you need to use `RLModule`s,
        use the new api stack.
        )
worker_setNr   )	module_idtimestep)policiesglobal_vars)_timersr   r   env_runner_groupas_multi_agentlistr}   r   	_countersr   agent_stepsr   	env_stepssimple_optimizerr   r   num_remote_env_runnersr   sync_weightskeys
env_runnerset_global_vars)r@   train_batchtrain_resultsr   rC   rC   rD   r     s*   

z#MARWIL._training_step_old_api_stackrr   )rs   rt   ru   classmethodr   r   r$   r|   r	   r   r   r   r   r   r   r!   r   rC   rC   rC   rD   r/     s    
1r/   N)4typingr   r   r   r   typing_extensionsr   ray._common.deprecationr   ray.rllib.algorithms.algorithmr   %ray.rllib.algorithms.algorithm_configr	   r
   ray.rllib.connectors.learnerr   r   r   r   ray.rllib.core.learner.learnerr   $ray.rllib.core.learner.training_datar   "ray.rllib.core.rl_module.rl_moduler   ray.rllib.execution.rollout_opsr   ray.rllib.execution.train_opsr   r   ray.rllib.policy.policyr   ray.rllib.utils.annotationsr   r   ray.rllib.utils.metricsr   r   r   r   r   r   r   r   ray.rllib.utils.typingr    r!   r"   ray.tune.loggerr#   r$   r/   rC   rC   rC   rD   <module>   s(    (
  v