o
    ciJ                     @   s
  d Z ddlZddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZmZmZ e \ZZe Zejdd	d
d ejdeddd ejdeddd edkre Ze jdddjddjddjdddddddgddddgdddddd dd!dd"jd#d$j d%d&j!d'd(d)j"d*gd+d,d-j#d!d!d.dejj$d/d0d1Z%dZ&d2Z'ej(e%d3Z)dZ*e+e&D ]7Z,e-d4e,  e). /eZ0e0re-d51e0e e  e0e e e'krej2re-d61e, e3d d(Z* nqe)4 Z5e5j6Z7e8ej9j9d7d8Z:e8ej9j9d9d8Z;e7<e:e;d Z=e7>e:e;d Z?e@e=e?d ZAe-d:eAB    e	e)jCd;ZDeeDZDe8eDd< ZEe7d<eEi\ZFZe7<eFe8eDd= d ZGe5Hd<eEid ZIe7<eFe8eId ZJe-d>eGB    e-d?eJB    e)K  dS dS )@a  Example on how to use CQL to learn from an offline JSON file.

Important node: Make sure that your offline data file contains only
a single timestep per line to mimic the way SAC pulls samples from
the buffer.

Generate the offline json file by running an SAC algo until it reaches expert
level on your command line. For example:
$ cd ray
$ rllib train -f rllib/tuned_examples/sac/pendulum-sac.yaml --no-ray-ui

Also make sure that in the above SAC yaml file (pendulum-sac.yaml),
you specify an additional "output" key with any path on your local
file system. In that path, the offline json files will be written to.

Use the generated file(s) as "input" in the CQL config below
(`config["input"] = [list of your json files]`), then run this script.
    N) convert_ma_batch_to_sample_batch)cql)synchronous_parallel_sample)try_import_torch)ENV_RUNNER_RESULTSEPISODE_RETURN_MEANEVALUATION_RESULTSz	--as-test
store_truezuWhether this script should be run as a test: --stop-reward must be achieved within --stop-timesteps AND --stop-iters.)actionhelpz--stop-iters   zNumber of iterations to train.)typedefaultr   z--stop-rewardg      I@z!Reward at which we stop training.__main__F)"enable_env_runner_and_connector_v2enable_rl_module_and_learnertorch)	framework)num_env_runners   g{Gzt?auto   relu)fcnet_hiddensfcnet_activationga2U0*3?)actor_learning_ratecritic_learning_rateentropy_learning_rate   )n_stepbc_itersclip_actionstautarget_entropyq_model_configpolicy_model_configoptimization_configtrain_batch_sizetarget_network_update_freq(num_steps_sampled_before_learning_startsi  )!min_train_timesteps_per_iterationINFO)	log_levelzPendulum-v1T)normalize_actionsz tests/data/pendulum/enormous.zipjson)pathsformat)input_config
   sampler)input_)evaluation_num_env_runnersevaluation_intervalevaluation_durationevaluation_parallel_to_trainingevaluation_configi)configzIter z... R={}z Test passed after {} iterations.)r   r   )size)r   r   zfinal_q_values=)
worker_setobsactionszQ-val batch=zQ-val policy=)L__doc__argparsenumpynpray.rllib.policy.sample_batchr   ray.rllib.algorithmsr   ray.rllib.execution.rollout_opsr   ray.rllib.utils.frameworkr   ray.rllib.utils.metricsr   r   r   r   _ArgumentParserparseradd_argumentintfloat__name__
parse_argsargs	CQLConfig	api_stackr   env_runnerstraining	reporting	debuggingenvironmentoffline_data
evaluation	overridesr:   num_iterations
min_rewardCQLcql_algorithmlearntrangeiprinttraingeteval_resultsr0   as_testquit
get_policy
cql_policymodel	cql_model
from_numpyrandom	obs_batchaction_batchget_q_valuesq_valuesget_twin_q_valuestwin_q_valuesminfinal_q_valuesdetachenv_runner_groupbatchr=   	model_outq_values_oldcompute_actions_from_input_dictactions_newq_values_newstop r   r   \/home/ubuntu/.local/lib/python3.10/site-packages/ray/rllib/examples/offline_rl/offline_rl.py<module>   s   

 !'5