o
    `۷iN                  3   @   s  d dl mZmZmZmZmZmZmZmZ d dl	Z
d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZmZ d dlm Z m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z'm(Z(m)Z) e  \Z*Z+e! \Z,Z+edddddddddddddddddddddde-de-deeeeee egee)ee) f f  deeg e'f  deeeegee-e)f f  deeeeeeeef  ee gef  deeeee-e)f ee) eegee-e)f f  deeede)gee-e)f f  deeegee-e)f f  deeee'gdf  deeee
j.e
j.e'gdf  deeee
j.e
j.e'gdf  deeee
j/j.e
j/j.e'gdf  deeee
j.e
j.e'gdf  d eeee
j/j.e
j/j.e'gdf  d!eee)ee) gee)e)f f  d"eeeee)e)e)gee)e0ee) f f  d#eeee
j/j.e
j/j.e'gef  d$eeee
j/j.e
j/j.e'geeee f f  d%eeeegee(e1f f  d&eeedgdf  d'eee0  d(eeege2f  d)ee f0d*d+Z3dS ),    )AnyCallableDictListOptionalTupleTypeUnionN)ModelCatalog)ModelV2)TorchDistributionWrapper)TorchModelV2)Policy)SampleBatch)TorchPolicy)NullContextManager
add_mixins)OldAPIStackoverride)try_import_jaxtry_import_torch)LEARNER_STATS_KEY)convert_to_numpy)AlgorithmConfigDictModelGradients
TensorType)get_default_configstats_fnpostprocess_fnextra_action_out_fnextra_grad_process_fnextra_learn_fetches_fnoptimizer_fnvalidate_spacesbefore_initbefore_loss_init
after_init_after_loss_initaction_sampler_fnaction_distribution_fn
make_modelmake_model_and_action_distcompute_gradients_fnapply_gradients_fnmixinsget_batch_divisibility_reqname	frameworkloss_fnr   r   r   r   r    ztorch.optim.Optimizerr!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   returnc                   sz   t   tt|}G  	
fddd|}fdd}t||_| |_| |_|S )a  Helper function for creating a new Policy class at runtime.

    Supports frameworks JAX and PyTorch.

    Args:
        name: name of the policy (e.g., "PPOTorchPolicy")
        framework: Either "jax" or "torch".
            loss_fn (Optional[Callable[[Policy, ModelV2,
                Type[TorchDistributionWrapper], SampleBatch], Union[TensorType,
                List[TensorType]]]]): Callable that returns a loss tensor.
            get_default_config (Optional[Callable[[None], AlgorithmConfigDict]]):
                Optional callable that returns the default config to merge with any
                overrides. If None, uses only(!) the user-provided
                PartialAlgorithmConfigDict as dict for this Policy.
            postprocess_fn (Optional[Callable[[Policy, SampleBatch,
                Optional[Dict[Any, SampleBatch]], Optional[Any]],
                SampleBatch]]): Optional callable for post-processing experience
                batches (called after the super's `postprocess_trajectory` method).
            stats_fn (Optional[Callable[[Policy, SampleBatch],
                Dict[str, TensorType]]]): Optional callable that returns a dict of
                values given the policy and training batch. If None,
                will use `TorchPolicy.extra_grad_info()` instead. The stats dict is
                used for logging (e.g. in TensorBoard).
            extra_action_out_fn (Optional[Callable[[Policy, Dict[str, TensorType],
                List[TensorType], ModelV2, TorchDistributionWrapper]], Dict[str,
                TensorType]]]): Optional callable that returns a dict of extra
                values to include in experiences. If None, no extra computations
                will be performed.
            extra_grad_process_fn (Optional[Callable[[Policy,
                "torch.optim.Optimizer", TensorType], Dict[str, TensorType]]]):
                Optional callable that is called after gradients are computed and
                returns a processing info dict. If None, will call the
                `TorchPolicy.extra_grad_process()` method instead.
            # TODO: (sven) dissolve naming mismatch between "learn" and "compute.."
            extra_learn_fetches_fn (Optional[Callable[[Policy],
                Dict[str, TensorType]]]): Optional callable that returns a dict of
                extra tensors from the policy after loss evaluation. If None,
                will call the `TorchPolicy.extra_compute_grad_fetches()` method
                instead.
            optimizer_fn (Optional[Callable[[Policy, AlgorithmConfigDict],
                "torch.optim.Optimizer"]]): Optional callable that returns a
                torch optimizer given the policy and config. If None, will call
                the `TorchPolicy.optimizer()` method instead (which returns a
                torch Adam optimizer).
            validate_spaces (Optional[Callable[[Policy, gym.Space, gym.Space,
                AlgorithmConfigDict], None]]): Optional callable that takes the
                Policy, observation_space, action_space, and config to check for
                correctness. If None, no spaces checking will be done.
            before_init (Optional[Callable[[Policy, gym.Space, gym.Space,
                AlgorithmConfigDict], None]]): Optional callable to run at the
                beginning of `Policy.__init__` that takes the same arguments as
                the Policy constructor. If None, this step will be skipped.
            before_loss_init (Optional[Callable[[Policy, gym.spaces.Space,
                gym.spaces.Space, AlgorithmConfigDict], None]]): Optional callable to
                run prior to loss init. If None, this step will be skipped.
            after_init (Optional[Callable[[Policy, gym.Space, gym.Space,
                AlgorithmConfigDict], None]]): DEPRECATED: Use `before_loss_init`
                instead.
            _after_loss_init (Optional[Callable[[Policy, gym.spaces.Space,
                gym.spaces.Space, AlgorithmConfigDict], None]]): Optional callable to
                run after the loss init. If None, this step will be skipped.
                This will be deprecated at some point and renamed into `after_init`
                to match `build_tf_policy()` behavior.
            action_sampler_fn (Optional[Callable[[TensorType, List[TensorType]],
                Tuple[TensorType, TensorType]]]): Optional callable returning a
                sampled action and its log-likelihood given some (obs and state)
                inputs. If None, will either use `action_distribution_fn` or
                compute actions by calling self.model, then sampling from the
                so parameterized action distribution.
            action_distribution_fn (Optional[Callable[[Policy, ModelV2, TensorType,
                TensorType, TensorType], Tuple[TensorType,
                Type[TorchDistributionWrapper], List[TensorType]]]]): A callable
                that takes the Policy, Model, the observation batch, an
                explore-flag, a timestep, and an is_training flag and returns a
                tuple of a) distribution inputs (parameters), b) a dist-class to
                generate an action distribution object from, and c) internal-state
                outputs (empty list if not applicable). If None, will either use
                `action_sampler_fn` or compute actions by calling self.model,
                then sampling from the parameterized action distribution.
            make_model (Optional[Callable[[Policy, gym.spaces.Space,
                gym.spaces.Space, AlgorithmConfigDict], ModelV2]]): Optional callable
                that takes the same arguments as Policy.__init__ and returns a
                model instance. The distribution class will be determined
                automatically. Note: Only one of `make_model` or
                `make_model_and_action_dist` should be provided. If both are None,
                a default Model will be created.
            make_model_and_action_dist (Optional[Callable[[Policy,
                gym.spaces.Space, gym.spaces.Space, AlgorithmConfigDict],
                Tuple[ModelV2, Type[TorchDistributionWrapper]]]]): Optional
                callable that takes the same arguments as Policy.__init__ and
                returns a tuple of model instance and torch action distribution
                class.
                Note: Only one of `make_model` or `make_model_and_action_dist`
                should be provided. If both are None, a default Model will be
                created.
            compute_gradients_fn (Optional[Callable[
                [Policy, SampleBatch], Tuple[ModelGradients, dict]]]): Optional
                callable that the sampled batch an computes the gradients w.r.
                to the loss function.
                If None, will call the `TorchPolicy.compute_gradients()` method
                instead.
            apply_gradients_fn (Optional[Callable[[Policy,
                "torch.optim.Optimizer"], None]]): Optional callable that
                takes a grads list and applies these to the Model's parameters.
                If None, will call the `TorchPolicy.apply_gradients()` method
                instead.
            mixins (Optional[List[type]]): Optional list of any class mixins for
                the returned policy class. These mixins will be applied in order
                and will have higher precedence than the TorchPolicy class.
            get_batch_divisibility_req (Optional[Callable[[Policy], int]]):
                Optional callable that returns the divisibility requirement for
                sample batches. If None, will assume a value of 1.

    Returns:
        Type[TorchPolicy]: TorchPolicy child class constructed from the
            specified args.
    c                       s   e Zd ZfddZee	d fdd	Ze
fddZefdd	Zefd
dZ	efddZ
e	fddZefddZefddZdd Zdd Z  ZS )z&build_policy_class.<locals>.policy_clsc           	         s  || _  | _| j d< r| ||| j  r| ||| j  	r?
d u s)J d	| |||| _tj|| j d d\}}n(
rL
| |||\| _}ntj|| j d d\}}tj|||| j d d| _t}t| j|ssJ d| _| jj	| |||| j| j d rd n||d d d	 | j
| jj
 p}|r|| | j| j| | jd
| j d rd nd  r | ||| d| _d S )Nr1   zAEither `make_model` or `make_model_and_action_dist` must be None!model)r1   )	obs_spaceaction_spacenum_outputsmodel_configr1   z5ERROR: Generated Model must be a TorchModelV2 object!in_evaluationmax_seq_len)
observation_spacer6   configr4   lossaction_distribution_classr(   r)   r:   r/   T)auto_remove_unneeded_view_reqsr   r   )r<   r1   r4   r
   get_action_distget_model_v2r   
isinstance
parent_cls__init__view_requirementsupdater;   r6   !_initialize_loss_from_dummy_batchglobal_timestep)	selfr5   r6   r<   
dist_class_	logit_dim	model_cls_before_loss_init)r'   r)   r(   r&   r$   r%   r1   r/   r2   r*   r+   rC   r   r#    V/home/ubuntu/vllm_env/lib/python3.10/site-packages/ray/rllib/policy/policy_template.pyrD      sz   

	

z/build_policy_class.<locals>.policy_cls.__init__Nc                    s`   |   " t |||}r| |||W  d    S |W  d    S 1 s)w   Y  d S N)_no_grad_contextsuperpostprocess_trajectory)rI   sample_batchother_agent_batchesepisode)	__class__r   rO   rP   rT   L  s   

$z=build_policy_class.<locals>.policy_cls.postprocess_trajectoryc                    s    r | ||S  | ||S )zCalled after optimizer.zero_grad() and loss.backward() calls.

            Allows for gradient processing before optimizer.step() is called.
            E.g. for gradient clipping.
            )extra_grad_process)rI   	optimizerr=   )r    rC   rO   rP   rY   _  s   z9build_policy_class.<locals>.policy_cls.extra_grad_processc                    s.    rt  | }tti ifi |S | S rQ   )r   dictr   extra_compute_grad_fetches)rI   fetches)r!   rC   rO   rP   r\   k  s   
zAbuild_policy_class.<locals>.policy_cls.extra_compute_grad_fetchesc                    s    r | |S  | |S rQ   )compute_gradients)rI   batch)r,   rC   rO   rP   r^   t  s   
z8build_policy_class.<locals>.policy_cls.compute_gradientsc                    s"    r	 | | d S  | | d S rQ   )apply_gradients)rI   	gradients)r-   rC   rO   rP   r`   {  s   z6build_policy_class.<locals>.policy_cls.apply_gradientsc                    s^   |   !  r | ||||}n	| ||||}| |W  d    S 1 s(w   Y  d S rQ   )rR   extra_action_out_convert_to_numpy)rI   
input_dictstate_batchesr4   action_dist
stats_dict)r   rC   rO   rP   rb     s   


$z7build_policy_class.<locals>.policy_cls.extra_action_outc                    s"    r
 | | j }|S | }|S rQ   )r<   rZ   )rI   
optimizers)r"   rC   rO   rP   rZ     s
   
z0build_policy_class.<locals>.policy_cls.optimizerc                    sT   |     r | |}n| j| |}| |W  d    S 1 s#w   Y  d S rQ   )rR   rC   extra_grad_inforc   )rI   train_batchrg   )r   rO   rP   ri     s   
$z6build_policy_class.<locals>.policy_cls.extra_grad_infoc                 S   s   | j dkr	t S t S Ntorch)r1   rl   no_gradr   )rI   rO   rO   rP   rR     s   
z7build_policy_class.<locals>.policy_cls._no_grad_contextc                 S   s   | j dkr	t|S |S rk   )r1   r   )rI   datarO   rO   rP   rc     s   
z8build_policy_class.<locals>.policy_cls._convert_to_numpy)NN)__name__
__module____qualname__rD   r   r   rT   rY   r\   r^   r`   rb   rZ   ri   rR   rc   __classcell__rO   )r'   r)   r(   r&   r-   r$   r%   r,   r   r    r!   r1   r/   r2   r*   r+   r"   rC   r   r   r#   )rX   rP   
policy_cls   s*    &Urs   c                     s   t di t fi | S )a  Creates a Torch|JAXPolicy cls based on settings of another one.

        Keyword Args:
            **overrides: The settings (passed into `build_torch_policy`) that
                should be different from the class that this method is called
                on.

        Returns:
            type: A new Torch|JAXPolicy sub-class.

        Examples:
        >> MySpecialDQNPolicyClass = DQNTorchPolicy.with_updates(
        ..    name="MySpecialDQNPolicyClass",
        ..    loss_function=[some_new_loss_function],
        .. )
        NrO   )build_policy_classr[   )	overrides)original_kwargsrO   rP   with_updates  s   z(build_policy_class.<locals>.with_updates)localscopyr   r   staticmethodrw   ro   rq   )r0   r1   r2   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   basers   rw   rO   )r'   r)   r(   r&   r-   r$   r%   r,   r   r    r!   r1   r/   r2   r*   r+   r"   rv   rC   r   r   r#   rP   rt       s    
S
< 5
rt   )4typingr   r   r   r   r   r   r   r	   	gymnasiumgymray.rllib.models.catalogr
   ray.rllib.models.modelv2r   (ray.rllib.models.torch.torch_action_distr   $ray.rllib.models.torch.torch_modelv2r   ray.rllib.policy.policyr   ray.rllib.policy.sample_batchr   ray.rllib.policy.torch_policyr   ray.rllib.utilsr   r   ray.rllib.utils.annotationsr   r   ray.rllib.utils.frameworkr   r   $ray.rllib.utils.metrics.learner_infor   ray.rllib.utils.numpyr   ray.rllib.utils.typingr   r   r   jaxrK   rl   strSpacespacestyper[   intrt   rO   rO   rO   rP   <module>   s4  ( 




#'(+.169>AGLRU
XYZ