o
    `۷i'B                  +   @   s  d dl mZmZmZmZmZmZmZ d dlZ	d dl
mZmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ d dlmZm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z&m'Z'm(Z( e" \Z)Z*Z+eddddddddddddddddddedddde,deeeee egee(ee( f f deedge&f  deeeegee,e(f f  deeee&gdf  deeede(ge'f  deeede'gdf  deeeee'gee,e(f f  deeegee,e(f f  deeegee,e(f f  deeee	j-e	j-e&gdf  deeee	j-e	j-e&gdf  deeee	j.j-e	j.j-e&gdf  d eeee	j-e	j-e&gdf  d!eeee	j.j-e	j.j-e&gef  d"eee(ee( gee(e(f f  d#eeeee(e(e(gee(e/ee( f f  d$eee/  d%eeege0f  d&ee f(d'd(Z1dS ))    )CallableDictListOptionalTupleTypeUnionN)DEPRECATED_VALUEdeprecation_warning)ModelV2)TFActionDistribution)eager_tf_policy)DynamicTFPolicy)Policy)SampleBatch)TFPolicy)
add_mixins
force_list)OldAPIStackoverride)try_import_tf)LEARNER_STATS_KEY)AlgorithmConfigDictModelGradients
TensorType)get_default_configpostprocess_fnstats_fnoptimizer_fncompute_gradients_fnapply_gradients_fngrad_stats_fnextra_action_out_fnextra_learn_fetches_fnvalidate_spacesbefore_initbefore_loss_init
after_init
make_modelaction_sampler_fnaction_distribution_fnmixinsget_batch_divisibility_reqobs_include_prev_action_rewardextra_action_fetches_fngradients_fnnameloss_fnr   r   r   ztf.keras.optimizers.Optimizerr   r    ztf.Operationr!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   returnc                   s   t   tt||tkrtddd |durtdddd |dur*tdd	dd G  	
fd
dd}fdd}fdd}t||_t||_| |_	| |_
|S )aS  Helper function for creating a dynamic tf policy at runtime.

    Functions will be run in this order to initialize the policy:
        1. Placeholder setup: postprocess_fn
        2. Loss init: loss_fn, stats_fn
        3. Optimizer init: optimizer_fn, gradients_fn, apply_gradients_fn,
                           grad_stats_fn

    This means that you can e.g., depend on any policy attributes created in
    the running of `loss_fn` in later functions such as `stats_fn`.

    In eager mode, the following functions will be run repeatedly on each
    eager execution: loss_fn, stats_fn, gradients_fn, apply_gradients_fn,
    and grad_stats_fn.

    This means that these functions should not define any variables internally,
    otherwise they will fail in eager mode execution. Variable should only
    be created in make_model (if defined).

    Args:
        name: Name of the policy (e.g., "PPOTFPolicy").
            loss_fn (Callable[[
                Policy, ModelV2, Type[TFActionDistribution], SampleBatch],
                Union[TensorType, List[TensorType]]]): Callable for calculating a
                loss tensor.
            get_default_config (Optional[Callable[[None], AlgorithmConfigDict]]):
                Optional callable that returns the default config to merge with any
                overrides. If None, uses only(!) the user-provided
                PartialAlgorithmConfigDict as dict for this Policy.
            postprocess_fn (Optional[Callable[[Policy, SampleBatch,
                Optional[Dict[AgentID, SampleBatch]], Episode], None]]):
                Optional callable for post-processing experience batches (called
                after the parent class' `postprocess_trajectory` method).
            stats_fn (Optional[Callable[[Policy, SampleBatch],
                Dict[str, TensorType]]]): Optional callable that returns a dict of
                TF tensors to fetch given the policy and batch input tensors. If
                None, will not compute any stats.
            optimizer_fn (Optional[Callable[[Policy, AlgorithmConfigDict],
                "tf.keras.optimizers.Optimizer"]]): Optional callable that returns
                a tf.Optimizer given the policy and config. If None, will call
                the base class' `optimizer()` method instead (which returns a
                tf1.train.AdamOptimizer).
            compute_gradients_fn (Optional[Callable[[Policy,
                "tf.keras.optimizers.Optimizer", TensorType], ModelGradients]]):
                Optional callable that returns a list of gradients. If None,
                this defaults to optimizer.compute_gradients([loss]).
            apply_gradients_fn (Optional[Callable[[Policy,
                "tf.keras.optimizers.Optimizer", ModelGradients],
                "tf.Operation"]]): Optional callable that returns an apply
                gradients op given policy, tf-optimizer, and grads_and_vars. If
                None, will call the base class' `build_apply_op()` method instead.
            grad_stats_fn (Optional[Callable[[Policy, SampleBatch, ModelGradients],
                Dict[str, TensorType]]]): Optional callable that returns a dict of
                TF fetches given the policy, batch input, and gradient tensors. If
                None, will not collect any gradient stats.
            extra_action_out_fn (Optional[Callable[[Policy],
                Dict[str, TensorType]]]): Optional callable that returns
                a dict of TF fetches given the policy object. If None, will not
                perform any extra fetches.
            extra_learn_fetches_fn (Optional[Callable[[Policy],
                Dict[str, TensorType]]]): Optional callable that returns a dict of
                extra values to fetch and return when learning on a batch. If None,
                will call the base class' `extra_compute_grad_fetches()` method
                instead.
            validate_spaces (Optional[Callable[[Policy, gym.Space, gym.Space,
                AlgorithmConfigDict], None]]): Optional callable that takes the
                Policy, observation_space, action_space, and config to check
                the spaces for correctness. If None, no spaces checking will be
                done.
            before_init (Optional[Callable[[Policy, gym.Space, gym.Space,
                AlgorithmConfigDict], None]]): Optional callable to run at the
                beginning of policy init that takes the same arguments as the
                policy constructor. If None, this step will be skipped.
            before_loss_init (Optional[Callable[[Policy, gym.spaces.Space,
                gym.spaces.Space, AlgorithmConfigDict], None]]): Optional callable to
                run prior to loss init. If None, this step will be skipped.
            after_init (Optional[Callable[[Policy, gym.Space, gym.Space,
                AlgorithmConfigDict], None]]): Optional callable to run at the end of
                policy init. If None, this step will be skipped.
            make_model (Optional[Callable[[Policy, gym.spaces.Space,
                gym.spaces.Space, AlgorithmConfigDict], ModelV2]]): Optional callable
                that returns a ModelV2 object.
                All policy variables should be created in this function. If None,
                a default ModelV2 object will be created.
            action_sampler_fn (Optional[Callable[[TensorType, List[TensorType]],
                Tuple[TensorType, TensorType]]]): A callable returning a sampled
                action and its log-likelihood given observation and state inputs.
                If None, will either use `action_distribution_fn` or
                compute actions by calling self.model, then sampling from the
                so parameterized action distribution.
            action_distribution_fn (Optional[Callable[[Policy, ModelV2, TensorType,
                TensorType, TensorType],
                Tuple[TensorType, type, List[TensorType]]]]): Optional callable
                returning distribution inputs (parameters), a dist-class to
                generate an action distribution object from, and internal-state
                outputs (or an empty list if not applicable). If None, will either
                use `action_sampler_fn` or compute actions by calling self.model,
                then sampling from the so parameterized action distribution.
            mixins (Optional[List[type]]): Optional list of any class mixins for
                the returned policy class. These mixins will be applied in order
                and will have higher precedence than the DynamicTFPolicy class.
            get_batch_divisibility_req (Optional[Callable[[Policy], int]]):
                Optional callable that returns the divisibility requirement for
                sample batches. If None, will assume a value of 1.

    Returns:
        Type[DynamicTFPolicy]: A child class of DynamicTFPolicy based on the
            specified args.
    r-   T)olderrorNr.   r"   )r3   newr4   r/   r   c                       s   e Zd Z		d 
fdd	Zee	dfdd	ZeefddZeefdd	Z	eefd
dZ
eefddZee	fddZdS )z#build_tf_policy.<locals>.policy_clsNc                    st   r	| ||| r| ||| fdd}t j| |||
|	 ||d r5| ||| d| _d S )Nc                    sR    r	 | ||| d u s| j ri }n| }t| dr$| j| d S || _d S )N_extra_action_fetches)	_is_towerhasattrr6   update)policy	obs_spaceaction_spaceconfigextra_action_fetches)r&   r"    Y/home/ubuntu/vllm_env/lib/python3.10/site-packages/ray/rllib/policy/tf_policy_template.pybefore_loss_init_wrapper   s   

zNbuild_tf_policy.<locals>.policy_cls.__init__.<locals>.before_loss_init_wrapper)r;   r<   r=   r1   r   r!   r&   r(   r)   r*   existing_inputsexisting_modelr,   r   )r   __init__global_timestep)selfr;   r<   r=   rC   rB   rA   )r*   r)   r'   r%   r&   r"   r,   r!   r1   r(   r   r$   r?   r@   rD      s0   
z,build_tf_policy.<locals>.policy_cls.__init__c                    s"   t | |} r | |||S |S N)r   postprocess_trajectory)rF   sample_batchother_agent_batchesepisode)r   r?   r@   rH   
  s   z:build_tf_policy.<locals>.policy_cls.postprocess_trajectoryc                    sT   r	| | j }n | }t|}| jr| j|}|sd S | j d r&|S |d S N%_tf_policy_handles_more_than_one_lossr   )r=   	optimizerr   explorationget_exploration_optimizer)rF   
optimizers)baser   r?   r@   rN     s   

z-build_tf_policy.<locals>.policy_cls.optimizerc                    sL   t |}t |}r| jd r| ||S | |d |d S  | ||S rL   )r   r=   	gradients)rF   rN   lossrQ   losses)rR   r   r?   r@   rS   )  s   
z-build_tf_policy.<locals>.policy_cls.gradientsc                    s    r | ||S  | ||S rG   )build_apply_op)rF   rN   grads_and_vars)r    rR   r?   r@   rV   9  s   z2build_tf_policy.<locals>.policy_cls.build_apply_opc                    s   t  | fi | jS rG   )dictextra_compute_action_fetchesr6   rF   )rR   r?   r@   rY   @  s
   z@build_tf_policy.<locals>.policy_cls.extra_compute_action_fetchesc                    s&   rt ti ifi | S  | S rG   )rX   r   extra_compute_grad_fetchesrZ   )rR   r#   r?   r@   r[   F  s   
z>build_tf_policy.<locals>.policy_cls.extra_compute_grad_fetches)NN)__name__
__module____qualname__rD   r   r   rH   r   rN   rS   rV   rY   r[   r?   )r*   r)   r'   r    rR   r%   r&   r   r"   r#   r,   r!   r1   r(   r   r   r   r$   r?   r@   
policy_cls   s"    $3	r_   c                     s   t di t fi | S )a  Allows creating a TFPolicy cls based on settings of another one.

        Keyword Args:
            **overrides: The settings (passed into `build_tf_policy`) that
                should be different from the class that this method is called
                on.

        Returns:
            type: A new TFPolicy sub-class.

        Examples:
        >> MySpecialDQNPolicyClass = DQNTFPolicy.with_updates(
        ..    name="MySpecialDQNPolicyClass",
        ..    loss_function=[some_new_loss_function],
        .. )
        Nr?   )build_tf_policyrX   )	overridesoriginal_kwargsr?   r@   with_updatesT  s   z%build_tf_policy.<locals>.with_updatesc                      s   t jdi  S )Nr?   )r   _build_eager_tf_policyr?   rb   r?   r@   as_eagerg  s   z!build_tf_policy.<locals>.as_eager)localscopyr   r   r	   r
   staticmethodrd   rf   r\   r^   )r0   r1   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r_   rd   rf   r?   )r*   r)   r'   r    rR   r%   r&   r   r"   r#   r,   r!   r1   r(   r   rc   r   r   r$   r@   r`      s&    
,
6~

r`   )2typingr   r   r   r   r   r   r   	gymnasiumgymray._common.deprecationr	   r
   ray.rllib.models.modelv2r   "ray.rllib.models.tf.tf_action_distr   ray.rllib.policyr   "ray.rllib.policy.dynamic_tf_policyr   ray.rllib.policy.policyr   ray.rllib.policy.sample_batchr   ray.rllib.policy.tf_policyr   ray.rllib.utilsr   r   ray.rllib.utils.annotationsr   r   ray.rllib.utils.frameworkr   $ray.rllib.utils.metrics.learner_infor   ray.rllib.utils.typingr   r   r   tf1tftfvstrSpacespacestypeintr`   r?   r?   r?   r@   <module>   s   $ 	
 %(-0
67<