o
    $ix                     @   s  d dl Z d dlZd dlZd dlmZmZmZmZmZ d dl	Z
d dlZd dlZd dlmZmZ d dlmZ d dlmZ d dlmZmZmZ d dlmZ d dlmZ d d	lmZmZm Z m!Z!m"Z" ertd d
l#m$Z$m%Z% d dl&m'Z' d dl(m)Z) e *e+Z,e \Z-Z.dZ/dZ0e-re1dZ2ne3dZ2edddede"dee4e"f fddZ5eddddddee6 de4de"fddZ7ed d!de"fd"d#Z8eded$ dee4e"f fd%d&Z9e		'	'	dmd(ee4 d)e:d*e:d+eed,  fd-d.Z;ednd/e!d(ee4 fd0d1Z<ed2e"d3e"de"fd4d5Z=e		'dod6e!d7ee  d8e:de"fd9d:Z>ed;ee" de"fd<d=Z?edpd/e"d?e6de"fd@dAZ@ed/e"de"fdBdCZAed/e"dDe
jBde"fdEdFZCednd/e"dGeeD de"fdHdIZEe			'dqdJe"dKeeD dLe:de"fdMdNZFedOedPedQe6ddfdRdSZGedddTe"ddfdUdVZHedndWeeD ddfdXdYZIedZe"d[e"de"fd\d]ZJedrd_d`ZKedsdadbZLe	c	d	e	dtdd^dfeDdge6dhe6d(ee4 f
didjZMdkdl ZNdS )u    N)TYPE_CHECKINGDictListOptionalUnion)DiscreteMultiDiscrete)version)RepeatedValues)DeveloperAPIOldAPIStack	PublicAPI)try_import_torch)SMALL_NUMBER)LocalOptimizerNetworkTypeSpaceStructTensorStructType
TensorType)	ParamDict	ParamList)TorchPolicy)TorchPolicyV2gߌ3gߌ3Gz2.0.0zFtorch is not installed. TORCH_COMPILE_REQUIRED_VERSION is not defined.policyr   	optimizerlossreturnc           	      C   s   d}| j d dur| j d }ntj}d}|jD ].}ttdd |d }|rAtj||}t	|t
jr9|  }|t||7 }q|d7 }q|t|jkrOi S d|iS )	aU  Applies gradient clipping to already computed grads inside `optimizer`.

    Note: This function does NOT perform an analogous operation as
    tf.clip_by_global_norm. It merely clips by norm (per gradient tensor) and
    then computes the global norm across all given tensors (but without clipping
    by that global norm).

    Args:
        policy: The TorchPolicy, which calculated `loss`.
        optimizer: A local torch optimizer object.
        loss: The torch loss tensor.

    Returns:
        An info dict containing the "grad_norm" key and the resulting clipped
        gradients.
    r   	grad_clipNc                 S   s
   | j d uS N)grad)p r!   X/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/ray/rllib/utils/torch_utils.py<lambda>K   s   
 z%apply_grad_clipping.<locals>.<lambda>params   
grad_gnorm)confignpinfparam_groupslistfilternnutilsclip_grad_norm_
isinstancetorchTensorcpunumpyminlen)	r   r   r   r&   
clip_valuenum_none_gradsparam_groupr$   global_normr!   r!   r"   apply_grad_clipping-   s    

r;   value)r   grad_clip_bygradients_dictr   r   r=   c                C   s&  |du rdS |dvrt d| d|dkr1|  D ]\}}|du r$dnt|| || |< qdS |dkrY|  D ]\}}|durV|djdd	d
}||krV|||  q9dS t|  }t	|}t
|dkrk|S |tj|d |d }tj|dd}	|D ]}
|
dur|
 |	|
j q|S )a?  Performs gradient clipping on a grad-dict based on a clip value and clip mode.

    Changes the provided gradient dict in place.

    Args:
        gradients_dict: The gradients dict, mapping str to gradient tensors.
        grad_clip: The value to clip with. The way gradients are clipped is defined
            by the `grad_clip_by` arg (see below).
        grad_clip_by: One of 'value', 'norm', or 'global_norm'.

    Returns:
        If `grad_clip_by`="global_norm" and `grad_clip` is not None, returns the global
        norm of all tensors, otherwise returns None.
    N)r<   normr:   z`grad_clip_by` (z*) must be one of [value|norm|global_norm]!r<   r?          e    eAneginfposinfr   gư>)r5         ?)max)
ValueErroritemsr1   clipr?   
nan_to_nummul_r+   valuescompute_global_normr6   clampdetachtodevice)r>   r   r=   kvr?   gradients_list
total_norm
clip_coeffclip_coeff_clampedgr!   r!   r"   clip_gradients`   s<   

rZ   rU   r   c                    sH   d t | dkrtdS tt fdd| D  jddd}|S )	zComputes the global norm for a gradients dict.

    Args:
        gradients_list: The gradients list containing parameters.

    Returns:
        Returns the global norm of all tensors in `gradients_list`.
           @r           c                    s.   g | ]}|d urt |  jdddqS )NrA   rB   rC   )r1   r?   rP   rK   ).0rY   	norm_typer!   r"   
<listcomp>   s    z'compute_global_norm.<locals>.<listcomp>rA   rB   rC   )r6   r1   tensorr?   stackrK   )rU   rV   r!   r^   r"   rN      s   

rN   )r   r   c                    s4   t j fdd jD dd}| _|t |dS )az  Concatenates multi-GPU (per-tower) TD error tensors given TorchPolicy.

    TD-errors are extracted from the TorchPolicy via its tower_stats property.

    Args:
        policy: The TorchPolicy to extract the TD-error values from.

    Returns:
        A dict mapping strings "td_error" and "mean_td_error" to the
        corresponding concatenated and mean-reduced values.
    c                    s*   g | ]}|j d tdg jqS )td_errorr\   )tower_statsgetr1   ra   rQ   rR   r]   tr   r!   r"   r`      s    z.concat_multi_gpu_td_errors.<locals>.<listcomp>r   dim)rc   mean_td_error)r1   catmodel_gpu_towersrc   mean)r   rc   r!   rh   r"   concat_multi_gpu_td_errors   s   
ro   FrR   
pin_memory
use_streamstream)ztorch.cuda.Streamztorch.cuda.classes.Streamc                    s    dur	t  nt d  jdkot j rE|r=dur7tt jjt jjjfs6J dt dnt j n
t jj dnd fddt	
| S )	aj  
    Converts any (possibly nested) structure to torch.Tensors.

    Args:
        x: The input structure whose leaves will be converted.
        device: The device to create the tensor on (e.g. "cuda:0" or "cpu").
        pin_memory: If True, calls `pin_memory()` on the created tensors.
        use_stream: If True, uses a separate CUDA stream for `Tensor.to()`.
        stream: An optional CUDA stream for the host-to-device copy in `Tensor.to()`.

    Returns:
        A new structure with the same layout as `x` but with all leaves converted
        to torch.Tensors. Leaves that are None are left unchanged.
    Nr3   cudaz-`stream` must be a torch.cuda.Stream but got .rR   c                    sb  | d u r| S t | trtt| j| j| jS t| r | }nEt | t	j
r]| jtks2| jjt	ju r4| S | jjsWt  td t| }W d    n1 sQw   Y  nt| }ntt	| }| rs|jtjkrs| }r{r{| }rd urtj |j dd}W d    |S 1 sw   Y  |S |j dd}|S | }|S )NignoreT)non_blocking)r0   r
   treemap_structurerM   lengthsmax_lenr1   	is_tensorr(   ndarraydtypeobjecttypestr_flags	writeablewarningscatch_warningssimplefilter
from_numpyasarrayis_floating_pointfloat16floatrp   rs   rr   rQ   )itemra   rR   is_cudamappingrp   rr   r!   r"   r     sJ   





z(convert_to_torch_tensor.<locals>.mapping)r1   rR   r   rs   is_availabler0   Streamclassesdefault_streamrx   ry   )xrR   rp   rq   rr   r!   r   r"   convert_to_torch_tensor   s   4r   r   c                    s    fdd}t || S )a  Creates a copy of `x` and makes deep copies torch.Tensors in x.

    Also moves the copied tensors to the specified device (if not None).

    Note if an object in x is not a torch.Tensor, it will be shallow-copied.

    Args:
        x : Any (possibly nested) struct possibly containing torch.Tensors.
        device : The device to move the tensors to.

    Returns:
        Any: A new struct with the same structure as `x`, but with all
            torch.Tensors deep-copied and moved to the specified device.

    c                    s4   t | tjr d u rt|  S |   S | S r   )r0   r1   r2   clonerP   rQ   )r   ru   r!   r"   r   X  s   z#copy_torch_tensors.<locals>.mapping)rx   ry   )r   rR   r   r!   ru   r"   copy_torch_tensorsF  s   
r   ypredc                 C   s\   |   }tj|dd}tj||   dd}tdg|j}t|d||t   d S )a,  Computes the explained variance for a pair of labels and predictions.

    The formula used is:
    max(-1.0, 1.0 - (std(y - pred)^2 / std(y)^2))

    Args:
        y: The labels.
        pred: The predictions.

    Returns:
        The explained variance given a pair of labels and predictions.
    r   ri   g      r%   )squeezer1   varra   rQ   rR   rG   r   )r   r   
squeezed_yy_vardiff_varmin_r!   r!   r"   explained_variancee  s
   r   inputsspaces_struct	time_axisc                 C   s8  t | }|durt |ndgt| }d}d}g }t||D ]g\}}	|du r4|jd }|r4|jd }t|	trO|rDt||| g}|	t
||	  q t|	trk|r`t||| dg}|	t
||	  q |rxt||| dg}nt||dg}|	|  q tj|dd}
|rt|
||dg}
|
S )ao	  Flattens arbitrary input structs according to the given spaces struct.

    Returns a single 1D tensor resulting from the different input
    components' values.

    Thereby:
    - Boxes (any shape) get flattened to (B, [T]?, -1). Note that image boxes
    are not treated differently from other types of Boxes and get
    flattened as well.
    - Discrete (int) values are one-hot'd, e.g. a batch of [1, 0, 3] (B=3 with
    Discrete(4) space) results in [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1]].
    - MultiDiscrete values are multi-one-hot'd, e.g. a batch of
    [[0, 2], [1, 4]] (B=2 with MultiDiscrete([2, 5]) space) results in
    [[1, 0,  0, 0, 1, 0, 0], [0, 1,  0, 0, 0, 0, 1]].

    Args:
        inputs: The inputs to be flattened.
        spaces_struct: The structure of the spaces that behind the input
        time_axis: Whether all inputs have a time-axis (after the batch axis).
            If True, will keep not only the batch axis (0th), but the time axis
            (1st) as-is and flatten everything from the 2nd axis up.

    Returns:
        A single 1D tensor resulting from concatenating all
        flattened/one-hot'd input components. Depending on the time_axis flag,
        the shape is (B, n) or (B, T, n).

    .. testcode::

        from gymnasium.spaces import Discrete, Box
        from ray.rllib.utils.torch_utils import flatten_inputs_to_1d_tensor
        import torch
        struct = {
            "a": np.array([1, 3]),
            "b": (
                np.array([[1.0, 2.0], [4.0, 5.0]]),
                np.array(
                    [[[8.0], [7.0]], [[5.0], [4.0]]]
                ),
            ),
                "c": {
                    "cb": np.array([1.0, 2.0]),
                },
        }
        struct_torch = tree.map_structure(lambda s: torch.from_numpy(s), struct)
        spaces = dict(
            {
                "a": gym.spaces.Discrete(4),
                "b": (gym.spaces.Box(-1.0, 10.0, (2,)), gym.spaces.Box(-1.0, 1.0, (2,
                        1))),
                "c": dict(
                    {
                        "cb": gym.spaces.Box(-1.0, 1.0, ()),
                    }
                ),
            }
        )
        print(flatten_inputs_to_1d_tensor(struct_torch, spaces_struct=spaces))

    .. testoutput::

        tensor([[0., 1., 0., 0., 1., 2., 8., 7., 1.],
                [0., 0., 0., 1., 4., 5., 5., 4., 2.]])

    Nr   r%   ri   )rx   flattenr6   zipshaper0   r   r1   reshapeappendone_hotr   r   rl   )r   r   r   flat_inputsflat_spacesBToutinput_spacemergedr!   r!   r"   flatten_inputs_to_1d_tensorz  s:   
H




r   tensorsc                 C   s(   dd | D }t tdd |D dS )aN  Returns the global L2 norm over a list of tensors.

    output = sqrt(SUM(t ** 2 for t in tensors)),
        where SUM reduces over all tensors and over all elements in tensors.

    Args:
        tensors: The list of tensors to calculate the global norm over.

    Returns:
        The global L2 norm over the given tensor list.
    c              
   S   s&   g | ]}t t t |d dqS )r[         ?r1   powsumrf   r!   r!   r"   r`     s   & zglobal_norm.<locals>.<listcomp>c                 s   s    | ]	}t |d V  qdS )r[   N)r1   r   )r]   l2r!   r!   r"   	<genexpr>  s    zglobal_norm.<locals>.<genexpr>r   r   )r   
single_l2sr!   r!   r"   r:     s   r:   rF   deltac                 C   s6   t t | |k t | dd |t | d|   S )a  Computes the huber loss for a given term and delta parameter.

    Reference: https://en.wikipedia.org/wiki/Huber_loss
    Note that the factor of 0.5 is implicitly included in the calculation.

    Formula:
        L = 0.5 * x^2  for small abs x (delta threshold)
        L = delta * (abs(x) - 0.5*delta)  for larger abs x (delta threshold)

    Args:
        x: The input term, e.g. a TD error.
        delta: The delta parmameter in the above formula.

    Returns:
        The Huber loss resulting from `x` and `delta`.
    r[   r   )r1   whereabsr   )r   r   r!   r!   r"   
huber_loss   s
   r   c                 C   s   dt t | d S )zComputes half the L2 norm over a tensor's values without the sqrt.

    output = 0.5 * sum(x ** 2)

    Args:
        x: The input tensor.

    Returns:
        0.5 times the L2 norm over the given tensor's values (w/o sqrt).
    r   r[   )r1   r   r   r   r!   r!   r"   l2_loss  s   r   r   c                    s   t |trtj  |jS t |tr@t |jd t	j
r-t	|j}  jd d n|j}tj fddt|D ddS td|)a  Returns a one-hot tensor, given and int tensor and a space.

    Handles the MultiDiscrete case as well.

    Args:
        x: The input tensor.
        space: The space to use for generating the one-hot tensor.

    Returns:
        The resulting one-hot tensor.

    Raises:
        ValueError: If the given space is not a discrete one.

    .. testcode::

        import torch
        import gymnasium as gym
        from ray.rllib.utils.torch_utils import one_hot
        x = torch.IntTensor([0, 3])  # batch-dim=2
        # Discrete space with 4 (one-hot) slots per batch item.
        s = gym.spaces.Discrete(4)
        print(one_hot(x, s))
        x = torch.IntTensor([[0, 1, 2, 3]])  # batch-dim=1
        # MultiDiscrete space with 5 + 4 + 4 + 7 = 20 (one-hot) slots
        # per batch item.
        s = gym.spaces.MultiDiscrete([5, 4, 4, 7])
        print(one_hot(x, s))

    .. testoutput::

        tensor([[1, 0, 0, 0],
                [0, 0, 0, 1]])
        tensor([[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0]])
    r   r   c                    s.   g | ]\}}t j d d |f  |qS r   )r-   
functionalr   long)r]   inr   r!   r"   r`   V  s   . zone_hot.<locals>.<listcomp>ri   z#Unsupported space for `one_hot`: {})r0   r   r-   r   r   r   r   r   nvecr(   r}   ravelr   r   r1   rl   	enumeraterH   format)r   r   r   r!   r   r"   r   (  s   
%
r   axisc                 C   s@   t | td}t || t | }t ||t | | S )zSame as torch.mean() but ignores -inf values.

    Args:
        x: The input tensor to reduce mean over.
        axis: The axis over which to reduce. None for all axes.

    Returns:
        The mean reduced inputs, ignoring inf values.
    z-inf)r1   ner   r   
zeros_liker   )r   r   maskx_zeroedr!   r!   r"   reduce_mean_ignore_inf]  s   r   rz   maxlen
time_majorc                 C   sh   |du r|   }tt| jt|f }|| jjdd	 | k }|s*|	 }|
|p0tj |S )al  Offers same behavior as tf.sequence_mask for torch.

    Thanks to Dimitris Papatheodorou
    (https://discuss.pytorch.org/t/pytorch-equivalent-for-tf-sequence-mask/
    39036).

    Args:
        lengths: The tensor of individual lengths to mask by.
        maxlen: The maximum length to use for the time axis. If None, use
            the max of `lengths`.
        dtype: The torch dtype to use for the resulting mask.
        time_major: Whether to return the mask as [B, T] (False; default) or
            as [T, B] (True).

    Returns:
         The sequence mask resulting from the given input and parameters.
    Nr%   ri   )rG   r1   onestupler   intrQ   rR   cumsumrg   r   bool)rz   r   r~   r   r   r!   r!   r"   sequence_maskm  s   r   main_net
target_nettauc                    s2   |     fdd|   D }|| dS )a  Updates a torch.nn.Module target network using Polyak averaging.

    .. code-block:: text

        new_target_net_weight = (
            tau * main_net_weight + (1.0 - tau) * current_target_net_weight
        )

    Args:
        main_net: The nn.Module to update from.
        target_net: The target network to update.
        tau: The tau value to use in the Polyak averaging formula.
    c                    s*   i | ]\}}| |  d  |  qS )r%   r!   )r]   rS   rT   
state_dictr   r!   r"   
<dictcomp>  s    z)update_target_network.<locals>.<dictcomp>N)r   rI   load_state_dict)r   r   r   new_state_dictr!   r   r"   update_target_network  s
   
r   kl_divergencec                 C   s&   |   r| rtd d S d S d S )Na}  KL divergence is non-finite, this will likely destabilize your model and the training process. Action(s) in a specific state have near-zero probability. This can happen naturally in deterministic environments where the optimal policy has zero mass for a specific action. To fix this issue, consider setting the coefficient for the KL loss term to zero or increasing policy entropy.)loss_initializedisinfloggerwarning)r   r   r!   r!   r"   warn_if_infinite_kl_divergence  s
   r   seedc                 C   s   | durOt rQt |  t jj}|dur-tt jjdkr-dtjd< t j|  t j|  ntt j	tdkr>t 
d nt d dt jj_dt jj_dS dS dS )ztSets the torch random seed to the given value.

    Args:
        seed: The seed to use or None for no seeding.
    Ngffffff$@z:4096:8CUBLAS_WORKSPACE_CONFIGz1.8.0TF)r1   manual_seedr	   rs   r   osenvironmanual_seed_allVersion__version__use_deterministic_algorithmsset_deterministicbackendscudnndeterministic	benchmark)r   cuda_versionr!   r!   r"   set_torch_seed  s   



r   logitslabelsc                 C   s   t | tj| d dS )zSame behavior as tf.nn.softmax_cross_entropy_with_logits.

    Args:
        x: The input predictions.
        labels: The labels corresponding to `x`.

    Returns:
        The resulting softmax cross-entropy given predictions and labels.
    r   )r1   r   r-   r   log_softmax)r   r   r!   r!   r"   !softmax_cross_entropy_with_logits  s   r   torch.Tensorc                 C   s   t | t t | d  S )zThe symlog function as described in [1]:

    [1] Mastering Diverse Domains through World Models - 2023
    D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap
    https://arxiv.org/pdf/2301.04104v1.pdf
    r%   )r1   signlogr   r   r!   r!   r"   symlog  s   r   c                 C   s   t | t t | d  S )zInverse of the `symlog` function as desribed in [1]:

    [1] Mastering Diverse Domains through World Models - 2023
    D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap
    https://arxiv.org/pdf/2301.04104v1.pdf
    r%   )r1   r   expr   )r   r!   r!   r"   inverse_symlog  s   r            4      4@num_bucketslower_boundupper_boundc                 C   sD  t | ||} t jd| jd |d }|| |d  }| |  | }t |}t |}	t ||	|	d |	}	t |	||	d |	}	|||  }
||	|  }| | |
|  }d| }t j	||gdd}t j	||	gdd}t j
||gdd }t j
||gdd}t j| jd ||d}|||dddf |dddf f< |S )	a+  Returns a two-hot vector of dim=num_buckets with two entries that are non-zero.

    See [1] for more details:
    [1] Mastering Diverse Domains through World Models - 2023
    D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap
    https://arxiv.org/pdf/2301.04104v1.pdf

    Entries in the vector represent equally sized buckets within some fixed range
    (`lower_bound` to `upper_bound`).
    Those entries not 0.0 at positions k and k+1 encode the actual `value` and sum
    up to 1.0. They are the weights multiplied by the buckets values at k and k+1 for
    retrieving `value`.

    Example:
        num_buckets=11
        lower_bound=-5
        upper_bound=5
        value=2.5
        -> [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0]
        -> [-5   -4   -3   -2   -1   0    1    2    3    4    5] (0.5*2 + 0.5*3=2.5)

    Example:
        num_buckets=5
        lower_bound=-1
        upper_bound=1
        value=0.1
        -> [0.0, 0.0, 0.8, 0.2, 0.0]
        -> [-1  -0.5   0   0.5   1] (0.2*0.5 + 0.8*0=0.1)

    Args:
        value: The input tensor of shape (B,) to be two-hot encoded.
        num_buckets: The number of buckets to two-hot encode into.
        lower_bound: The lower bound value used for the encoding. If input values are
            lower than this boundary, they will be encoded as `lower_bound`.
        upper_bound: The upper bound value used for the encoding. If input values are
            higher than this boundary, they will be encoded as `upper_bound`.

    Returns:
        The two-hot encoded tensor of shape (B, num_buckets).
    r   ru   r%   rF   r[   r   ri   N)r1   rO   aranger   r   floorceilr   eqrb   rl   r   zeros)r<   r  r  r  rR   batch_indicesbucket_deltaidxrS   kp1values_k
values_kp1	weights_kweights_kp1	indices_kindices_kp1indicesupdatesoutputr!   r!   r"   two_hot  s&   1

$r  c                  C   s(   z	dd l m}  W dS  ty   Y dS w )Nr   TF)torch._dynamo_dynamoImportError)dynamor!   r!   r"   _dynamo_is_availablen  s   r  )NFFNr   )NF)rF   )NNF)r   r   r   r   )r   r   r   r   )r  r  r  N)Ologgingr   r   typingr   r   r   r   r   	gymnasiumgymr4   r(   rx   gymnasium.spacesr   r   	packagingr	    ray.rllib.models.repeated_valuesr
   ray.rllib.utils.annotationsr   r   r   ray.rllib.utils.frameworkr   ray.rllib.utils.numpyr   ray.rllib.utils.typingr   r   r   r   r   ray.rllib.core.learner.learnerr   r   ray.rllib.policy.torch_policyr    ray.rllib.policy.torch_policy_v2r   	getLogger__name__r   r1   r-   	FLOAT_MIN	FLOAT_MAXparseTORCH_COMPILE_REQUIRED_VERSIONrH   strr;   r   rZ   rN   ro   r   r   r   r   r   r:   r   r   Spacer   r   r   r   r   r   r   r   r   r   r  r  r!   r!   r!   r"   <module>   s>   


2C#

`r4(
X