o
    ۷i                     @   s\   d dl Zd dlZd dlZddlmZ ddlmZ ddlm	Z	 ddl
mZ G dd deZdS )	    N   )UNet1DModel)DiffusionPipeline)DDPMScheduler)randn_tensorc                       s`   e Zd ZdZdededef fddZdd Zd	d
 Zdd Z	dd Z
dd ZdddZ  ZS )ValueGuidedRLPipelineal  
    Pipeline for value-guided sampling from a diffusion model trained to predict sequences of states.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Parameters:
        value_function ([`UNet1DModel`]):
            A specialized UNet for fine-tuning trajectories base on reward.
        unet ([`UNet1DModel`]):
            UNet architecture to denoise the encoded trajectories.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded trajectories. Default for this
            application is [`DDPMScheduler`].
        env ():
            An environment following the OpenAI gym API to act in. For now only Hopper has pretrained models.
    value_functionunet	schedulerc                    s   t    | j||||d | | _i | _| j D ]}z| j|  | j|< W q   Y qi | _| j D ]}z| j| 	 | j|< W q7   Y q7|j
jd | _|jjd | _d S )N)r   r	   r
   envr   )super__init__register_modulesget_datasetdatameanskeysmeanstdsstdobservation_spaceshape	state_dimaction_space
action_dim)selfr   r	   r
   r   key	__class__ e/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/experimental/rl/value_guided_sampling.pyr   ,   s"   

zValueGuidedRLPipeline.__init__c                 C   s   || j |  | j|  S N)r   r   r   x_inr   r   r   r    	normalizeG      zValueGuidedRLPipeline.normalizec                 C   s   || j |  | j|  S r!   )r   r   r"   r   r   r    de_normalizeJ   r%   z"ValueGuidedRLPipeline.de_normalizec                    sJ   t |tr fdd| D S t|r| jjS tj| jjdS )Nc                    s   i | ]
\}}|  |qS r   )to_torch).0kvr   r   r    
<dictcomp>O   s    z2ValueGuidedRLPipeline.to_torch.<locals>.<dictcomp>device)	
isinstancedictitemstorch	is_tensortor	   r.   tensor)r   r#   r   r+   r    r'   M   s
   

zValueGuidedRLPipeline.to_torchc                 C   s0   |  D ]\}}| |d d ||d f< q|S r!   )r1   clone)r   r#   condact_dimr   valr   r   r    reset_x0T   s   zValueGuidedRLPipeline.reset_x0c              
   C   sV  |j d }d }t| jjD ]}tj|f|| jjtjd}t	|D ]\}	t
 6 |  | |ddd|j}tj| g|gd }
| j|}td| }||
 }
W d    n1 s`w   Y  d|
|dk < | }|||
  }| ||| j}q!| |ddd|jddd}| j|||d }| ||| j}| |}q||fS )Nr   )r.   dtype      g      ?prev_sample)r   tqdmr
   	timestepsr2   fullr	   r.   longrangeenable_gradrequires_grad_r   permutesampleautogradgradsum_get_varianceexpdetachr:   r   stepr'   )r   x
conditionsn_guide_stepsscale
batch_sizeyir@   _rI   posterior_variance	model_stdprev_xr   r   r    run_diffusionY   s,   


"z#ValueGuidedRLPipeline.run_diffusion@       r<   皙?c                 C   s   |  |d}|d  j|dd}d| |i}||| j| j f}t|| jjd}| ||| j}	| |	}	| 	|	|||\}	}
|
j
ddd }|	| }|d d d d d | jf }|   }| j|dd}|
d urod}ntjd|}||df }|S )	Nobservationsr   )axisr-   T)
descendingactions)r   )r$   repeatr'   r   r   r   r	   r.   r:   rZ   argsortsqueezerM   cpunumpyr&   nprandomrandint)r   obsrS   planning_horizonrQ   rR   rP   r   x1rO   rT   
sorted_idxsorted_valuesra   denorm_actionsselected_indexr   r   r    __call__z   s$   
zValueGuidedRLPipeline.__call__)r[   r\   r<   r]   )__name__
__module____qualname____doc__r   r   r   r$   r&   r'   r:   rZ   rq   __classcell__r   r   r   r    r      s    !r   )rf   rg   r2   r?   models.unets.unet_1dr   	pipelinesr   utils.dummy_pt_objectsr   utils.torch_utilsr   r   r   r   r   r    <module>   s   