o
    ̳iD                     @   sF   d dl mZmZ d dlZd dlmZ d dlmZ G dd dejZ	dS )    )OptionalTupleN)rlhfc                       s   e Zd ZdZ			ddededef fddZ				dd
ejdejdejdejdejdejdeej deej de	ejejejejejf fddZ
  ZS )PPOLossa  
    Proximal Policy Optimization (PPO) Loss module.
    This implementation uses the following references:

    https://arxiv.org/abs/1707.06347 eqn. 7

    https://github.com/vwxyzjn/lm-human-preference-details/blob/ccc19538e817e98a60d3253242ac15e2a562cb49/lm_human_preference_details/train_policy_accelerate.py#L719

    https://github.com/openai/baselines/blob/ea25b9e8b234e6ee1bca43083f8f3cf974143998/baselines/ppo2/model.py#L68-L75


    Args:
        epsilon (float): clipping range for PPO update.
        value_clip_range (float): clipping range for value function update.
        value_coeff (float): coefficient for the value function loss contribution.
    皙?皙?epsilonvalue_clip_rangevalue_coeffc                    s    t    || _|| _|| _d S )N)super__init__r   r	   r
   )selfr   r	   r
   	__class__ K/home/ubuntu/.local/lib/python3.10/site-packages/torchtune/rlhf/loss/ppo.pyr       s   

zPPOLoss.__init__Npi_old_logprobspi_logprobs
advantagesphi_old_values
phi_valuesreturnspadding_masksvalue_padding_masksreturnc	                 C   s  t || }	t |	d| j d| j }
| |
 }| |	 }||k|j}|du r.| nt||}t 	||}|du rB| nt||}t ||| j
 || j
 }t 	|| d || d }|du rmd|  ndt|| }||| j  }|| | |	  | fS )a  

        Forward pass of the PPO loss module.

        Args:
            pi_old_logprobs (torch.Tensor): Log probabilities of the old policy.
            pi_logprobs (torch.Tensor): Log probabilities of the current policy.
            advantages (torch.Tensor): Advantage values.
            phi_old_values (torch.Tensor): Value predictions of the old value function.
            phi_values (torch.Tensor): Value predictions of the current value function.
            returns (torch.Tensor): Return values.
            padding_masks (Optional[torch.Tensor]): Padding token masks of the same shape as ``pi_logprobs``,
                where True indicates the corresponding loss values should participage in policy loss calculation.
            value_padding_masks (Optional[torch.Tensor]): Padding token masks of the same shape as ``pi_logprobs``,
                where True indicates the corresponding loss values should participage in value loss calculation.

        Returns:
            Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: A tuple of five tensors:
                - loss: The total PPO loss.
                - policy_loss: The policy function loss.
                - value_loss: The value function loss.
                - ratios: The ratio between the current and old policy probabilities.
                - clipfrac: The fraction of ratios that were clipped.

        g      ?N   g      ?)torchexpclampr   todtypemeanr   masked_meanmaximumr	   r
   detach)r   r   r   r   r   r   r   r   r   ratiosclipped_ratiospolicy_losses_clippedpolicy_losses_unclippedclipfracpolicy_lossvalues_clipped
value_losslossr   r   r   forward+   sF   $




zPPOLoss.forward)r   r   r   )NN)__name__
__module____qualname____doc__floatr   r   Tensorr   r   r.   __classcell__r   r   r   r   r      sB    	
r   )
typingr   r   r   torch.nnnn	torchtuner   Moduler   r   r   r   r   <module>   s
   