o
    }oiA                     @  s   d Z ddlmZ ddlZddlmZ ddlmZ ddlZddl	m
Z
 ddlmZ ddlmZ er?dd	lmZ ddlZddlZndd
lmZ edZedZeeZd%ddZG dd dejjZG dd dZ		d&d'd#d$ZdS )(a  Notations in this Gaussian process implementation

X_train: Observed parameter values with the shape of (len(trials), len(params)).
y_train: Observed objective values with the shape of (len(trials), ).
x: (Possibly batched) parameter value(s) to evaluate with the shape of (..., len(params)).
cov_fX_fX: Kernel matrix X = V[f(X)] with the shape of (len(trials), len(trials)).
cov_fx_fX: Kernel matrix Cov[f(x), f(X)] with the shape of (..., len(trials)).
cov_fx_fx: Kernel scalar value x = V[f(x)]. This value is constant for the Matern 5/2 kernel.
cov_Y_Y_inv:
    The inverse of the covariance matrix (V[f(X) + noise_var])^-1 with the shape of
    (len(trials), len(trials)).
cov_Y_Y_inv_Y: `cov_Y_Y_inv @ y` with the shape of (len(trials), ).
max_Y: The maximum of Y (Note that we transform the objective values such that it is maximized.)
sqd: The squared differences of each dimension between two points.
is_categorical:
    A boolean array with the shape of (len(params), ). If is_categorical[i] is True, the i-th
    parameter is categorical.
    )annotationsN)Any)TYPE_CHECKING)*single_blas_thread_if_scipy_v1_15_or_newer)optuna_warn)
get_logger)Callable)_LazyImportscipytorchvalues
np.ndarrayreturnc                 C  s~   t | }t |r| S td t j|dd}t | t |t jt || t jdddt |t j	t || t j dddS )NzDClip non-finite values to the min/max finite values for GP fittings.r   )axis        )
npisfiniteallr   anyclipwheremininfmax)r   is_values_finiteis_any_finite r   A/home/ubuntu/.local/lib/python3.10/site-packages/optuna/_gp/gp.pywarn_and_convert_inf/   s   

"$r   c                   @  s(   e Zd ZedddZedd	d
ZdS )Matern52Kernelctxr   squared_distancetorch.Tensorr   c                 C  sL   t d| }t | }|d| | d  }d|d  | }| | |S )a  
        This method calculates `exp(-sqrt5d) * (1/3 * sqrt5d ** 2 + sqrt5d + 1)` where
        `sqrt5d = sqrt(5 * squared_distance)`.

        Please note that automatic differentiation by PyTorch does not work well at
        `squared_distance = 0` due to zero division, so we manually save the derivative, i.e.,
        `-5/6 * (1 + sqrt5d) * exp(-sqrt5d)`, for the exact derivative calculation.

        Notice that the derivative of this function is taken w.r.t. d**2, but not w.r.t. d.
           g?   g)r   sqrtexpsave_for_backward)r    r!   sqrt5dexp_partvalderivr   r   r   forward@   s   
zMatern52Kernel.forwardgradc                 C  s   | j \}|| S )z
        Let x be squared_distance, f(x) be forward(ctx, x), and g(f) be a provided function, then
        deriv := df/dx, grad := dg/df, and deriv * grad = df/dx * dg/df = dg/dx.
        )saved_tensors)r    r-   r+   r   r   r   backwardS   s   zMatern52Kernel.backwardN)r    r   r!   r"   r   r"   )r    r   r-   r"   r   r"   )__name__
__module____qualname__staticmethodr,   r/   r   r   r   r   r   ?   s
    r   c                   @  s\   e Zd Zd(d
dZed)ddZd*ddZ	d+d,ddZd-d.ddZd/ddZ	d0d&d'Z
dS )1GPRegressoris_categoricalr"   X_trainy_traininverse_squared_lengthscaleskernel_scale	noise_varr   Nonec                 C  s   || _ || _|| _|d|d  | _| j  r/| jd| j f dktj	| jd| j f< d | _
d | _|| _|| _|| _d S )N.r   )_is_categorical_X_train_y_train	unsqueezesquare__squared_X_diffr   typer   float64_cov_Y_Y_chol_cov_Y_Y_inv_Yr8   r9   r:   )selfr5   r6   r7   r8   r9   r:   r   r   r   __init__^   s   	

zGPRegressor.__init__r   c                 C  s   dt | j    S )Ng      ?)r   r%   r8   detachcpunumpy)rH   r   r   r   length_scalesv   s   zGPRegressor.length_scalesc                 C  s  | j d u r
| jd u sJ dt  |     }W d    n1 s'w   Y  |t	| j
jd   | j 7  < tj|}tjj|jtjj|| j  dddd}t|| _ t|| _| j | _d | j_| j | _d | j_| j | _d | j_d S )Nz(Cannot call cache_matrix more than once.r   T)lowerF)rF   rG   r   no_gradkernelrJ   rK   rL   r   diag_indicesr?   shaper:   itemlinalgcholeskyr
   solve_triangularTr@   
from_numpyr8   r-   r9   )rH   cov_Y_Ycov_Y_Y_cholcov_Y_Y_inv_Yr   r   r   _cache_matrixz   s*   
$zGPRegressor._cache_matrixNX1torch.Tensor | NoneX2c                 C  s   |du r|du s
J | j }n3|du r| j}|jdkr|| n	|d|d  }| j rA|d| jf dktj	|d| jf< |
| j}t|| j S )am  
        Return the kernel matrix with the shape of (..., n_A, n_B) given X1 and X2 each with the
        shapes of (..., n_A, len(params)) and (..., n_B, len(params)).

        If x1 and x2 have the shape of (len(params), ), kernel(x1, x2) is computed as:
            kernel_scale * Matern52Kernel.apply(
                sqd(x1, x2) @ inverse_squared_lengthscales
            )
        where if x1[i] is continuous, sqd(x1, x2)[i] = (x1[i] - x2[i]) ** 2 and if x1[i] is
        categorical, sqd(x1, x2)[i] = int(x1[i] != x2[i]).
        Note that the distance for categorical parameters is the Hamming distance.
        Nr$   r<   r=   .r   )rC   r?   ndimrA   rB   r>   r   rD   r   rE   matmulr8   r   applyr9   )rH   r]   r_   sqdsqdistr   r   r   rP      s   *
zGPRegressor.kernelFxjointbool!tuple[torch.Tensor, torch.Tensor]c           
      C  s  | j dur
| jdusJ d|jdk}|s|n|d}tj| | }| j}tjj| j tjj| j j	|dddddd}|rb|rFJ d| ||}||
|d	d
 }	|	jd
d	dd n| j}|tj|| }	|	d |r|d|	dfS ||	fS )a)  
        This method computes the posterior mean and variance given the points `x` where both mean
        and variance tensors will have the shape of x.shape[:-1].
        If ``joint=True``, the joint posterior will be computed.

        The posterior mean and variance are computed as:
            mean = cov_fx_fX @ inv(cov_fX_fX + noise_var * I) @ y, and
            var = cov_fx_fx - cov_fx_fX @ inv(cov_fX_fX + noise_var * I) @ cov_fx_fX.T.

        Please note that we clamp the variance to avoid negative values due to numerical errors.
        Nz+Call cache_matrix before calling posterior.r$   r   TF)upperleftz3Call posterior with joint=False for a single point.r<   )dim1dim2r   )rF   rG   r`   rA   r   rT   vecdotrP   rV   rW   ra   	transposediagonal
clamp_min_r9   squeeze)
rH   re   rf   is_single_pointx_	cov_fx_fXmeanV	cov_fx_fxvar_r   r   r   	posterior   s*   

 zGPRegressor.posteriorc                 C  s   | j jd }d| tdtj  }|  | jtj|tj	d  }tj
|}|    }tj
j|| jdddf dddddf }d||  }|| | S )a  
        This method computes the marginal log-likelihood of the kernel hyperparameters given the
        training dataset (X, y).
        Assume that N = len(X) in this method.

        Mathematically, the closed form is given as:
            -0.5 * log((2*pi)**N * det(C)) - 0.5 * y.T @ inv(C) @ y
            = -0.5 * log(det(C)) - 0.5 * y.T @ inv(C) @ y + const,
        where C = cov_Y_Y = cov_fX_fX + noise_var * I and inv(...) is the inverse operator.

        We exploit the full advantages of the Cholesky decomposition (C = L @ L.T) in this method:
            1. The determinant of a lower triangular matrix is the diagonal product, which can be
               computed with N flops where log(det(C)) = log(det(L.T @ L)) = 2 * log(det(L)).
            2. Solving linear system L @ u = y, which yields u = inv(L) @ y, costs N**2 flops.
        Note that given `u = inv(L) @ y` and `inv(C) = inv(L @ L.T) = inv(L).T @ inv(L)`,
        y.T @ inv(C) @ y is calculated as (inv(L) @ y) @ (inv(L) @ y).

        In principle, we could invert the matrix C first, but in this case, it costs:
            1. 1/3*N**3 flops for the determinant of inv(C).
            2. 2*N**2-N flops to solve C @ alpha = y, which is alpha = inv(C) @ y.

        Since the Cholesky decomposition costs 1/3*N**3 flops and the matrix inversion costs
        2/3*N**3 flops, the overall cost for the former is 1/3*N**3+N**2+N flops and that for the
        latter is N**3+2*N**2-N flops.
        r   g         dtypeNF)ri   )r?   rR   mathlogpirP   r:   r   eyerE   rT   rU   rp   sumrV   r@   )rH   n_pointsconstrY   Llogdet_partinv_L_y	quad_partr   r   r   marginal_log_likelihood   s   ,z#GPRegressor.marginal_log_likelihood	log_prior%Callable[[GPRegressor], torch.Tensor]minimum_noisefloatdeterministic_objectivegtolc           	   	     s&  j jd ttj   tj	
 tj
 d  gg}d fdd}t  tjj||d	d
d|id}W d    n1 sOw   Y  |js_td|j t|j}t|d  _t| _	 rtjtjdn
t|d   _  S )Nr$   gGz?
raw_paramsr   r   tuple[float, np.ndarray]c                   s   t | d}t  L t |d  _t | _ r)t jt jdn
t |d   _	
   }|  |jd  } rQ|dksQJ W d    n1 s[w   Y  | |j   fS )NTr|   r$   r   )r   rX   requires_grad_enable_gradr&   r8   r9   tensorrE   r:   r   r/   r-   rS   rJ   rK   rL   )r   raw_params_tensorlossraw_noise_var_gradr   r   r   n_paramsrH   r   r   	loss_func  s   
z1GPRegressor._fit_kernel_params.<locals>.loss_funcTzl-bfgs-br   )jacmethodoptionszOptimization failed: r|   )r   r   r   r   )r?   rR   r   concatenater   r8   rJ   rK   rL   r9   rS   r:   r   r
   optimizeminimizesuccessRuntimeErrormessager   rX   re   r&   r   rE   r\   )	rH   r   r   r   r   initial_raw_paramsr   resraw_params_opt_tensorr   r   r   _fit_kernel_params   s:   
zGPRegressor._fit_kernel_params)r5   r"   r6   r"   r7   r"   r8   r"   r9   r"   r:   r"   r   r;   )r   r   )r   r;   )NN)r]   r^   r_   r^   r   r"   )F)re   r"   rf   rg   r   rh   )r   r"   )
r   r   r   r   r   rg   r   r   r   r4   )r0   r1   r2   rI   propertyrM   r\   rP   rz   r   r   r   r   r   r   r4   ]   s    


%#r4   {Gz?XYr5   r   r   r   r   r   rg   	gpr_cacheGPRegressor | Noner   c                   s   t j jd d t jdd fdd}| }	|d u r!| }d }
||	fD ]6}z tt t  t |j|j|jdj	||||d	W   S  t
y] } z|}
W Y d }~q'd }~ww td
|
 d | }|  |S )Nr$   r{   r|   r   r4   c                     sB   t tt td d  d  d  dS )Nr<   rk   r5   r6   r7   r8   r9   r:   )r4   r   rX   cloner   r   r   default_kernel_paramsr5   r   r   _default_gprK  s   

z'fit_kernel_params.<locals>._default_gprr   )r   r   r   r   z/The optimization of kernel parameters failed: 
z<
The default initial kernel parameters will be used instead.)r   r4   )r   onesrR   rE   r4   rX   r8   r9   r:   r   r   loggerwarningr\   )r   r   r5   r   r   r   r   r   r   default_gpr_cacheerrorgpr_cache_to_useedefault_gprr   r   r   fit_kernel_params>  s@   


r   )r   r   r   r   )Nr   )r   r   r   r   r5   r   r   r   r   r   r   rg   r   r   r   r   r   r4   )__doc__
__future__r   r~   typingr   r   rL   r   "optuna._gp.scipy_blas_thread_patchr   optuna._warningsr   optuna.loggingr   collections.abcr   r
   r   optuna._importsr	   r0   r   r   autogradFunctionr   r4   r   r   r   r   r   <module>   s0    

 i