'''
Gaussian process surrogate model.
'''
import numpy as np
import math
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel
from sklearn.gaussian_process.kernels import Matern as MaternKernel, _check_length_scale, _approx_fprime
from sklearn.utils.optimize import _check_optimize_result
from scipy.optimize import minimize
from scipy.linalg import solve_triangular
from scipy.spatial.distance import pdist, cdist, squareform
from scipy.special import kv, gamma
from autooed.mobo.surrogate_model.base import SurrogateModel
from autooed.utils.operand import safe_divide
class Matern(MaternKernel):
    '''
    Customized version of the scikit-learn Matern kernel that uses safe_divide and
    zeroes out non-finite entries in the length-scale gradient to avoid numerical errors.
    '''
def __call__(self, X, Y=None, eval_gradient=False):
X = np.atleast_2d(X)
length_scale = _check_length_scale(X, self.length_scale)
if Y is None:
dists = pdist(X / length_scale, metric='euclidean')
else:
if eval_gradient:
raise ValueError(
"Gradient can only be evaluated when Y is None.")
dists = cdist(X / length_scale, Y / length_scale,
metric='euclidean')
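        # Closed-form Matern covariances for the half-integer cases handled below,
        # with d denoting the length-scale-normalized Euclidean distance:
        #   nu = 0.5: k(d) = exp(-d)
        #   nu = 1.5: k(d) = (1 + sqrt(3) * d) * exp(-sqrt(3) * d)
        #   nu = 2.5: k(d) = (1 + sqrt(5) * d + 5 * d ** 2 / 3) * exp(-sqrt(5) * d)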
if self.nu == 0.5:
K = np.exp(-dists)
elif self.nu == 1.5:
K = dists * math.sqrt(3)
K = (1. + K) * np.exp(-K)
elif self.nu == 2.5:
K = dists * math.sqrt(5)
K = (1. + K + K ** 2 / 3.0) * np.exp(-K)
else: # general case; expensive to evaluate
K = dists
K[K == 0.0] += np.finfo(float).eps # strict zeros result in nan
tmp = (math.sqrt(2 * self.nu) * K)
K.fill((2 ** (1. - self.nu)) / gamma(self.nu))
K *= tmp ** self.nu
K *= kv(self.nu, tmp)
if Y is None:
# convert from upper-triangular matrix to square matrix
K = squareform(K)
np.fill_diagonal(K, 1)
if eval_gradient:
if self.hyperparameter_length_scale.fixed:
# Hyperparameter l kept fixed
K_gradient = np.empty((X.shape[0], X.shape[0], 0))
return K, K_gradient
# We need to recompute the pairwise dimension-wise distances
if self.anisotropic:
D = (X[:, np.newaxis, :] - X[np.newaxis, :, :])**2 \
/ (length_scale ** 2)
else:
D = squareform(dists**2)[:, :, np.newaxis]
if self.nu == 0.5:
K_gradient = safe_divide(K[..., np.newaxis] * D,
np.sqrt(D.sum(2))[:, :, np.newaxis])
K_gradient[~np.isfinite(K_gradient)] = 0
elif self.nu == 1.5:
K_gradient = \
3 * D * np.exp(-np.sqrt(3 * D.sum(-1)))[..., np.newaxis]
elif self.nu == 2.5:
tmp = np.sqrt(5 * D.sum(-1))[..., np.newaxis]
K_gradient = 5.0 / 3.0 * D * (tmp + 1) * np.exp(-tmp)
else:
# approximate gradient numerically
def f(theta): # helper function
return self.clone_with_theta(theta)(X, Y)
return K, _approx_fprime(self.theta, f, 1e-10)
if not self.anisotropic:
return K, K_gradient[:, :].sum(-1)[:, :, np.newaxis]
else:
return K, K_gradient
else:
return K
def constrained_optimization(obj_func, initial_theta, bounds):
    '''
    Customized optimizer for GaussianProcessRegressor that runs L-BFGS-B but skips
    scikit-learn's convergence check, to avoid spurious convergence warnings.
    '''
opt_res = minimize(obj_func, initial_theta, method="L-BFGS-B", jac=True, bounds=bounds)
# NOTE: Temporarily disable the checking below because of the numerical instability sometimes.
# _check_optimize_result("lbfgs", opt_res)
return opt_res.x, opt_res.fun
class GaussianProcess(SurrogateModel):
    '''
    Gaussian process surrogate model, fitting one independent GP per objective.
    '''
    def __init__(self, problem, nu=1, **kwargs):
'''
Initialize a Gaussian process.
Parameters
----------
problem: autooed.problem.Problem
The optimization problem.
        nu: int
            The parameter nu controlling the type of the Matern kernel. Choices are 1, 3 and 5
            (Matern with nu = 0.5, 1.5 and 2.5, respectively) or -1 (RBF kernel).
'''
super().__init__(problem)
self.nu = nu
self.gps = []
for _ in range(self.n_obj):
if nu > 0:
main_kernel = Matern(length_scale=np.ones(self.n_var), length_scale_bounds=(np.sqrt(1e-3), np.sqrt(1e3)), nu=0.5 * nu)
else:
main_kernel = RBF(length_scale=np.ones(self.n_var), length_scale_bounds=(np.sqrt(1e-3), np.sqrt(1e3)))
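            # Composite kernel: (signal variance) * (Matern or RBF) plus a small
            # additive ConstantKernel, with all hyperparameters bounded.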
kernel = ConstantKernel(constant_value=1.0, constant_value_bounds=(np.sqrt(1e-3), np.sqrt(1e3))) * \
main_kernel + \
ConstantKernel(constant_value=1e-2, constant_value_bounds=(np.exp(-6), np.exp(0)))
gp = GaussianProcessRegressor(kernel=kernel, optimizer=constrained_optimization)
self.gps.append(gp)
    def _fit(self, X, Y):
for i, gp in enumerate(self.gps):
gp.fit(X, Y[:, i])
    def _evaluate(self, X, std, gradient, hessian):
F, dF, hF = [], [], [] # mean
S, dS, hS = [], [], [] # std
for gp in self.gps:
# mean
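            # Posterior mean: K(X, X_train) @ alpha, where alpha = K_train^{-1} y_train
            # was precomputed by scikit-learn during fitting.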
K = gp.kernel_(X, gp.X_train_) # K: shape (N, N_train)
y_mean = K.dot(gp.alpha_)
F.append(y_mean) # y_mean: shape (N,)
if std:
if gp._K_inv is None:
L_inv = solve_triangular(gp.L_.T,
np.eye(gp.L_.shape[0]))
gp._K_inv = L_inv.dot(L_inv.T)
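                # Posterior variance: k(x, x) - k(x, X_train) K_train^{-1} k(X_train, x),
                # clipped at zero below to guard against numerical round-off.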
y_var = gp.kernel_.diag(X)
y_var -= np.einsum("ij,ij->i",
np.dot(K, gp._K_inv), K)
y_var_negative = y_var < 0
if np.any(y_var_negative):
y_var[y_var_negative] = 0.0
y_std = np.sqrt(y_var)
S.append(y_std) # y_std: shape (N,)
if not (gradient or hessian): continue
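            # kernel_.theta holds the log-transformed hyperparameters of the composite
            # kernel: theta[0] is the log signal variance, theta[1:-1] are the log
            # length scales, and theta[-1] is the log of the additive constant.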
ell = np.exp(gp.kernel_.theta[1:-1]) # ell: shape (n_var,)
sf2 = np.exp(gp.kernel_.theta[0]) # sf2: shape (1,)
d = np.expand_dims(cdist(X / ell, gp.X_train_ / ell), 2) # d: shape (N, N_train, 1)
X_, X_train_ = np.expand_dims(X, 1), np.expand_dims(gp.X_train_, 0)
dd_N = X_ - X_train_ # numerator
dd_D = d * ell ** 2 # denominator
dd = safe_divide(dd_N, dd_D) # dd: shape (N, N_train, n_var)
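            # dd is the derivative of the scaled distance d with respect to the query
            # point x; the branches below give the analytic kernel gradient dK = dk(x, x')/dx.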
if self.nu == 1:
dK = -sf2 * np.exp(-d) * dd
elif self.nu == 3:
dK = -3 * sf2 * np.exp(-np.sqrt(3) * d) * d * dd
elif self.nu == 5:
dK = -5. / 3 * sf2 * np.exp(-np.sqrt(5) * d) * (1 + np.sqrt(5) * d) * d * dd
else: # RBF
dK = -sf2 * np.exp(-0.5 * d ** 2) * d * dd
dK_T = dK.transpose(0, 2, 1) # dK: shape (N, N_train, n_var), dK_T: shape (N, n_var, N_train)
if gradient:
dy_mean = dK_T @ gp.alpha_ # gp.alpha_: shape (N_train,)
dF.append(dy_mean) # dy_mean: shape (N, n_var)
# TODO: check
if std:
K = np.expand_dims(K, 1) # K: shape (N, 1, N_train)
K_Ki = K @ gp._K_inv # gp._K_inv: shape (N_train, N_train), K_Ki: shape (N, 1, N_train)
dK_Ki = dK_T @ gp._K_inv # dK_Ki: shape (N, n_var, N_train)
dy_var = -np.sum(dK_Ki * K + K_Ki * dK_T, axis=2) # dy_var: shape (N, n_var)
dy_std = 0.5 * safe_divide(dy_var, y_std) # dy_std: shape (N, n_var)
dS.append(dy_std)
if hessian:
d = np.expand_dims(d, 3) # d: shape (N, N_train, 1, 1)
dd = np.expand_dims(dd, 2) # dd: shape (N, N_train, 1, n_var)
hd_N = d * np.expand_dims(np.eye(len(ell)), (0, 1)) - np.expand_dims(X_ - X_train_, 3) * dd # numerator
hd_D = d ** 2 * np.expand_dims(ell ** 2, (0, 1, 3)) # denominator
hd = safe_divide(hd_N, hd_D) # hd: shape (N, N_train, n_var, n_var)
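                # hd is the second derivative of the scaled distance with respect to the
                # query point; the branches below give the analytic kernel Hessian
                # hK = d^2 k(x, x') / dx dx^T.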
if self.nu == 1:
hK = -sf2 * np.exp(-d) * (hd - dd ** 2)
elif self.nu == 3:
hK = -3 * sf2 * np.exp(-np.sqrt(3) * d) * (d * hd + (1 - np.sqrt(3) * d) * dd ** 2)
elif self.nu == 5:
hK = -5. / 3 * sf2 * np.exp(-np.sqrt(5) * d) * (-5 * d ** 2 * dd ** 2 + (1 + np.sqrt(5) * d) * (dd ** 2 + d * hd))
else: # RBF
hK = -sf2 * np.exp(-0.5 * d ** 2) * ((1 - d ** 2) * dd ** 2 + d * hd)
hK_T = hK.transpose(0, 2, 3, 1) # hK: shape (N, N_train, n_var, n_var), hK_T: shape (N, n_var, n_var, N_train)
hy_mean = hK_T @ gp.alpha_ # hy_mean: shape (N, n_var, n_var)
hF.append(hy_mean)
# TODO: check
if std:
K = np.expand_dims(K, 2) # K: shape (N, 1, 1, N_train)
dK = np.expand_dims(dK_T, 2) # dK: shape (N, n_var, 1, N_train)
dK_Ki = np.expand_dims(dK_Ki, 2) # dK_Ki: shape (N, n_var, 1, N_train)
hK_Ki = hK_T @ gp._K_inv # hK_Ki: shape (N, n_var, n_var, N_train)
hy_var = -np.sum(hK_Ki * K + 2 * dK_Ki * dK + K_Ki * hK_T, axis=3) # hy_var: shape (N, n_var, n_var)
hy_std = 0.5 * safe_divide(hy_var * y_std - dy_var * dy_std, y_var) # hy_std: shape (N, n_var, n_var)
hS.append(hy_std)
F = np.stack(F, axis=1)
dF = np.stack(dF, axis=1) if gradient else None
hF = np.stack(hF, axis=1) if hessian else None
S = np.stack(S, axis=1) if std else None
dS = np.stack(dS, axis=1) if std and gradient else None
hS = np.stack(hS, axis=1) if std and hessian else None
out = {'F': F, 'dF': dF, 'hF': hF, 'S': S, 'dS': dS, 'hS': hS}
return out
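if __name__ == '__main__':
    # Minimal usage sketch (not part of the original module): plug the customized
    # Matern kernel and constrained optimizer into scikit-learn directly, using
    # synthetic data; the names and values below are illustrative placeholders.
    np.random.seed(0)
    X_demo = np.random.rand(20, 3)
    y_demo = np.sin(X_demo).sum(axis=1)
    demo_kernel = ConstantKernel(1.0) * Matern(length_scale=np.ones(3), nu=2.5)
    demo_gp = GaussianProcessRegressor(kernel=demo_kernel, optimizer=constrained_optimization)
    demo_gp.fit(X_demo, y_demo)
    mean_demo, std_demo = demo_gp.predict(X_demo[:5], return_std=True)
    print('posterior mean:', mean_demo)
    print('posterior std:', std_demo)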