Commit d8058236 authored by Konstantinos Chatzilygeroudis
Browse files

New HP Optimization structure

parent f95d595b
......@@ -69,11 +69,11 @@ namespace limbo {
class GP {
public:
/// useful because the model might be created before knowing anything about the process
GP() : _dim_in(-1), _dim_out(-1) {}
GP() : _dim_in(-1), _dim_out(-1), _inv_kernel_updated(false) {}
/// useful because the model might be created before having samples
GP(int dim_in, int dim_out)
: _dim_in(dim_in), _dim_out(dim_out), _kernel_function(dim_in), _mean_function(dim_out) {}
: _dim_in(dim_in), _dim_out(dim_out), _kernel_function(dim_in), _mean_function(dim_out), _inv_kernel_updated(false) {}
/// Compute the GP from samples and observations. This call needs to be explicit!
void compute(const std::vector<Eigen::VectorXd>& samples,
......@@ -146,7 +146,7 @@ namespace limbo {
\\rst
return :math:`\mu`, :math:`\sigma^2` (unormalized). If there is no sample, return the value according to the mean function. Using this method instead of separate calls to mu() and sigma() is more efficient because some computations are shared between mu() and sigma().
\\endrst
*/
*/
std::tuple<Eigen::VectorXd, double> query(const Eigen::VectorXd& v) const
{
if (_samples.size() == 0)
......@@ -161,7 +161,7 @@ namespace limbo {
\\rst
return :math:`\mu` (unormalized). If there is no sample, return the value according to the mean function.
\\endrst
*/
*/
Eigen::VectorXd mu(const Eigen::VectorXd& v) const
{
if (_samples.size() == 0)
......@@ -173,7 +173,7 @@ namespace limbo {
\\rst
return :math:`\sigma^2` (unormalized). If there is no sample, return the max :math:`\sigma^2`.
\\endrst
*/
*/
double sigma(const Eigen::VectorXd& v) const
{
if (_samples.size() == 0)
......@@ -242,11 +242,161 @@ namespace limbo {
this->_compute_alpha();
}
/// Return the cached likelihood value (does not recompute it; set via set_lik()).
double get_lik() const { return _lik; }
/// Compute and cache K^{-1} (_inv_kernel) from the Cholesky factor _matrixL.
/// Solving L X = I in place, then L^T X' = X, yields (L L^T)^{-1} = K^{-1}.
void compute_inv_kernel()
{
size_t n = _obs_mean.rows();
// K^{-1} using Cholesky decomposition
_inv_kernel = Eigen::MatrixXd::Identity(n, n);
_matrixL.template triangularView<Eigen::Lower>().solveInPlace(_inv_kernel);
_matrixL.template triangularView<Eigen::Lower>().transpose().solveInPlace(_inv_kernel);
// mark the cache valid; kernel recomputation paths reset this flag to false
_inv_kernel_updated = true;
}
/// Compute, cache (in _log_lik) and return the log marginal likelihood,
/// using the cached Cholesky factor _matrixL and _alpha = K^{-1} * (obs - mean).
double compute_log_lik()
{
size_t n = _obs_mean.rows();
// --- cholesky ---
// log|K| = 2 * sum(log(diag(L))) for K = L L^T -- see:
// http://xcorr.net/2008/06/11/log-determinant-of-positive-definite-matrices-in-matlab/
long double det = 2 * _matrixL.diagonal().array().log().sum();
double a = (_obs_mean.transpose() * _alpha)
.trace(); // generalization for multi dimensional observation
_log_lik = -0.5 * a - 0.5 * det - 0.5 * n * std::log(2 * M_PI);
return _log_lik;
}
/// Compute and return the gradient of the log marginal likelihood w.r.t. the
/// kernel hyper-parameters. Uses the standard identity
/// dL/dtheta = 0.5 * tr((alpha * alpha^T - K^{-1}) * dK/dtheta),
/// evaluated by iterating only over the lower triangle of the symmetric matrix.
Eigen::VectorXd compute_kernel_grad_log_lik()
{
size_t n = _obs_mean.rows();

// compute K^{-1} only if needed (cache invalidated whenever the kernel changes)
if (!_inv_kernel_updated) {
compute_inv_kernel();
}

// alpha * alpha.transpose() - K^{-1}
// (built directly -- no need to copy _inv_kernel first and overwrite it)
Eigen::MatrixXd w = _alpha * _alpha.transpose() - _inv_kernel;

// only compute half of the matrix (w and dK/dtheta are symmetric)
Eigen::VectorXd grad = Eigen::VectorXd::Zero(_kernel_function.h_params_size());
for (size_t i = 0; i < n; ++i) {
for (size_t j = 0; j <= i; ++j) {
Eigen::VectorXd g = _kernel_function.grad(_samples[i], _samples[j], i, j);
if (i == j)
grad += w(i, j) * g * 0.5; // diagonal entries appear once in the trace
else
grad += w(i, j) * g; // off-diagonal entries appear twice (symmetry), cancelling the 0.5
}
}
return grad;
}
/// Compute and return the gradient of the log marginal likelihood w.r.t. the
/// mean-function hyper-parameters, summed over all output dimensions.
/// NOTE(review): assumes _obs_mean stores (observations - mean) so that
/// _obs_mean^T * K^{-1} equals alpha^T -- confirm against _compute_obs_mean.
Eigen::VectorXd compute_mean_grad_log_lik()
{
size_t n = _obs_mean.rows();
// compute K^{-1} only if needed
if (!_inv_kernel_updated) {
compute_inv_kernel();
}
Eigen::VectorXd grad = Eigen::VectorXd::Zero(_mean_function.h_params_size());
// accumulate (obs - mean)_d^T * K^{-1}_col(s) * d mean / d phi for every
// output dimension d and every sample s
for (int i_obs = 0; i_obs < _dim_out; ++i_obs)
for (size_t n_obs = 0; n_obs < n; n_obs++) {
grad += _obs_mean.col(i_obs).transpose() * _inv_kernel.col(n_obs) * _mean_function.grad(_samples[n_obs], *this).row(i_obs);
}
return grad;
}
/// Set the likelihood (the value must be computed by the caller).
void set_lik(const double& lik) { _lik = lik; }
/// Return the last computed log likelihood (does not recompute it).
double get_log_lik() const { return _log_lik; }
/// Set the log likelihood (e.g. computed externally by an optimizer).
void set_log_lik(double log_lik) { _log_lik = log_lik; }
/// Compute, cache (in _log_loo_cv) and return the log predictive probability of
/// leave-one-out cross-validation, summed over all samples and output dimensions.
/// Vectorized form of the closed-form LOO-CV expressions (Rasmussen & Williams,
/// GPML, eqs. 5.10-5.12): mu_i = y_i - alpha_i/[K^{-1}]_ii, sigma_i^2 = 1/[K^{-1}]_ii.
double compute_log_loo_cv()
{
// compute K^{-1} only if needed
if (!_inv_kernel_updated) {
compute_inv_kernel();
}
// inv_diag(i) = 1/[K^{-1}]_ii = sigma_i^2, the LOO predictive variance
Eigen::VectorXd inv_diag = _inv_kernel.diagonal().array().inverse();
_log_loo_cv = (((-0.5 * (_alpha.array().square().array().colwise() * inv_diag.array())).array().colwise() - 0.5 * inv_diag.array().log().array()) - 0.5 * std::log(2 * M_PI)).colwise().sum().sum();
return _log_loo_cv;
}
/// Compute and return the gradient of the LOO-CV log probability w.r.t. the
/// kernel hyper-parameters (Rasmussen & Williams, GPML, eq. 5.13), summed over
/// all output dimensions. Requires the cached K^{-1}; computes it if stale.
Eigen::VectorXd compute_kernel_grad_log_loo_cv()
{
size_t n = _obs_mean.rows();
size_t n_params = _kernel_function.h_params_size();
// compute K^{-1} only if needed
if (!_inv_kernel_updated) {
compute_inv_kernel();
}
Eigen::VectorXd grad = Eigen::VectorXd::Zero(n_params);
// grads(j, d): gradient of parameter j accumulated for output dimension d
Eigen::MatrixXd grads = Eigen::MatrixXd::Zero(n_params, _dim_out);
// Pre-compute dK/dtheta for every sample pair.
// only compute half of the matrix (symmetrical matrix)
// TO-DO: Make it better
std::vector<std::vector<Eigen::VectorXd>> full_dk;
for (size_t i = 0; i < n; i++) {
full_dk.push_back(std::vector<Eigen::VectorXd>());
for (size_t j = 0; j <= i; j++)
full_dk[i].push_back(_kernel_function.grad(_samples[i], _samples[j], i, j));
for (size_t j = i + 1; j < n; j++)
full_dk[i].push_back(Eigen::VectorXd::Zero(n_params));
}
// mirror the lower triangle into the upper triangle (dK/dtheta is symmetric)
for (size_t i = 0; i < n; i++)
for (size_t j = 0; j < i; ++j)
full_dk[j][i] = full_dk[i][j];
// inv_diag(i) = 1/[K^{-1}]_ii, the LOO predictive variance
Eigen::VectorXd inv_diag = _inv_kernel.diagonal().array().inverse();
for (int j = 0; j < grad.size(); j++) {
// assemble the dense matrix dK/dtheta_j from the per-pair gradients
Eigen::MatrixXd dKdTheta_j = Eigen::MatrixXd::Zero(n, n);
for (size_t i = 0; i < n; i++) {
for (size_t k = 0; k < n; k++)
dKdTheta_j(i, k) = full_dk[i][k](j);
}
// Z_j = K^{-1} dK/dtheta_j (GPML eq. 5.13 notation)
Eigen::MatrixXd Zeta_j = _inv_kernel * dKdTheta_j;
Eigen::MatrixXd Zeta_j_alpha = Zeta_j * _alpha;
Eigen::MatrixXd Zeta_j_K = Zeta_j * _inv_kernel;
// vectorized per-sample sum; the commented loop below is the readable equivalent
grads.row(j) = ((_alpha.array() * Zeta_j_alpha.array() - 0.5 * ((1. + _alpha.array().square().array().colwise() * inv_diag.array()).array().colwise() * Zeta_j_K.diagonal().array())).array().colwise() * inv_diag.array()).colwise().sum();
// for (size_t i = 0; i < n; i++)
// grads.row(j).array() += (_alpha.row(i).array() * Zeta_j_alpha.row(i).array() - 0.5 * (1. + _alpha.row(i).array().square() / _inv_kernel.diagonal()(i)) * Zeta_j_K.diagonal()(i)) / _inv_kernel.diagonal()(i);
}
// collapse the per-output-dimension columns into a single gradient vector
grad = grads.rowwise().sum();
return grad;
}
/// Return the last computed LOO-CV log probability (does not recompute it).
double get_log_loo_cv() const { return _log_loo_cv; }
/// Set the LOO-CV log probability (e.g. computed externally by an optimizer).
void set_log_loo_cv(double log_loo_cv) { _log_loo_cv = log_loo_cv; }
/// LLT matrix (from Cholesky decomposition)
//const Eigen::LLT<Eigen::MatrixXd>& llt() const { return _llt; }
......@@ -257,6 +407,8 @@ namespace limbo {
/// return the list of samples that have been tested so far
const std::vector<Eigen::VectorXd>& samples() const { return _samples; }
bool inv_kernel_computed() { return _inv_kernel_updated; }
protected:
int _dim_in;
int _dim_out;
......@@ -272,11 +424,12 @@ namespace limbo {
Eigen::MatrixXd _alpha;
Eigen::VectorXd _mean_observation;
Eigen::MatrixXd _kernel;
Eigen::MatrixXd _kernel, _inv_kernel;
Eigen::MatrixXd _matrixL;
double _lik;
double _log_lik, _log_loo_cv;
bool _inv_kernel_updated;
HyperParamsOptimizer _hp_optimize;
......@@ -306,6 +459,9 @@ namespace limbo {
_matrixL = Eigen::LLT<Eigen::MatrixXd>(_kernel).matrixL();
this->_compute_alpha();
// notify change of kernel
_inv_kernel_updated = false;
}
void _compute_incremental_kernel()
......@@ -335,6 +491,9 @@ namespace limbo {
_matrixL(n - 1, n - 1) = sqrt(L_j);
this->_compute_alpha();
// notify change of kernel
_inv_kernel_updated = false;
}
void _compute_alpha()
......
......@@ -65,8 +65,8 @@ namespace limbo {
Optimizer optimizer;
Eigen::VectorXd params = optimizer(optimization, gp.kernel_function().h_params(), false);
gp.kernel_function().set_h_params(params);
gp.set_lik(opt::eval(optimization, params));
gp.recompute(false);
gp.compute_log_lik();
}
protected:
......@@ -82,42 +82,12 @@ namespace limbo {
gp.recompute(false);
size_t n = gp.obs_mean().rows();
// --- cholesky ---
// see:
// http://xcorr.net/2008/06/11/log-determinant-of-positive-definite-matrices-in-matlab/
Eigen::MatrixXd l = gp.matrixL();
long double det = 2 * l.diagonal().array().log().sum();
double a = (gp.obs_mean().transpose() * gp.alpha())
.trace(); // generalization for multi dimensional observation
// std::cout<<" a: "<<a <<" det: "<< det<<std::endl;
double lik = -0.5 * a - 0.5 * det - 0.5 * n * log(2 * M_PI);
double lik = gp.compute_log_lik();
if (!compute_grad)
return opt::no_grad(lik);
// K^{-1} using Cholesky decomposition
Eigen::MatrixXd w = Eigen::MatrixXd::Identity(n, n);
gp.matrixL().template triangularView<Eigen::Lower>().solveInPlace(w);
gp.matrixL().template triangularView<Eigen::Lower>().transpose().solveInPlace(w);
// alpha * alpha.transpose() - K^{-1}
w = gp.alpha() * gp.alpha().transpose() - w;
// only compute half of the matrix (symmetrical matrix)
Eigen::VectorXd grad = Eigen::VectorXd::Zero(params.size());
for (size_t i = 0; i < n; ++i) {
for (size_t j = 0; j <= i; ++j) {
Eigen::VectorXd g = gp.kernel_function().grad(gp.samples()[i], gp.samples()[j], i, j);
if (i == j)
grad += w(i, j) * g * 0.5;
else
grad += w(i, j) * g;
}
}
Eigen::VectorXd grad = gp.compute_kernel_grad_log_lik();
return {lik, grad};
}
......
......@@ -66,6 +66,7 @@ namespace limbo {
Eigen::VectorXd params = optimizer(optimization, gp.kernel_function().h_params(), false);
gp.kernel_function().set_h_params(params);
gp.recompute(false);
gp.compute_log_loo_cv();
}
protected:
......@@ -81,64 +82,12 @@ namespace limbo {
gp.recompute(false);
size_t n = gp.obs_mean().rows();
Eigen::MatrixXd l = gp.matrixL();
// K^{-1} using Cholesky decomposition
Eigen::MatrixXd w = Eigen::MatrixXd::Identity(n, n);
l.template triangularView<Eigen::Lower>().solveInPlace(w);
l.template triangularView<Eigen::Lower>().transpose().solveInPlace(w);
// alpha
Eigen::MatrixXd alpha = gp.alpha();
Eigen::VectorXd inv_diag = w.diagonal().array().inverse();
double loo = (((-0.5 * (alpha.array().square().array().colwise() * inv_diag.array())).array().colwise() - 0.5 * inv_diag.array().log().array()) - 0.5 * std::log(2 * M_PI)).colwise().sum().sum(); //LOO.rowwise().sum();
// double loo_total = 0.0;
// for (int K = 0; K < gp.obs_mean().cols(); K++) {
// double loo_tmp = (-0.5 * alpha.col(K).array().square() / w.diagonal().array() - 0.5 * w.diagonal().array().inverse().log()).sum() - 0.5 * n * std::log(2 * M_PI);
// loo_total += loo_tmp;
// }
//
// std::cout << loo << " vs " << loo_total << std::endl;
double loo = gp.compute_log_loo_cv();
if (!compute_grad)
return opt::no_grad(loo);
Eigen::VectorXd grad = Eigen::VectorXd::Zero(params.size());
Eigen::MatrixXd grads = Eigen::MatrixXd::Zero(params.size(), gp.obs_mean().cols());
// only compute half of the matrix (symmetrical matrix)
// TO-DO: Make it better
std::vector<std::vector<Eigen::VectorXd>> full_dk;
for (size_t i = 0; i < n; i++) {
full_dk.push_back(std::vector<Eigen::VectorXd>());
for (size_t j = 0; j <= i; j++)
full_dk[i].push_back(gp.kernel_function().grad(gp.samples()[i], gp.samples()[j], i, j));
for (size_t j = i + 1; j < n; j++)
full_dk[i].push_back(Eigen::VectorXd::Zero(params.size()));
}
for (size_t i = 0; i < n; i++)
for (size_t j = 0; j < i; ++j)
full_dk[j][i] = full_dk[i][j];
for (int j = 0; j < grad.size(); j++) {
Eigen::MatrixXd dKdTheta_j = Eigen::MatrixXd::Zero(n, n);
for (size_t i = 0; i < n; i++) {
for (size_t k = 0; k < n; k++)
dKdTheta_j(i, k) = full_dk[i][k](j);
}
Eigen::MatrixXd Zeta_j = w * dKdTheta_j;
Eigen::MatrixXd Zeta_j_alpha = Zeta_j * alpha;
Eigen::MatrixXd Zeta_j_K = Zeta_j * w;
for (size_t i = 0; i < n; i++)
grads.row(j).array() += (alpha.row(i).array() * Zeta_j_alpha.row(i).array() - 0.5 * (1. + alpha.row(i).array().square() / w.diagonal()(i)) * Zeta_j_K.diagonal()(i)) / w.diagonal()(i);
}
grad = grads.rowwise().sum();
Eigen::VectorXd grad = gp.compute_kernel_grad_log_loo_cv();
return {loo, grad};
}
......
......@@ -63,15 +63,19 @@ namespace limbo {
this->_called = true;
KernelMeanLFOptimization<GP> optimization(gp);
Optimizer optimizer;
int dim = gp.kernel_function().h_params_size() + gp.mean_function().h_params_size();
Eigen::VectorXd init(dim);
init.head(gp.kernel_function().h_params_size()) = gp.kernel_function().h_params();
init.tail(gp.mean_function().h_params_size()) = gp.mean_function().h_params();
Eigen::VectorXd params = optimizer(optimization, init, false);
gp.kernel_function().set_h_params(params.head(gp.kernel_function().h_params_size()));
gp.mean_function().set_h_params(params.tail(gp.mean_function().h_params_size()));
gp.set_lik(opt::eval(optimization, params));
gp.recompute(true);
gp.compute_log_lik();
}
protected:
......@@ -88,47 +92,15 @@ namespace limbo {
gp.recompute(true);
size_t n = gp.obs_mean().rows();
// --- cholesky ---
// see:
// http://xcorr.net/2008/06/11/log-determinant-of-positive-definite-matrices-in-matlab/
Eigen::MatrixXd l = gp.matrixL();
long double det = 2 * l.diagonal().array().log().sum();
double a = (gp.obs_mean().transpose() * gp.alpha())
.trace(); // generalization for multi dimensional observation
// std::cout<<" a: "<<a <<" det: "<< det<<std::endl;
double lik = -0.5 * a - 0.5 * det - 0.5 * n * log(2 * M_PI);
double lik = gp.compute_log_lik();
if (!compute_grad)
return opt::no_grad(lik);
// K^{-1} using Cholesky decomposition
Eigen::MatrixXd K = Eigen::MatrixXd::Identity(n, n);
gp.matrixL().template triangularView<Eigen::Lower>().solveInPlace(K);
gp.matrixL().template triangularView<Eigen::Lower>().transpose().solveInPlace(K);
// alpha * alpha.transpose() - K^{-1}
Eigen::MatrixXd w = gp.alpha() * gp.alpha().transpose() - K;
// only compute half of the matrix (symmetrical matrix)
Eigen::VectorXd grad = Eigen::VectorXd::Zero(params.size());
for (size_t i = 0; i < n; ++i) {
for (size_t j = 0; j <= i; ++j) {
Eigen::VectorXd g = gp.kernel_function().grad(gp.samples()[i], gp.samples()[j], i, j);
if (i == j)
grad.head(gp.kernel_function().h_params_size()) += w(i, j) * g * 0.5;
else
grad.head(gp.kernel_function().h_params_size()) += w(i, j) * g;
}
}
for (int i_obs = 0; i_obs < gp.dim_out(); ++i_obs)
for (size_t n_obs = 0; n_obs < n; n_obs++) {
grad.tail(gp.mean_function().h_params_size()) += gp.obs_mean().col(i_obs).transpose() * K.col(n_obs) * gp.mean_function().grad(gp.samples()[n_obs], gp).row(i_obs);
}
grad.head(gp.kernel_function().h_params_size()) = gp.compute_kernel_grad_log_lik();
grad.tail(gp.mean_function().h_params_size()) = gp.compute_mean_grad_log_lik();
return {lik, grad};
}
......
......@@ -65,8 +65,8 @@ namespace limbo {
Optimizer optimizer;
Eigen::VectorXd params = optimizer(optimization, gp.mean_function().h_params(), false);
gp.mean_function().set_h_params(params);
gp.set_lik(opt::eval(optimization, params));
gp.recompute(true, false);
gp.compute_log_lik();
}
protected:
......@@ -75,12 +75,7 @@ namespace limbo {
public:
MeanLFOptimization(const GP& gp) : _original_gp(gp)
{
size_t n = gp.obs_mean().rows();
// K^{-1} using Cholesky decomposition
_K = Eigen::MatrixXd::Identity(n, n);
gp.matrixL().template triangularView<Eigen::Lower>().solveInPlace(_K);
gp.matrixL().template triangularView<Eigen::Lower>().transpose().solveInPlace(_K);
_original_gp.compute_inv_kernel();
}
opt::eval_t operator()(const Eigen::VectorXd& params, bool compute_grad) const
......@@ -90,34 +85,18 @@ namespace limbo {
gp.recompute(true, false);
size_t n = gp.obs_mean().rows();
// --- cholesky ---
// see:
// http://xcorr.net/2008/06/11/log-determinant-of-positive-definite-matrices-in-matlab/
Eigen::MatrixXd l = gp.matrixL();
long double det = 2 * l.diagonal().array().log().sum();
double a = (gp.obs_mean().transpose() * gp.alpha())
.trace(); // generalization for multi dimensional observation
// std::cout<<" a: "<<a <<" det: "<< det<<std::endl;
double lik = -0.5 * a - 0.5 * det - 0.5 * n * log(2 * M_PI);
double lik = gp.compute_log_lik();
if (!compute_grad)
return opt::no_grad(lik);
Eigen::VectorXd grad = Eigen::VectorXd::Zero(params.size());
for (int i_obs = 0; i_obs < gp.dim_out(); ++i_obs)
for (size_t n_obs = 0; n_obs < n; n_obs++) {
grad.tail(gp.mean_function().h_params_size()) += gp.obs_mean().col(i_obs).transpose() * _K.col(n_obs) * gp.mean_function().grad(gp.samples()[n_obs], gp).row(i_obs);
}
Eigen::VectorXd grad = gp.compute_mean_grad_log_lik();
return {lik, grad};
}
protected:
const GP& _original_gp;
Eigen::MatrixXd _K;
GP _original_gp;
};
};
}
......
......@@ -385,6 +385,72 @@ BOOST_AUTO_TEST_CASE(test_gp_check_loo_grad_noise)
BOOST_CHECK(results.array().sum() < M * e);
}
// Verify the lifecycle of the cached inverse kernel (_inv_kernel_updated):
// it must start invalid, become valid after compute_inv_kernel() or any method
// that needs K^{-1}, and be invalidated whenever the kernel matrix changes.
BOOST_AUTO_TEST_CASE(test_gp_check_inv_kernel_computation)
{
using namespace limbo;
using KF_t = kernel::SquaredExpARD<Params>;
using Mean_t = mean::FunctionARD<Params, mean::Constant<Params>>;
using GP_t = model::GP<Params, KF_t, Mean_t>;
GP_t gp(4, 2);
std::vector<Eigen::VectorXd> observations, samples;
// Random samples
int N = 10;
for (int i = 0; i < N; i++) {
samples.push_back(tools::random_vector(4));
Eigen::VectorXd ob(2);
ob << std::cos(samples[i](0)), std::sin(samples[i](1));
observations.push_back(ob);
}
gp.compute(samples, observations);
// Check that variable is properly initialized
BOOST_CHECK(!gp.inv_kernel_computed());
// Check that kernel is computed
gp.compute_inv_kernel();
BOOST_CHECK(gp.inv_kernel_computed());
// Check recompute alternatives: only recomputing the kernel itself
// (second argument true) should invalidate the cached inverse
gp.recompute(false, false);
BOOST_CHECK(gp.inv_kernel_computed());
gp.recompute(true, false);
BOOST_CHECK(gp.inv_kernel_computed());
gp.recompute(true, true);
BOOST_CHECK(!gp.inv_kernel_computed());
// Check add_sample: growing the kernel must invalidate the cache
gp.compute_inv_kernel();
gp.add_sample(samples.back(), observations.back());
BOOST_CHECK(!gp.inv_kernel_computed());
// Check different implicit computations of inverse kernel:
// each of these methods needs K^{-1} and should compute it lazily
gp.compute_kernel_grad_log_lik();
BOOST_CHECK(gp.inv_kernel_computed());
gp.recompute();
BOOST_CHECK(!gp.inv_kernel_computed());
gp.compute_mean_grad_log_lik();
BOOST_CHECK(gp.inv_kernel_computed());
gp.recompute();
BOOST_CHECK(!gp.inv_kernel_computed());
gp.compute_log_loo_cv();
BOOST_CHECK(gp.inv_kernel_computed());
gp.recompute();
BOOST_CHECK(!gp.inv_kernel_computed());
gp.compute_kernel_grad_log_loo_cv();
BOOST_CHECK(gp.inv_kernel_computed());
}
BOOST_AUTO_TEST_CASE(test_gp_dim)
{
using namespace limbo;
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment