//| Copyright Inria May 2015
//| This project has received funding from the European Research Council (ERC) under
//| the European Union's Horizon 2020 research and innovation programme (grant
//| agreement No 637972) - see http://www.resibots.eu
//|
//| Contributor(s):
//|   - Jean-Baptiste Mouret (jean-baptiste.mouret@inria.fr)
//|   - Antoine Cully (antoinecully@gmail.com)
//|   - Konstantinos Chatzilygeroudis (konstantinos.chatzilygeroudis@inria.fr)
//|   - Federico Allocati (fede.allocati@gmail.com)
//|   - Vaios Papaspyros (b.papaspyros@gmail.com)
//|   - Roberto Rama (bertoski@gmail.com)
//|
//| This software is a computer library whose purpose is to optimize continuous,
//| black-box functions. It mainly implements Gaussian processes and Bayesian
//| optimization.
//| Main repository: http://github.com/resibots/limbo
//| Documentation: http://www.resibots.eu/limbo
//|
//| This software is governed by the CeCILL-C license under French law and
//| abiding by the rules of distribution of free software.  You can  use,
//| modify and/ or redistribute the software under the terms of the CeCILL-C
//| license as circulated by CEA, CNRS and INRIA at the following URL
//| "http://www.cecill.info".
//|
//| As a counterpart to the access to the source code and  rights to copy,
//| modify and redistribute granted by the license, users are provided only
//| with a limited warranty  and the software's author,  the holder of the
//| economic rights,  and the successive licensors  have only  limited
//| liability.
//|
//| In this respect, the user's attention is drawn to the risks associated
//| with loading,  using,  modifying and/or developing or reproducing the
//| software by the user in light of its specific status of free software,
//| that may mean  that it is complicated to manipulate,  and  that  also
//| therefore means  that it is reserved for developers  and  experienced
//| professionals having in-depth computer knowledge. Users are therefore
//| encouraged to load and test the software's suitability as regards their
//| requirements in conditions enabling the security of their systems and/or
//| data to be ensured and,  more generally, to use and operate it in the
//| same conditions as regards security.
//|
//| The fact that you are presently reading this means that you have had
//| knowledge of the CeCILL-C license and that you accept its terms.
//|
#ifndef LIMBO_MODEL_GP_HPP
#define LIMBO_MODEL_GP_HPP

#include <cassert>
#include <iostream>
#include <limits>
#include <vector>

#include <Eigen/Cholesky>
#include <Eigen/Core>
#include <Eigen/LU>

#include <limbo/tools.hpp>
#include <limbo/model/gp/no_lf_opt.hpp>
#include <limbo/kernel/squared_exp_ard.hpp>
#include <limbo/kernel/exp.hpp>
#include <limbo/model/gp/kernel_lf_opt.hpp>
#include <limbo/mean/data.hpp>
#include <limbo/mean/constant.hpp>

namespace limbo {
    namespace model {
        /// @ingroup model
        /// A classic Gaussian process.
        /// It is parametrized by:
        /// - a kernel function
        /// - a mean function
        /// - [optional] an optimizer for the hyper-parameters
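        ///
        /// A minimal usage sketch (assuming a user-defined ``Params`` structure that
        /// provides the parameters expected by the chosen kernel and mean functors;
        /// see their respective headers for the exact names):
        ///
        ///     using GP_t = limbo::model::GP<Params>;
        ///     GP_t gp(3, 1); // 3-D inputs, 1-D observations
        ///     gp.compute(samples, observations); // both are std::vector<Eigen::VectorXd>
        ///     Eigen::VectorXd x(3);
        ///     x << 0., 0.5, 1.;
        ///     Eigen::VectorXd mu;
        ///     double sigma_sq;
        ///     std::tie(mu, sigma_sq) = gp.query(x);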
        template <typename Params, typename KernelFunction = kernel::Exp<Params>, typename MeanFunction = mean::Data<Params>, typename HyperParamsOptimizer = gp::NoLFOpt<Params>>
        class GP {
        public:
            /// useful because the model might be created before knowing anything about the process
            GP() : _dim_in(-1), _dim_out(-1), _inv_kernel_updated(false) {}

            /// useful because the model might be created before having samples
            GP(int dim_in, int dim_out)
                : _dim_in(dim_in), _dim_out(dim_out), _kernel_function(dim_in), _mean_function(dim_out), _inv_kernel_updated(false) {}

            /// Compute the GP from samples and observations. This call needs to be explicit!
            void compute(const std::vector<Eigen::VectorXd>& samples,
                const std::vector<Eigen::VectorXd>& observations, bool compute_kernel = true)
            {
                assert(samples.size() != 0);
                assert(observations.size() != 0);
                assert(samples.size() == observations.size());

                if (_dim_in != samples[0].size()) {
                    _dim_in = samples[0].size();
                    _kernel_function = KernelFunction(_dim_in); // the cost of building a functor should be relatively low
                }

                if (_dim_out != observations[0].size()) {
                    _dim_out = observations[0].size();
                    _mean_function = MeanFunction(_dim_out); // the cost of building a functor should be relatively low
                }

                _samples = samples;

                _observations.resize(observations.size(), _dim_out);
                for (int i = 0; i < _observations.rows(); ++i)
                    _observations.row(i) = observations[i];

                _mean_observation = _observations.colwise().mean();

                this->_compute_obs_mean();
                if (compute_kernel)
                    this->_compute_full_kernel();
            }

            /// Do not forget to call this if you use hyper-parameter optimization!
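            /// The typical call order is compute() first, then optimize_hyperparams():
            /// the HyperParamsOptimizer template parameter (e.g. gp::KernelLFOpt) is
            /// what actually updates the hyper-parameters; with the default
            /// gp::NoLFOpt this call is a no-op.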
            void optimize_hyperparams()
            {
                _hp_optimize(*this);
            }

            /// add sample and update the GP. This code uses an incremental implementation of the Cholesky
            /// decomposition. It is therefore much faster than a call to compute()
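            ///
            /// A sketch of incremental use (assuming ``gp`` has already been computed
            /// and x, y are Eigen::VectorXd of the input/output dimensions):
            ///
            ///     gp.add_sample(x, y); // O(n^2) Cholesky update instead of a full O(n^3) recompute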
            void add_sample(const Eigen::VectorXd& sample, const Eigen::VectorXd& observation)
            {
                if (_samples.empty()) {
                    if (_dim_in != sample.size()) {
                        _dim_in = sample.size();
                        _kernel_function = KernelFunction(_dim_in); // the cost of building a functor should be relatively low
                    }
                    if (_dim_out != observation.size()) {
                        _dim_out = observation.size();
                        _mean_function = MeanFunction(_dim_out); // the cost of building a functor should be relatively low
                    }
                }
                else {
                    assert(sample.size() == _dim_in);
                    assert(observation.size() == _dim_out);
                }

                _samples.push_back(sample);

                _observations.conservativeResize(_observations.rows() + 1, _dim_out);
                _observations.bottomRows<1>() = observation.transpose();

                _mean_observation = _observations.colwise().mean();

                this->_compute_obs_mean();
                this->_compute_incremental_kernel();
            }

            /**
             \\rst
             return :math:`\mu`, :math:`\sigma^2` (unnormalized). If there is no sample, return the value according to the mean function. Using this method instead of separate calls to mu() and sigma() is more efficient because some computations are shared between mu() and sigma().
             \\endrst
            */
            std::tuple<Eigen::VectorXd, double> query(const Eigen::VectorXd& v) const
            {
                if (_samples.size() == 0)
                    return std::make_tuple(_mean_function(v, *this),
                        _kernel_function(v, v));

                Eigen::VectorXd k = _compute_k(v);
                return std::make_tuple(_mu(v, k), _sigma(v, k));
            }

            /**
             \\rst
             return :math:`\mu` (unnormalized). If there is no sample, return the value according to the mean function.
             \\endrst
            */
            Eigen::VectorXd mu(const Eigen::VectorXd& v) const
            {
                if (_samples.size() == 0)
                    return _mean_function(v, *this);
                return _mu(v, _compute_k(v));
            }

            /**
             \\rst
             return :math:`\sigma^2` (unnormalized). If there is no sample, return the max :math:`\sigma^2`.
             \\endrst
            */
            double sigma(const Eigen::VectorXd& v) const
            {
                if (_samples.size() == 0)
                    return _kernel_function(v, v);
                return _sigma(v, _compute_k(v));
            }

            /// return the number of dimensions of the input
            int dim_in() const
            {
                assert(_dim_in != -1); // need to compute first !
                return _dim_in;
            }

            /// return the number of dimensions of the output
            int dim_out() const
            {
                assert(_dim_out != -1); // need to compute first !
                return _dim_out;
            }

            const KernelFunction& kernel_function() const { return _kernel_function; }

            KernelFunction& kernel_function() { return _kernel_function; }

            const MeanFunction& mean_function() const { return _mean_function; }

            MeanFunction& mean_function() { return _mean_function; }

            /// return the maximum observation (only call this if the output of the GP is of dimension 1)
            Eigen::VectorXd max_observation() const
            {
                if (_observations.cols() > 1)
                    std::cout << "WARNING max_observation with multi dimensional "
                                 "observations doesn't make sense"
                              << std::endl;
                return tools::make_vector(_observations.maxCoeff());
            }

            /// return the mean observation (only call this if the output of the GP is of dimension 1)
            Eigen::VectorXd mean_observation() const
            {
                // TODO: Check if _dim_out is correct?!
                return _samples.size() > 0 ? _mean_observation
                                           : Eigen::VectorXd::Zero(_dim_out);
            }

            const Eigen::MatrixXd& mean_vector() const { return _mean_vector; }

            const Eigen::MatrixXd& obs_mean() const { return _obs_mean; }

            /// return the number of samples used to compute the GP
            int nb_samples() const { return _samples.size(); }

            /// recomputes the GP (e.g., after the hyper-parameters have been changed)
            void recompute(bool update_obs_mean = true, bool update_full_kernel = true)
            {
                assert(!_samples.empty());

                if (update_obs_mean)
                    this->_compute_obs_mean();

                if (update_full_kernel)
                    this->_compute_full_kernel();
                else
                    this->_compute_alpha();
            }

            void compute_inv_kernel()
            {
                size_t n = _obs_mean.rows();
                // K^{-1} using Cholesky decomposition
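                // i.e., K^{-1} = L^{-T} * (L^{-1} * I): start from the identity and
                // apply the two triangular solves in place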
                _inv_kernel = Eigen::MatrixXd::Identity(n, n);

                _matrixL.template triangularView<Eigen::Lower>().solveInPlace(_inv_kernel);
                _matrixL.template triangularView<Eigen::Lower>().transpose().solveInPlace(_inv_kernel);

                _inv_kernel_updated = true;
            }

            /// compute and return the log likelihood
            double compute_log_lik()
            {
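                // log p(y|X) = -0.5 * y^T K^{-1} y - 0.5 * log|K| - 0.5 * n * log(2*pi)
                // with y the mean-subtracted observations (_obs_mean), alpha = K^{-1} y,
                // and log|K| = 2 * sum(log(diag(L))) from the Cholesky factor L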
                size_t n = _obs_mean.rows();

                // --- cholesky ---
                // see:
                // http://xcorr.net/2008/06/11/log-determinant-of-positive-definite-matrices-in-matlab/
                long double det = 2 * _matrixL.diagonal().array().log().sum();

                double a = (_obs_mean.transpose() * _alpha)
                               .trace(); // generalization for multi dimensional observation

                _log_lik = -0.5 * a - 0.5 * det - 0.5 * n * std::log(2 * M_PI);

                return _log_lik;
            }

            /// compute and return the gradient of the log likelihood w.r.t. the kernel parameters
            Eigen::VectorXd compute_kernel_grad_log_lik()
            {
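                // d log p(y|X) / d theta = 0.5 * tr( (alpha * alpha^T - K^{-1}) * dK/dtheta );
                // the loop below exploits the symmetry of both matrices: off-diagonal
                // entries count twice, diagonal entries once (hence the extra 0.5)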
                size_t n = _obs_mean.rows();

                // compute K^{-1} only if needed
                if (!_inv_kernel_updated) {
                    compute_inv_kernel();
                }
                Eigen::MatrixXd w = _inv_kernel;

                // alpha * alpha.transpose() - K^{-1}
                w = _alpha * _alpha.transpose() - w;

                // only compute half of the matrix (symmetrical matrix)
                Eigen::VectorXd grad = Eigen::VectorXd::Zero(_kernel_function.h_params_size());
                for (size_t i = 0; i < n; ++i) {
                    for (size_t j = 0; j <= i; ++j) {
                        Eigen::VectorXd g = _kernel_function.grad(_samples[i], _samples[j], i, j);
                        if (i == j)
                            grad += w(i, j) * g * 0.5;
                        else
                            grad += w(i, j) * g;
                    }
                }

                return grad;
            }

            /// compute and return the gradient of the log likelihood w.r.t. the mean parameters
            Eigen::VectorXd compute_mean_grad_log_lik()
            {
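                // d log p(y|X) / d theta_mean = (y - m)^T K^{-1} dm/dtheta, accumulated
                // below over every sample and every output dimension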
                size_t n = _obs_mean.rows();

                // compute K^{-1} only if needed
                if (!_inv_kernel_updated) {
                    compute_inv_kernel();
                }

                Eigen::VectorXd grad = Eigen::VectorXd::Zero(_mean_function.h_params_size());
                for (int i_obs = 0; i_obs < _dim_out; ++i_obs)
                    for (size_t n_obs = 0; n_obs < n; n_obs++) {
                        grad += _obs_mean.col(i_obs).transpose() * _inv_kernel.col(n_obs) * _mean_function.grad(_samples[n_obs], *this).row(i_obs);
                    }

                return grad;
            }

            /// return the log likelihood (do not compute it -- return the last computed value)
            double get_log_lik() const { return _log_lik; }

            /// set the log likelihood (e.g. computed from outside)
            void set_log_lik(double log_lik) { _log_lik = log_lik; }

            /// compute and return the log probability of LOO CV
            double compute_log_loo_cv()
            {
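                // leave-one-out predictive log probability (see Rasmussen & Williams,
                // "Gaussian Processes for Machine Learning", Sec. 5.4.2):
                //   mu_i = y_i - alpha_i / K^{-1}_{ii},   sigma_i^2 = 1 / K^{-1}_{ii}
                //   log p_i = -0.5 * log(sigma_i^2) - (y_i - mu_i)^2 / (2 * sigma_i^2) - 0.5 * log(2*pi)
                // the vectorized expression below sums these terms over all samples
                // and output dimensions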
                // compute K^{-1} only if needed
                if (!_inv_kernel_updated) {
                    compute_inv_kernel();
                }

                Eigen::VectorXd inv_diag = _inv_kernel.diagonal().array().inverse();

                _log_loo_cv = (((-0.5 * (_alpha.array().square().array().colwise() * inv_diag.array())).array().colwise() - 0.5 * inv_diag.array().log().array()) - 0.5 * std::log(2 * M_PI)).colwise().sum().sum();

                return _log_loo_cv;
            }

            /// compute and return the gradient of the log probability of LOO CV w.r.t. the kernel parameters
            Eigen::VectorXd compute_kernel_grad_log_loo_cv()
            {
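                // gradient of the LOO-CV log probability (Rasmussen & Williams, Sec. 5.4.2):
                // with Z_j = K^{-1} * dK/dtheta_j,
                //   dL_LOO/dtheta_j = sum_i ( alpha_i * [Z_j alpha]_i
                //       - 0.5 * (1 + alpha_i^2 / K^{-1}_{ii}) * [Z_j K^{-1}]_{ii} ) / K^{-1}_{ii}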
                size_t n = _obs_mean.rows();
                size_t n_params = _kernel_function.h_params_size();

                // compute K^{-1} only if needed
                if (!_inv_kernel_updated) {
                    compute_inv_kernel();
                }

                Eigen::VectorXd grad = Eigen::VectorXd::Zero(n_params);
                Eigen::MatrixXd grads = Eigen::MatrixXd::Zero(n_params, _dim_out);

                // only compute half of the matrix (symmetrical matrix)
                // TO-DO: Make it better
                std::vector<std::vector<Eigen::VectorXd>> full_dk;
                for (size_t i = 0; i < n; i++) {
                    full_dk.push_back(std::vector<Eigen::VectorXd>());
                    for (size_t j = 0; j <= i; j++)
                        full_dk[i].push_back(_kernel_function.grad(_samples[i], _samples[j], i, j));
                    for (size_t j = i + 1; j < n; j++)
                        full_dk[i].push_back(Eigen::VectorXd::Zero(n_params));
                }
                for (size_t i = 0; i < n; i++)
                    for (size_t j = 0; j < i; ++j)
                        full_dk[j][i] = full_dk[i][j];

                Eigen::VectorXd inv_diag = _inv_kernel.diagonal().array().inverse();

                for (int j = 0; j < grad.size(); j++) {
                    Eigen::MatrixXd dKdTheta_j = Eigen::MatrixXd::Zero(n, n);
                    for (size_t i = 0; i < n; i++) {
                        for (size_t k = 0; k < n; k++)
                            dKdTheta_j(i, k) = full_dk[i][k](j);
                    }
                    Eigen::MatrixXd Zeta_j = _inv_kernel * dKdTheta_j;
                    Eigen::MatrixXd Zeta_j_alpha = Zeta_j * _alpha;
                    Eigen::MatrixXd Zeta_j_K = Zeta_j * _inv_kernel;

                    grads.row(j) = ((_alpha.array() * Zeta_j_alpha.array() - 0.5 * ((1. + _alpha.array().square().array().colwise() * inv_diag.array()).array().colwise() * Zeta_j_K.diagonal().array())).array().colwise() * inv_diag.array()).colwise().sum();

                    // for (size_t i = 0; i < n; i++)
                    //     grads.row(j).array() += (_alpha.row(i).array() * Zeta_j_alpha.row(i).array() - 0.5 * (1. + _alpha.row(i).array().square() / _inv_kernel.diagonal()(i)) * Zeta_j_K.diagonal()(i)) / _inv_kernel.diagonal()(i);
                }

                grad = grads.rowwise().sum();

                return grad;
            }

            /// return the LOO-CV log probability (do not compute it -- return the last computed value)
            double get_log_loo_cv() const { return _log_loo_cv; }

            /// set the LOO-CV log probability (e.g. computed from outside)
            void set_log_loo_cv(double log_loo_cv) { _log_loo_cv = log_loo_cv; }

            /// LLT matrix (from Cholesky decomposition)
            //const Eigen::LLT<Eigen::MatrixXd>& llt() const { return _llt; }
            const Eigen::MatrixXd& matrixL() const { return _matrixL; }

            const Eigen::MatrixXd& alpha() const { return _alpha; }

            /// return the list of samples that have been tested so far
            const std::vector<Eigen::VectorXd>& samples() const { return _samples; }

            bool inv_kernel_computed() { return _inv_kernel_updated; }

        protected:
            int _dim_in;
            int _dim_out;

            KernelFunction _kernel_function;
            MeanFunction _mean_function;

            std::vector<Eigen::VectorXd> _samples;
            Eigen::MatrixXd _observations;
            Eigen::MatrixXd _mean_vector;
            Eigen::MatrixXd _obs_mean;

            Eigen::MatrixXd _alpha;
            Eigen::VectorXd _mean_observation;

            Eigen::MatrixXd _kernel, _inv_kernel;

            Eigen::MatrixXd _matrixL;

            double _log_lik, _log_loo_cv;
            bool _inv_kernel_updated;

            HyperParamsOptimizer _hp_optimize;

            void _compute_obs_mean()
            {
                _mean_vector.resize(_samples.size(), _dim_out);
                for (int i = 0; i < _mean_vector.rows(); i++)
                    _mean_vector.row(i) = _mean_function(_samples[i], *this);
                _obs_mean = _observations - _mean_vector;
            }

            void _compute_full_kernel()
            {
                size_t n = _samples.size();
                _kernel.resize(n, n);

                // O(n^2) [should be negligible]
                for (size_t i = 0; i < n; i++)
                    for (size_t j = 0; j <= i; ++j)
                        _kernel(i, j) = _kernel_function(_samples[i], _samples[j], i, j);

                for (size_t i = 0; i < n; i++)
                    for (size_t j = 0; j < i; ++j)
                        _kernel(j, i) = _kernel(i, j);

                // O(n^3)
                _matrixL = Eigen::LLT<Eigen::MatrixXd>(_kernel).matrixL();

                this->_compute_alpha();

                // notify change of kernel
                _inv_kernel_updated = false;
            }

            void _compute_incremental_kernel()
            {
                // Incremental LLT
                // This part of the code is inspired by the Bayesopt library (cholesky_add_row function).
                // However, the mathematical foundations can easily be retrieved by detailing the equations of the
                // extended L matrix that produces the desired kernel.
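                // Writing the extended matrices as K_new = [[K, k], [k^T, k_nn]] and
                // L_new = [[L, 0], [l^T, l_nn]], L_new * L_new^T = K_new gives:
                //   L * l = k                      (forward substitution, the loop below)
                //   l_nn = sqrt(k_nn - l^T * l)    (the last line before _compute_alpha)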

                size_t n = _samples.size();
                _kernel.conservativeResize(n, n);

                for (size_t i = 0; i < n; ++i) {
                    _kernel(i, n - 1) = _kernel_function(_samples[i], _samples[n - 1], i, n - 1);
                    _kernel(n - 1, i) = _kernel(i, n - 1);
                }

                _matrixL.conservativeResizeLike(Eigen::MatrixXd::Zero(n, n));

                double L_j;
                for (size_t j = 0; j < n - 1; ++j) {
                    L_j = _kernel(n - 1, j) - (_matrixL.block(j, 0, 1, j) * _matrixL.block(n - 1, 0, 1, j).transpose())(0, 0);
                    _matrixL(n - 1, j) = (L_j) / _matrixL(j, j);
                }

                L_j = _kernel(n - 1, n - 1) - (_matrixL.block(n - 1, 0, 1, n - 1) * _matrixL.block(n - 1, 0, 1, n - 1).transpose())(0, 0);
                _matrixL(n - 1, n - 1) = sqrt(L_j);

                this->_compute_alpha();

                // notify change of kernel
                _inv_kernel_updated = false;
            }

            void _compute_alpha()
            {
                // alpha = K^{-1} * this->_obs_mean;
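                // solved as alpha = L^{-T} * (L^{-1} * _obs_mean) with the Cholesky
                // factor L, i.e., two triangular solves instead of forming K^{-1}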
                Eigen::TriangularView<Eigen::MatrixXd, Eigen::Lower> triang = _matrixL.template triangularView<Eigen::Lower>();
                _alpha = triang.solve(_obs_mean);
                triang.adjoint().solveInPlace(_alpha);
            }

            Eigen::VectorXd _mu(const Eigen::VectorXd& v, const Eigen::VectorXd& k) const
            {
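                // mu(v) = k(v)^T * alpha + m(v), where k(v) holds the kernel values
                // between v and every sample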
                return (k.transpose() * _alpha) + _mean_function(v, *this).transpose();
            }

            double _sigma(const Eigen::VectorXd& v, const Eigen::VectorXd& k) const
            {
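                // sigma^2(v) = k(v,v) - k(v)^T K^{-1} k(v), computed with
                // z = L^{-1} k(v) so that k(v)^T K^{-1} k(v) = z^T z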
                Eigen::VectorXd z = _matrixL.triangularView<Eigen::Lower>().solve(k);
                double res = _kernel_function(v, v) - z.dot(z);

                return (res <= std::numeric_limits<double>::epsilon()) ? 0 : res;
            }

            Eigen::VectorXd _compute_k(const Eigen::VectorXd& v) const
            {
                Eigen::VectorXd k(_samples.size());
                for (int i = 0; i < k.size(); i++)
                    k[i] = _kernel_function(_samples[i], v);
                return k;
            }
        };
        /// GPBasic is a GP with a "mean data" mean function, an Exponential kernel,
        /// and NO hyper-parameter optimization
        template <typename Params>
        using GPBasic = GP<Params, kernel::Exp<Params>, mean::Data<Params>, gp::NoLFOpt<Params>>;

        /// GPOpt is a GP with a "mean data" mean function, Exponential kernel with Automatic Relevance
        /// Determination (ARD), and hyper-parameter optimization based on Rprop
        template <typename Params>
        using GPOpt = GP<Params, kernel::SquaredExpARD<Params>, mean::Data<Params>, gp::KernelLFOpt<Params>>;
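        // A sketch of hyper-parameter optimization with the GPOpt alias (assuming a
        // Params structure suitable for the SquaredExpARD kernel and the Rprop optimizer):
        //
        //     limbo::model::GPOpt<Params> gp(3, 1);
        //     gp.compute(samples, observations);
        //     gp.optimize_hyperparams(); // maximizes the log likelihood w.r.t. the kernel parameters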
    }
}

#endif