-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathgpw03fastlik.m
91 lines (78 loc) · 2.72 KB
/
gpw03fastlik.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
function [fX, dfX] = gpw03fastlik(X, input, t, dflag)
global invQ logdetQ; % get from gpw03lik to avoid recomputation
% gpw03fastlik: Compute minus log likelihood and its derivatives with respect to
% warping parameters only, for the warped Gaussian process with
% neural net covariance. Return zero gradients for covariance hyperparameters.
%
% Based on code for GPs by Carl Edward Rasmussen (gp03lik)
%
% X is the vector of parameters of length D+3+3*num, where num is
% the number of tanh functions in the warp (see warp):
%
% X = [ log(S_1)
%       log(S_2)
%       .
%       log(S_D+1)
%       log(v1)
%       log(v0)
%       a
%       b
%       c ]
%
% S,v are the hyperparameters of the covariance (see gp01lik)
% a,b,c are vectors of warping parameters of length num (see warp)
%
% input is an (n x D) array of data
% t is a (n x 1) vector of target values
% dflag == 'g' outputs only gradients; dflag == 'f' outputs only
% function values; anything else (default) outputs both
%
% NOTE: relies on globals invQ (inverse covariance) and logdetQ
% (log determinant) having been set by a prior call to gpw03lik.
%
% Use in conjunction with gradient based minimizer such as Carl
% Edward Rasmussen's "minimize":
%
% [X, fX, i] = minimize(X, 'gpw03fastlik', length, input, t, dflag)
if nargin < 4, dflag = 'default'; end
[n, D] = size(input);          % number of examples and dimension of input space
num = (length(X) - D - 3)/3;   % number of tanh functions in warp
expX = exp(X);
% prior means and variances in hyperparameters
% alter these to do MAP rather than ML optimization
sX = repmat(1e15,length(X),1);
muX = zeros(size(X));
%sX(D+4+num:D+3+2*num) = 4;
for i = 1:num % the parameters for warping function
  ea(i,1) = expX(D+3+i); eb(i,1) = expX(D+3+num+i);
  c(i,1) = X(D+3+2*num+i);
end
z = warp(t,ea,eb,c);           % warp target values onto latent variable z
invQz = invQ*z;
w = ones(n,1);                 % w accumulates the warp derivative dz/dt
for i = 1:num % various useful constructs
  s{i} = eb(i)*(t + c(i));
  r{i} = tanh(s{i});
  y{i} = 1 - r{i}.^2;          % sech^2 term: d(tanh)/ds
  w = w + ea(i)*eb(i).*y{i};
end
% calculate minus log likelihood ...
% strcmp (not ==) compares the flag strings: elementwise == on char
% arrays of different lengths errors or gives a logical vector, and
% only worked before by accidental scalar broadcasting.
if strcmp(dflag, 'default') || strcmp(dflag, 'f')
  jacob = sum(log(w)); % jacobian term from the change of variables t -> z
  fX = 0.5*logdetQ + 0.5*z'*invQz + 0.5*n*log(2*pi) - jacob + ...
       0.5*sum(((X-muX).^2)./sX);% + log(2*pi*sX));
else
  fX = [];
end
% ... and its partial derivatives
if strcmp(dflag, 'default') || strcmp(dflag, 'g')
  dfX = zeros(D+3+3*num,1); % set the size of the derivative vector;
                            % covariance-hyperparameter slots stay zero
  for i = 1:num % derivatives wrt warping parameters
    dfX(D+3+i) = ea(i)*(r{i}'*invQz - sum(eb(i).*y{i}./w));
    dfX(D+3+num+i) = eb(i)*((ea(i)*(t+c(i)).*y{i})'*invQz - sum((ea(i).*y{i} - ea(i)*eb(i)*2.*y{i}.*r{i}.*(t+c(i)))./w));
    dfX(D+3+2*num+i) = (ea(i)*eb(i).*y{i})'*invQz + sum(ea(i)*eb(i)^2*2.*y{i}.*r{i}./w);
  end
  for d = D+4:length(X) % add prior (MAP) gradient contribution
    dfX(d) = dfX(d) + (X(d)-muX(d))/sX(d);
  end
else
  dfX = [];
end