forked from soulmachine/machine-learning-cheat-sheet
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchapterOptimization.tex
190 lines (142 loc) · 7.63 KB
/
chapterOptimization.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
\chapter{Optimization methods}
\label{chap:Optimization-methods}
\section{Convexity}
\label{sec:Convexity}
\begin{definition}
(\textbf{Convex set}) We say a set $\mathcal{S}$ is convex if for any $\vec{x}_1, \vec{x}_2 \in \mathcal{S}$, we have
\begin{equation}
\lambda\vec{x}_1+(1-\lambda)\vec{x}_2 \in \mathcal{S}, \forall \lambda \in [0,1]
\end{equation}
\end{definition}
\begin{definition}
(\textbf{Convex function}) A function $f(\vec{x})$ is called convex if its \textbf{epigraph}(the set of points above the function) defines a convex set. Equivalently, a function $f(\vec{x})$ is called convex if it is defined on a convex set and if, for any $\vec{x}_1, \vec{x}_2 \in \mathcal{S}$, and any $\lambda \in [0,1]$, we have
\begin{equation}
f(\lambda\vec{x}_1+(1-\lambda)\vec{x}_2) \leq \lambda f(\vec{x}_1)+(1-\lambda)f(\vec{x}_2)
\end{equation}
\end{definition}
\begin{definition}
A function $f(\vec{x})$ is said to be \textbf{strictly convex} if the inequality is strict
\begin{equation}
f(\lambda \vec{x}_1 + (1 - \lambda)\vec{x}_2) < \lambda f(\vec{x}_1) + (1 - \lambda)f(\vec{x}_2)
\end{equation}
\end{definition}
\begin{definition}
A function $f(\vec{x})$ is said to be (strictly) \textbf{concave} if $-f(\vec{x})$ is (strictly) convex.
\end{definition}
\begin{theorem}
If $f(x)$ is twice differentiable on $[a, b]$ and $f''(x) \geq 0$ on $[a, b]$ then $f(x)$ is convex on $[a, b]$.
\end{theorem}
\begin{proposition}
$\log(x)$ is strictly convex on $(0, \infty)$.
\end{proposition}
Intuitively, a (strictly) convex function has a “bowl shape”, and hence has a unique global minimum $x^*$ corresponding to the bottom of the bowl. Hence its second derivative must be positive everywhere, $\frac{\mathrm{d}^2}{\mathrm{d}x^2}f(x)>0$. A twice-continuously differentiable, multivariate function $f$ is convex iff its Hessian is positive definite for all $\vec{x}$. In the machine learning context, the function $f$ often corresponds to the NLL.
Models where the NLL is convex are desirable, since this means we can always find the globally optimal MLE. We will see many examples of this later in the book. However, many models of interest will not have concave likelihoods. In such cases, we will discuss ways to derive locally optimal parameter estimates.
\section{Gradient descent}
\label{sec:Gradient-descent}
\subsection{Stochastic gradient descent}
\begin{algorithm}[htbp]
\SetKwInOut{Input}{input}\SetKwInOut{Output}{output}
\Input{Training data $\mathcal{D}=\left\{(\vec{x}_i,y_i) | i=1:N\right\}$}
\Output{A linear model: $y_i=\vec{\theta}^T\vec{x}$}
$\vec{w} \leftarrow 0;\; b \leftarrow 0;\; k \leftarrow 0$\;
\While{no mistakes made within the for loop}{
\For{$i\leftarrow 1$ \KwTo $N$}{
\If{$y_i(\vec{w} \cdot \vec{x}_i+b) \leq 0$}{
$\vec{w} \leftarrow \vec{w}+\eta y_i \vec{x}_i$\;
$b \leftarrow b+\eta y_i$\;
$k \leftarrow k+1$\;
}
}
}
\caption{Stochastic gradient descent}
\end{algorithm}
\subsection{Batch gradient descent}
\subsection{Line search}
The \textbf{line search}\footnote{\url{http://en.wikipedia.org/wiki/Line_search}} approach first finds a descent direction along which the objective function $f$ will be reduced and then computes a step size that determines how far $\vec{x}$ should move along that direction. The descent direction can be computed by various methods, such as gradient descent(Section \ref{sec:Gradient-descent}), Newton's method(Section \ref{sec:Newtons-method}) and Quasi-Newton method(Section \ref{sec:Quasi-Newton-method}). The step size can be determined either exactly or inexactly.
\subsection{Momentum term}
\section{Lagrange duality}
\subsection{Primal form}
Consider the following, which we'll call the \textbf{primal} optimization problem:
\begin{eqnarray}
xyz
\end{eqnarray}
\subsection{Dual form}
\section{Newton's method}
\label{sec:Newtons-method}
\begin{align}
f(\vec{x})& \approx f(\vec{x}_k)+\vec{g}_k^T(\vec{x}-\vec{x}_k)+\dfrac{1}{2}(\vec{x}-\vec{x}_k)^T\vec{H}_k(\vec{x}-\vec{x}_k) \nonumber \\
\text{where } & \vec{g}_k \triangleq \vec{g}(\vec{x}_k)=f'(\vec{x}_k), \vec{H}_k \triangleq \vec{H}(\vec{x}_k), \nonumber \\
& \vec{H}(\vec{x}) \triangleq \left[\dfrac{\partial^2 f}{\partial x_i \partial x_j}\right]_{D \times D} \quad \text{(Hessian matrix)} \nonumber \\
f'(\vec{x})& = \vec{g}_k+\vec{H}_k(\vec{x}-\vec{x}_k)=0 \Rightarrow \label{eqn:newton-stationary-point} \\
\vec{x}_{k+1}& = \vec{x}_k-\vec{H}_k^{-1}\vec{g}_k
\end{align}
\begin{algorithm}[htbp]
Initialize $\vec{x}_0$ \\
\While{(!convergency)} {
Evaluate $\vec{g}_k=\nabla f(\vec{x}_k)$ \\
Evaluate $\vec{H}_k=\nabla^2 f(\vec{x}_k)$ \\
$\vec{d}_k=-\vec{H}_k^{-1}\vec{g}_k$ \\
Use line search to find step size $\eta_k$ along $\vec{d}_k$ \\
$\vec{x}_{k+1}=\vec{x}_k+\eta_k\vec{d}_k$
}
\caption{Newton’s method for minimizing a strictly convex function}
\end{algorithm}
\section{Quasi-Newton method}
\label{sec:Quasi-Newton-method}
From Equation \ref{eqn:newton-stationary-point} we can infer out the \textbf{quasi-Newton condition} as follows:
\begin{align}
f'(\vec{x})-\vec{g}_k & = \vec{H}_k(\vec{x}-\vec{x}_k) \nonumber \\
\vec{g}_{k-1}-\vec{g}_k & = \vec{H}_k(\vec{x}_{k-1}-\vec{x}_k) \Rightarrow \nonumber \\
\vec{g}_k- \vec{g}_{k-1} & = \vec{H}_k(\vec{x}_k-\vec{x}_{k-1}) \nonumber \\
\vec{g}_{k+1}- \vec{g}_k & = \vec{H}_{k+1}(\vec{x}_{k+1}-\vec{x}_k) \quad \text{(quasi-Newton condition)} \label{eqn:quasi-Newton-condition}
\end{align}
The idea is to replace $\vec{H}_k^{-1}$ with a approximation $\vec{B}_k$, which satisfies the following properties:
\begin{enumerate}
\item{$\vec{B}_k$ must be symmetric}
\item{$\vec{B}_k$ must satisfies the quasi-Newton condition, i.e., $\vec{g}_{k+1} - \vec{g}_k= \vec{B}_{k+1}(\vec{x}_{k+1}-\vec{x}_k)$. \\
Let $\vec{y}_k=\vec{g}_{k+1}- \vec{g}_k$, $\vec{\delta}_k=\vec{x}_{k+1}-\vec{x}_k$, then
\begin{equation}
\vec{B}_{k+1}\vec{y}_k=\vec{\delta}_k \label{eqn:secant-equation}
\end{equation}
}
\item{Subject to the above, $\vec{B}_k$ should be as close as possible to $\vec{B_{k-1}}$.}
\end{enumerate}
Note that we did not require that $\vec{B}_k$ be positive definite. That is because we can show that it must be
positive definite if $\vec{B_{k-1}}$ is. Therefore, as long as the initial Hessian approximation $\vec{B}_0$ is positive definite,
all $\vec{B}_k$ are, by induction.
\subsection{DFP}
Updating rule:
\begin{equation}
\vec{B}_{k+1}=\vec{B}_k+\vec{P}_k+\vec{Q}_k
\end{equation}
From Equation \ref{eqn:secant-equation} we can get
\begin{equation*}
\vec{B}_{k+1}\vec{y}_k=\vec{B}_k\vec{y}_k+\vec{P}_k\vec{y}_k+\vec{Q}_k\vec{y}_k=\vec{\delta}_k
\end{equation*}
To make the equation above establish, just let
\begin{align*}
\vec{P}_k\vec{y}_k & = \vec{\delta}_k \\
\vec{Q}_k\vec{y}_k & = -\vec{B}_k\vec{y}_k
\end{align*}
In DFP algorithm, $\vec{P}_k$ and $\vec{Q}_k$ are
\begin{align}
\vec{P}_k &= \dfrac{\vec{\delta}_k\vec{\delta}_k^T}{\vec{\delta}_k^T\vec{y}_k} \\
\vec{Q}_k &= -\dfrac{\vec{B}_k\vec{y}_k\vec{y}_k^T\vec{B}_k}{\vec{y}_k^T\vec{B}_k\vec{y}_k}
\end{align}
\subsection{BFGS}
Use $\vec{B}_k$ as a approximation to $\vec{H}_k$, then the quasi-Newton condition becomes
\begin{equation}
\vec{B}_{k+1}\vec{\delta}_k=\vec{y}_k
\end{equation}
The updating rule is similar to DFP, but $\vec{P}_k$ and $\vec{Q}_k$ are different. Let
\begin{align*}
\vec{P}_k\vec{\delta}_k & = \vec{y}_k \\
\vec{Q}_k\vec{\delta}_k & = -\vec{B}_k\vec{\delta}_k
\end{align*}
Then
\begin{align}
\vec{P}_k &= \dfrac{\vec{y}_k\vec{y}_k^T}{\vec{y}_k^T\vec{\delta}_k} \\
\vec{Q}_k &= -\dfrac{\vec{B}_k\vec{\delta}_k\vec{\delta}_k^T\vec{B}_k}{\vec{\delta}_k^T\vec{B}_k\vec{\delta}_k}
\end{align}
\subsection{Broyden}
Broyden's algorithm is a linear combination of DFP and BFGS.