three_layer_net.py
from __future__ import print_function
import numpy as np
import matplotlib.pyplot as plt
from past.builtins import xrange
from layers import *
class ThreeLayerNet(object):
"""
A three-layer fully-connected neural network with a residual block.
The net has an input dimension of N, a hidden layer dimension of H,
a hidden2 layer dimension of H2, and performs classification over C classes.
We train the network with a svm loss function and L2 regularization on the
weight matrices. The network uses a ReLU nonlinearity after the first and second
fully connected layer.
In other words, the network has the following architecture:
|- residual fully connected layer -|
input - fully connected layer - ReLU - fully connected layer - -> relu - FC - SVM
The outputs of the second fully-connected layer are the scores for each class.
"""
def __init__(self, input_size, hidden_size, hidden2_size, output_size, std=1e-4, use_Res=False):
"""
Initialize the model. Weights are initialized to small random values and
biases are initialized to zero. Weights and biases are stored in the
variable self.params, which is a dictionary with the following keys:
W1: First layer weights; has shape (D, H)
b1: First layer biases; has shape (H,)
W2: Second layer weights; has shape (H, H2)
b2: Second layer biases; has shape (H2,)
        W3: Third layer weights; has shape (H2, C)
        b3: Third layer biases; has shape (C,)
        Wr: Residual layer weights; has shape (D, H2)
        br: Residual layer biases; has shape (H2,)
Inputs:
- input_size: The dimension D of the input data.
- hidden_size: The number of neurons H in the first hidden layer.
- hidden2_size: The number of neurons H2 in the second hidden layer.
- output_size: The number of classes C.
        - use_Res: If True, add a residual (shortcut) fully connected layer from the
          input directly to the second hidden layer.
"""
self.params = {}
self.params['W1'] = std * np.random.randn(input_size, hidden_size)
self.params['b1'] = np.zeros(hidden_size)
self.params['W2'] = std * np.random.randn(hidden_size, hidden2_size)
self.params['b2'] = np.zeros(hidden2_size)
self.params['W3'] = std * np.random.randn(hidden2_size, output_size)
self.params['b3'] = np.zeros(output_size)
self.use_Res = use_Res
        if use_Res:
self.params['Wr'] = std * np.random.randn(input_size, hidden2_size)
self.params['br'] = np.zeros(hidden2_size)
def loss(self, X, y=None, reg=0.0):
"""
        Compute the loss and gradients for a three-layer fully connected neural
        network.
Inputs:
- X: Input data of shape (N, D). Each X[i] is a training sample.
- y: Vector of training labels. y[i] is the label for X[i], and each y[i] is
an integer in the range 0 <= y[i] < C. This parameter is optional; if it
is not passed then we only return scores, and if it is passed then we
instead return the loss and gradients.
- reg: Regularization strength.
Returns:
If y is None, return a matrix scores of shape (N, C) where scores[i, c] is
the score for class c on input X[i].
If y is not None, instead return a tuple of:
- loss: Loss (data loss and regularization loss) for this batch of training
samples.
- grads: Dictionary mapping parameter names to gradients of those parameters
with respect to the loss function; has the same keys as self.params.
"""
# Unpack variables from the params dictionary
W1, b1 = self.params['W1'], self.params['b1']
W2, b2 = self.params['W2'], self.params['b2']
W3, b3 = self.params['W3'], self.params['b3']
        if self.use_Res:
Wr, br = self.params['Wr'], self.params['br']
N, D = X.shape
scores = None
#############################################################################
# TODO: Perform the forward pass, computing the class scores for the input. #
# Store the result in the scores variable, which should be an array of #
# shape (N, C). #
#############################################################################
        # First FC layer + ReLU, then second FC layer (pre-activation)
        layer1_relu_out, cache1_relu = affine_relu_forward(X, W1, b1)
        layer2_out, cache2 = affine_forward(layer1_relu_out, W2, b2)
        if self.use_Res:
            # Residual shortcut: project the input straight to the second hidden
            # layer and add it to the pre-activation output before the ReLU.
            layerr_out, cacher = affine_forward(X, Wr, br)
            layer2_out += layerr_out
        layer2_relu_out, cache2_relu = relu_forward(layer2_out)
        # Third FC layer produces the class scores
        layer3_out, cache3 = affine_forward(layer2_relu_out, W3, b3)
        scores = layer3_out
#############################################################################
# END OF YOUR CODE #
#############################################################################
# If the targets are not given then jump out, we're done
if y is None:
return scores
# Compute the loss
loss = None
#############################################################################
        # TODO: Finish the forward pass, and compute the loss. This should include #
        # both the data loss and L2 regularization for W1, W2, W3 (and Wr when the #
        # residual block is enabled). Store the result in the variable loss, which #
        # should be a scalar. Use the SVM classifier loss.                          #
#############################################################################
        # Data (hinge/SVM) loss plus L2 regularization on all weight matrices
        data_loss, dout = svm_loss(scores, y)
        reg_loss = 0.5 * reg * (np.sum(W1 * W1) + np.sum(W2 * W2) + np.sum(W3 * W3))
        if self.use_Res:
            reg_loss += 0.5 * reg * np.sum(Wr * Wr)
        loss = data_loss + reg_loss
#############################################################################
# END OF YOUR CODE #
#############################################################################
# Backward pass: compute gradients
grads = {}
#############################################################################
# TODO: Compute the backward pass, computing the derivatives of the weights #
# and biases. Store the results in the grads dictionary. For example, #
# grads['W1'] should store the gradient on W1, and be a matrix of same size #
#############################################################################
        # Backpropagate through FC3, the second ReLU, FC2, and finally FC1+ReLU.
        dlayer2_relu_out, dW3, db3 = affine_backward(dout, cache3)
        dlayer2_out = relu_backward(dlayer2_relu_out, cache2_relu)
        dlayer1_relu_out, dW2, db2 = affine_backward(dlayer2_out, cache2)
        dx, dW1, db1 = affine_relu_backward(dlayer1_relu_out, cache1_relu)
        grads['W1'] = dW1 + reg * W1
        grads['b1'] = db1
        grads['W2'] = dW2 + reg * W2
        grads['b2'] = db2
        grads['W3'] = dW3 + reg * W3
        grads['b3'] = db3
        if self.use_Res:
            # The residual branch receives the same upstream gradient as the second
            # FC layer because their outputs were summed in the forward pass. Include
            # the regularization term to match the reg_loss computed above.
            dx_res, dWr, dbr = affine_backward(dlayer2_out, cacher)
            grads['Wr'] = dWr + reg * Wr
            grads['br'] = dbr
#############################################################################
# END OF YOUR CODE #
#############################################################################
return loss, grads
def train(self, X, y, X_val, y_val,
learning_rate=1e-3, learning_rate_decay=0.95,
reg=5e-6, num_iters=100,
batch_size=200, verbose=False):
"""
Train this neural network using stochastic gradient descent.
Inputs:
- X: A numpy array of shape (N, D) giving training data.
        - y: A numpy array of shape (N,) giving training labels; y[i] = c means that
X[i] has label c, where 0 <= c < C.
- X_val: A numpy array of shape (N_val, D) giving validation data.
- y_val: A numpy array of shape (N_val,) giving validation labels.
- learning_rate: Scalar giving learning rate for optimization.
- learning_rate_decay: Scalar giving factor used to decay the learning rate
after each epoch.
- reg: Scalar giving regularization strength.
- num_iters: Number of steps to take when optimizing.
- batch_size: Number of training examples to use per step.
- verbose: boolean; if true print progress during optimization.
"""
num_train = X.shape[0]
        # Use integer division so the epoch check below behaves the same under Python 3
        iterations_per_epoch = max(num_train // batch_size, 1)
# Use SGD to optimize the parameters in self.model
loss_history = []
train_acc_history = []
val_acc_history = []
for it in xrange(num_iters):
X_batch = None
y_batch = None
#########################################################################
# TODO: Create a random minibatch of training data and labels, storing #
# them in X_batch and y_batch respectively. #
#########################################################################
            # Sample a random minibatch (without replacement) from the training data
            indices = np.random.choice(num_train, batch_size, replace=False)
            X_batch = X[indices]
            y_batch = y[indices]
#########################################################################
# END OF YOUR CODE #
#########################################################################
# Compute loss and gradients using the current minibatch
loss, grads = self.loss(X_batch, y=y_batch, reg=reg)
loss_history.append(loss)
#########################################################################
# TODO: Use the gradients in the grads dictionary to update the #
# parameters of the network (stored in the dictionary self.params) #
# using stochastic gradient descent. You'll need to use the gradients #
# stored in the grads dictionary defined above. #
#########################################################################
            # Vanilla SGD update: step each parameter against its gradient
            self.params['W1'] -= learning_rate * grads['W1']
            self.params['W2'] -= learning_rate * grads['W2']
            self.params['W3'] -= learning_rate * grads['W3']
            self.params['b1'] -= learning_rate * grads['b1']
            self.params['b2'] -= learning_rate * grads['b2']
            self.params['b3'] -= learning_rate * grads['b3']
            if self.use_Res:
                self.params['Wr'] -= learning_rate * grads['Wr']
                self.params['br'] -= learning_rate * grads['br']
#########################################################################
# END OF YOUR CODE #
#########################################################################
if verbose and it % 100 == 0:
print('iteration %d / %d: loss %f val_acc %f train_acc %f' % (it,
num_iters, loss, (self.predict(X_val) == y_val).mean(),
(self.predict(X) == y).mean()))
# Every epoch, check train and val accuracy and decay learning rate.
if it % iterations_per_epoch == 0:
# Check accuracy
train_acc = (self.predict(X_batch) == y_batch).mean()
val_acc = (self.predict(X_val) == y_val).mean()
train_acc_history.append(train_acc)
val_acc_history.append(val_acc)
# Decay learning rate
learning_rate *= learning_rate_decay
return {
'loss_history': loss_history,
'train_acc_history': train_acc_history,
'val_acc_history': val_acc_history,
}
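
    # The history lists returned by train() are handy for plotting learning curves,
    # e.g. (illustrative sketch using the matplotlib import at the top of the file):
    #
    #   stats = net.train(X, y, X_val, y_val, num_iters=1000, verbose=True)
    #   plt.plot(stats['loss_history'])
    #   plt.xlabel('iteration'); plt.ylabel('training loss'); plt.show()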
def predict(self, X):
"""
        Use the trained weights of this three-layer network to predict labels for
data points. For each data point we predict scores for each of the C
classes, and assign each data point to the class with the highest score.
Inputs:
- X: A numpy array of shape (N, D) giving N D-dimensional data points to
classify.
Returns:
- y_pred: A numpy array of shape (N,) giving predicted labels for each of
the elements of X. For all i, y_pred[i] = c means that X[i] is predicted
to have class c, where 0 <= c < C.
"""
y_pred = None
###########################################################################
# TODO: Implement this function; it should be VERY simple! #
###########################################################################
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']
        W3, b3 = self.params['W3'], self.params['b3']
        if self.use_Res:
            Wr, br = self.params['Wr'], self.params['br']
        # Same forward pass as in loss(); the caches are not needed here.
        layer1_relu_out, _ = affine_relu_forward(X, W1, b1)
        layer2_out, _ = affine_forward(layer1_relu_out, W2, b2)
        if self.use_Res:
            layerr_out, _ = affine_forward(X, Wr, br)
            layer2_out += layerr_out
        layer2_relu_out, _ = relu_forward(layer2_out)
        layer3_out, _ = affine_forward(layer2_relu_out, W3, b3)
        scores = layer3_out
        y_pred = np.argmax(scores, axis=1)
###########################################################################
# END OF YOUR CODE #
###########################################################################
return y_pred
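

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only). The toy sizes and the random data
# below are made up for the example and are not part of the original training
# setup; they just exercise the forward/backward passes end to end, including
# a quick centered-difference check of one analytic gradient entry.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    np.random.seed(0)
    D, H, H2, C = 32, 20, 10, 3
    X_train = np.random.randn(500, D)
    y_train = np.random.randint(C, size=500)
    X_val = np.random.randn(100, D)
    y_val = np.random.randint(C, size=100)

    net = ThreeLayerNet(D, H, H2, C, use_Res=True)

    # Numeric vs. analytic gradient on a single entry of W1
    _, grads = net.loss(X_val, y_val, reg=1e-5)
    h, old = 1e-5, net.params['W1'][0, 0]
    net.params['W1'][0, 0] = old + h
    loss_plus, _ = net.loss(X_val, y_val, reg=1e-5)
    net.params['W1'][0, 0] = old - h
    loss_minus, _ = net.loss(X_val, y_val, reg=1e-5)
    net.params['W1'][0, 0] = old
    print('numeric dW1[0, 0] %e vs analytic %e'
          % ((loss_plus - loss_minus) / (2 * h), grads['W1'][0, 0]))

    # Short training run on the random data
    stats = net.train(X_train, y_train, X_val, y_val,
                      learning_rate=1e-1, reg=1e-5,
                      num_iters=200, batch_size=100, verbose=True)
    print('final training accuracy: %f' % (net.predict(X_train) == y_train).mean())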