from numpy import *
import itertools
import time
import sys
# Import NN utils
from nn.base import NNBase
from nn.math import softmax, sigmoid
from nn.math import MultinomialSampler, multinomial_sample
from misc import random_weight_matrix

class RNNLM(NNBase):
    """
    Implements an RNN language model of the form:

        h(t) = sigmoid(H * h(t-1) + L[x(t)])
        y(t) = softmax(U * h(t))

    where y(t) predicts the next word in the sequence.

    U = |V| x dim(h) matrix of output vectors
    L = |V| x dim(h) matrix of input vectors

    You should initialize each U[i,j] and L[i,j]
    as Gaussian noise with mean 0 and variance 0.1.

    Arguments:
        L0 : initial input word vectors
        U0 : initial output word vectors
        alpha : default learning rate
        bptt : number of backprop timesteps
    """

    def __init__(self, L0, U0=None,
                 alpha=0.005, rseed=10, bptt=1):

        self.hdim = L0.shape[1] # word vector dimensions
        self.vdim = L0.shape[0] # vocab size
        param_dims = dict(H = (self.hdim, self.hdim),
                          U = L0.shape)
        # note that only L gets sparse updates
        param_dims_sparse = dict(L = L0.shape)
        NNBase.__init__(self, param_dims, param_dims_sparse)

        #### YOUR CODE HERE ####

        # Initialize word vectors:
        # either copy the passed L0 and U0 (and initialize in your notebook)
        # or initialize with Gaussian noise here.

        # Initialize the H matrix, as with W and U in part 1.

        #### END YOUR CODE ####
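        # A minimal sketch of one possible initialization, not part of the
        # starter code. It assumes NNBase exposes self.params / self.sparams
        # containers (mirroring the self.grads / self.sgrads used in _acc_grads)
        # and that random_weight_matrix(m, n) from misc returns an m x n matrix.
        self.bptt = bptt
        self.alpha = alpha
        random.seed(rseed)
        # Copy the provided word vectors; fall back to Gaussian noise with
        # mean 0 and variance 0.1 for U if no U0 is given.
        self.sparams.L = L0.copy()
        self.params.U = U0.copy() if U0 is not None else sqrt(0.1) * random.randn(*L0.shape)
        self.params.H = random_weight_matrix(self.hdim, self.hdim)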

    def _acc_grads(self, xs, ys):
        """
        Accumulate gradients, given a pair of training sequences:
        xs = [<indices>] # input words
        ys = [<indices>] # output words (to predict)

        Your code should update self.grads and self.sgrads,
        in order for gradient_check and training to work.
        So, for example:
        self.grads.H += (your gradient dJ/dH)
        self.sgrads.L[i] = (gradient dJ/dL[i]) # update row

        Per the handout, you should:
            - make predictions by running forward in time
              through the entire input sequence
            - for *each* output word in ys, compute the
              gradients with respect to the cross-entropy
              loss for that output word
            - run backpropagation-through-time for self.bptt
              timesteps, storing grads in self.grads (for H, U)
              and self.sgrads (for L)

        You'll want to store your predictions \hat{y}(t)
        and the hidden layer values h(t) as you run forward,
        so that you can access them during backpropagation.

        At time 0, you should initialize the hidden layer to
        be a vector of zeros.
        """
        # Expect xs as list of indices
        ns = len(xs)

        # make matrix here of corresponding h(t)
        # hs[-1] = initial hidden state (zeros)
        hs = zeros((ns+1, self.hdim))
        # predicted probas
        ps = zeros((ns, self.vdim))

        #### YOUR CODE HERE ####

        ##
        # Forward propagation

        ##
        # Backward propagation through time

        #### END YOUR CODE ####
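        # A hedged sketch of one way to fill this in, not the official solution.
        # It assumes self.params.H / self.params.U / self.sparams.L hold the
        # current weights and that sigmoid / softmax operate on 1-D arrays.

        # Forward propagation: hs[t] = sigmoid(H h(t-1) + L[x(t)]),
        # ps[t] = softmax(U h(t)); hs[-1] is the zero initial state.
        for t in xrange(ns):
            hs[t] = sigmoid(self.params.H.dot(hs[t-1]) + self.sparams.L[xs[t]])
            ps[t] = softmax(self.params.U.dot(hs[t]))

        # Backward propagation through time, truncated to self.bptt steps.
        for t in xrange(ns):
            # Cross-entropy error at the softmax output: y_hat - onehot(ys[t])
            delta_out = ps[t].copy()
            delta_out[ys[t]] -= 1.0
            self.grads.U += outer(delta_out, hs[t])
            # Error flowing into the hidden layer, through the sigmoid.
            delta = self.params.U.T.dot(delta_out) * hs[t] * (1.0 - hs[t])
            for step in xrange(t, max(-1, t - self.bptt), -1):
                self.grads.H += outer(delta, hs[step-1])
                self.sgrads.L[xs[step]] = delta    # sparse row update for L
                delta = self.params.H.T.dot(delta) * hs[step-1] * (1.0 - hs[step-1])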

    def grad_check(self, x, y, outfd=sys.stderr, **kwargs):
        """
        Wrapper for gradient check on RNNs;
        ensures that backprop-through-time is run to completion,
        computing the full gradient for the loss as summed over
        the input sequence and predictions.

        Do not modify this function!
        """
        bptt_old = self.bptt
        self.bptt = len(y)
        print >> outfd, "NOTE: temporarily setting self.bptt = len(y) = %d to compute true gradient." % self.bptt
        NNBase.grad_check(self, x, y, outfd=outfd, **kwargs)
        self.bptt = bptt_old
        print >> outfd, "Reset self.bptt = %d" % self.bptt

    def compute_seq_loss(self, xs, ys):
        """
        Compute the total cross-entropy loss
        for an input sequence xs and output
        sequence (labels) ys.

        You should run the RNN forward,
        compute cross-entropy loss at each timestep,
        and return the sum of the point losses.
        """
        J = 0
        #### YOUR CODE HERE ####

        #### END YOUR CODE ####
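        # A hedged sketch, not the official solution: run the same forward pass
        # as in _acc_grads and sum -log P(ys[t]) over timesteps.
        h = zeros(self.hdim)
        for t in xrange(len(xs)):
            h = sigmoid(self.params.H.dot(h) + self.sparams.L[xs[t]])
            p = softmax(self.params.U.dot(h))
            J -= log(p[ys[t]])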
        return J

    def compute_loss(self, X, Y):
        """
        Compute total loss over a dataset.
        (wrapper for compute_seq_loss)

        Do not modify this function!
        """
        if not isinstance(X[0], ndarray): # single example
            return self.compute_seq_loss(X, Y)
        else: # multiple examples
            return sum([self.compute_seq_loss(xs,ys)
                        for xs,ys in itertools.izip(X, Y)])

    def compute_mean_loss(self, X, Y):
        """
        Normalize loss by total number of points.

        Do not modify this function!
        """
        J = self.compute_loss(X, Y)
        ntot = sum(map(len,Y))
        return J / float(ntot)

    def generate_sequence(self, init, end, maxlen=100):
        """
        Generate a sequence from the language model,
        by running the RNN forward and sampling,
        at each timestep, a word from the
        emitted probability distribution.

        The MultinomialSampler class (in nn.math) may be helpful
        here for sampling a word. Use as:
            y = multinomial_sample(p)
        to sample an index y from the vector of probabilities p.

        Arguments:
            init = index of start word (word_to_num['<s>'])
            end = index of end word (word_to_num['</s>'])
            maxlen = maximum length to generate

        Returns:
            ys = sequence of indices
            J = total cross-entropy loss of generated sequence
        """
        J = 0 # total loss
        ys = [init] # emitted sequence

        #### YOUR CODE HERE ####

        #### END YOUR CODE ####
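        # A hedged sketch, not the official solution: feed the last emitted word
        # back in, sample the next word with multinomial_sample, and accumulate
        # the cross-entropy loss of each sampled word.
        h = zeros(self.hdim)
        while ys[-1] != end and len(ys) < maxlen:
            h = sigmoid(self.params.H.dot(h) + self.sparams.L[ys[-1]])
            p = softmax(self.params.U.dot(h))
            y = multinomial_sample(p)
            ys.append(y)
            J -= log(p[y])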
        return ys, J


class ExtraCreditRNNLM(RNNLM):
    """
    Implements an improved RNN language model,
    for better speed and/or performance.

    We're not going to place any constraints on you
    for this part, but we do recommend that you still
    use the starter code (NNBase) framework that
    you've been using for the NER and RNNLM models.
    """

    def __init__(self, *args, **kwargs):
        #### YOUR CODE HERE ####
        raise NotImplementedError("__init__() not yet implemented.")
        #### END YOUR CODE ####

    def _acc_grads(self, xs, ys):
        #### YOUR CODE HERE ####
        raise NotImplementedError("_acc_grads() not yet implemented.")
        #### END YOUR CODE ####

    def compute_seq_loss(self, xs, ys):
        #### YOUR CODE HERE ####
        raise NotImplementedError("compute_seq_loss() not yet implemented.")
        #### END YOUR CODE ####

    def generate_sequence(self, init, end, maxlen=100):
        #### YOUR CODE HERE ####
        raise NotImplementedError("generate_sequence() not yet implemented.")
        #### END YOUR CODE ####