5_language.py
from mxnet import gluon, init, autograd, ndarray as nd
from mxnet.gluon import rnn, nn
import mxnet as mx
from intermediate.data.data_utils import Corpus
import time
# TODO: this implementation is not efficient
# Choose CPU or GPU context (set gpu = False to fall back to the CPU)
gpu = True
ctx = mx.gpu() if gpu else mx.cpu()
start = time.time()
# Hyperparameters
embed_size = 128
hidden_size = 1024
num_layers = 1
num_epochs = 5
num_samples = 1000 # number of words to be sampled
batch_size = 20
seq_length = 30
learning_rate = 0.002
# Load Penn Treebank Dataset
train_path = './data/train.txt'
sample_path = './data/sample.txt'
corpus = Corpus()
ids = corpus.get_data(train_path, batch_size)
vocab_size = len(corpus.dictionary)
num_batches = ids.shape[1] // seq_length
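# Assumed data contract (Corpus lives in this repository's data utilities and
# is not shown here): corpus.get_data() is expected to return a 2-D NDArray of
# word ids shaped (batch_size, -1), so that ids[:, i:i + seq_length] below
# yields one (batch_size, seq_length) training batch.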
# RNN-based language model
class RNNLM(gluon.Block):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(RNNLM, self).__init__()
        with self.name_scope():
            self.embed = nn.Embedding(vocab_size, embed_size, weight_initializer=init.Uniform(0.1))
            self.lstm = rnn.LSTM(hidden_size, num_layers, layout='NTC')
            self.linear = nn.Dense(vocab_size, weight_initializer=init.Uniform(0.1))

    def forward(self, x, h):
        # Embed word ids to vectors
        x = self.embed(x)
        # Forward propagate through the LSTM
        out, h = self.lstm(x, h)
        # Reshape output to (batch_size * seq_length, hidden_size)
        out = out.reshape((out.shape[0] * out.shape[1], out.shape[2]))
        # Decode hidden states of all time steps
        out = self.linear(out)
        return out, h
model = RNNLM(vocab_size, embed_size, hidden_size, num_layers)
model.initialize(ctx=ctx)
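# Shape sketch (an added note, not part of the original script): for inputs of
# shape (batch_size, seq_length) and two state arrays of shape
# (num_layers, batch_size, hidden_size), forward() returns logits of shape
# (batch_size * seq_length, vocab_size) together with the updated states, e.g.:
#   dummy_x = nd.zeros((batch_size, seq_length), ctx=ctx)
#   dummy_h = [nd.zeros((num_layers, batch_size, hidden_size), ctx=ctx)] * 2
#   logits, _ = model(dummy_x, dummy_h)   # logits: (600, vocab_size)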
# Loss and Optimizer
criterion = gluon.loss.SoftmaxCrossEntropyLoss()
optimizer = gluon.Trainer(model.collect_params(), 'adam', {'learning_rate': learning_rate})
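# Note: SoftmaxCrossEntropyLoss expects unnormalized logits plus integer class
# labels (sparse_label=True by default), and Trainer.step(batch_size) rescales
# the accumulated gradients by 1/batch_size before applying the update.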
# Training
for epoch in range(num_epochs):
    # Initial hidden and memory states
    states = [nd.zeros((num_layers, batch_size, hidden_size), ctx=ctx),
              nd.zeros((num_layers, batch_size, hidden_size), ctx=ctx)]
    for i in range(0, ids.shape[1] - seq_length, seq_length):
        # Get batch inputs and targets (targets are the inputs shifted by one word)
        inputs = ids[:, i:i + seq_length].as_in_context(ctx)
        targets = ids[:, (i + 1):(i + 1) + seq_length].as_in_context(ctx)
        # Detach the states so backpropagation is truncated at the batch boundary
        states = [state.detach() for state in states]
        with autograd.record():
            outputs, states = model(inputs, states)
            loss = criterion(outputs, targets.reshape((targets.size,)))
        loss.backward()
        # Clip the global gradient norm to stabilize training
        grads = [p.grad(ctx) for p in model.collect_params().values()]
        gluon.utils.clip_global_norm(grads, 0.5 * batch_size)
        optimizer.step(batch_size)

        step = (i + 1) // seq_length
        if step % 100 == 0:
            # Perplexity is the exponential of the mean per-token cross-entropy
            print('Epoch [%d/%d], Step[%d/%d], Loss: %.3f, Perplexity: %5.2f' %
                  (epoch + 1, num_epochs, step, num_batches,
                   loss.mean().asscalar(), loss.mean().exp().asscalar()))
# Sampling
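# Sampling strategy: start from a uniformly random word id, then repeatedly turn
# the logits for the current word into a categorical distribution (softmax) and
# draw the next word id from it, writing the decoded words to sample_path.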
with open(sample_path, 'w') as f:
    # Set initial hidden and memory states
    states = [nd.zeros((num_layers, 1, hidden_size), ctx=ctx),
              nd.zeros((num_layers, 1, hidden_size), ctx=ctx)]
    # Select one word id uniformly at random as the starting word
    prob = nd.ones((vocab_size,)) / vocab_size
    input1 = nd.sample_multinomial(prob).reshape((1, 1)).as_in_context(ctx)
    for i in range(num_samples):
        output, states = model(input1, states)
        # Sample the next word id from the softmax of the output logits
        prob = nd.exp(output).reshape((-1,))
        prob = prob / prob.sum()
        word_id = nd.sample_multinomial(prob, 1)
        input1 = word_id.reshape((1, 1))
        word = corpus.dictionary.idx2word[word_id.asscalar()]
        word = '\n' if word == '<eos>' else word + ' '
        f.write(word)
        if (i + 1) % 100 == 0:
            print('Sampled [%d/%d] words and saved to %s' % (i + 1, num_samples, sample_path))
print("total time:", time.time() - start)