-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathw2v_try.py
85 lines (68 loc) · 2.53 KB
/
w2v_try.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from keras.layers import Input, Activation
from keras.layers.merge import Multiply, Dot
from keras.layers.core import Lambda, Reshape, Flatten
from keras.models import Model
from keras.layers.embeddings import Embedding
import utils
# load data
# - sentences: list of (list of word-id)
# - index2word: list of string
sentences, index2word = utils.load_sentences_brown(100)
# params
nb_epoch = 3
# learn `batch_size words` at a time
batch_size = 128
vec_dim = 64
negsampling_num = 0.2
# half of window
window_size = 8
vocab_size = len(index2word)
# create input
couples, labels = utils.skip_grams(sentences, window_size, vocab_size, negsampling_num)
print 'shape of couples: ', couples.shape
print 'shape of labels: ', labels.shape
# metrics
nb_batch = len(labels) // batch_size
samples_per_epoch = batch_size * nb_batch
# graph definition (pvt: center of window, ctx: context)
input_pvt = Input(batch_shape=(batch_size, 1), dtype='int32')
input_ctx = Input(batch_shape=(batch_size, 1), dtype='int32')
embedded_pvt = Embedding(input_dim=vocab_size,
output_dim=vec_dim,
input_length=1)(input_pvt)
embedded_pvt = Reshape((vec_dim, 1))(embedded_pvt)
embedded_ctx = Embedding(input_dim=vocab_size,
output_dim=vec_dim,
input_length=1)(input_ctx)
'''
# merged = merge(inputs=[embedded_pvt, embedded_ctx],
# mode=lambda x: (x[0] * x[1]).sum(-1),
# output_shape=(batch_size, 1))
#embedded_pvt = tf.Session().run(embedded_pvt)
#embedded_ctx = tf.Session().run(embedded_ctx)
#merged = Lambda(lambda x: (x[0] * x[1]).sum(-1),
# output_shape=(batch_size, 1))([embedded_pvt, embedded_ctx])
#merged = tf.reduce_sum(merged, -1)
'''
merged = Dot((1,2))([embedded_pvt, embedded_ctx])
print merged
flatten = Flatten()(merged)
print flatten
raw_input()
predictions = Activation('sigmoid')(flatten)
# build and train the model
model = Model(input=[input_pvt, input_ctx], output=predictions)
model.compile(optimizer='rmsprop', loss='mse', metrics=['accuracy'])
model.fit_generator(generator=utils.batch_generator(couples, labels, batch_size, nb_batch),
steps_per_epoch=samples_per_epoch,
epochs=nb_epoch,
workers=1)
# save weights
utils.save_weights(model, index2word, vec_dim)
# eval using gensim
print 'the....'
utils.most_similar(positive=['the'])
print 'all....'
utils.most_similar(positive=['all'])
print 'she - he + him....'
utils.most_similar(positive=['she', 'him'], negative=['he'])