#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
import os
import random
import urllib.request

import numpy as np

download_dir = "/tmp/"


def check_exist_or_download(url):
    ''' download the file at url into download_dir unless already cached '''
    name = url.rsplit('/', 1)[-1]
    filename = os.path.join(download_dir, name)
    if not os.path.isfile(filename):
        print("Downloading %s" % url)
        urllib.request.urlretrieve(url, filename)
    return filename
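
# Note: the downloads cached here can be large; the GoogleNews word2vec
# archive referenced below is on the order of 1.5 GB compressed.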


def unzip_data(download_dir, data_zip):
    ''' extract the insuranceQA archive into download_dir, return data dir '''
    data_dir = download_dir + "insuranceQA-master/V2/"
    if not os.path.exists(data_dir):
        print("extracting %s to %s" % (data_zip, download_dir))
        from zipfile import ZipFile
        with ZipFile(data_zip, 'r') as zipObj:
            zipObj.extractall(download_dir)
    return data_dir


def get_label2answer(data_dir):
    ''' build a dict mapping answer label -> list of word indices '''
    import gzip
    label2answer = dict()
    with gzip.open(data_dir +
                   "/InsuranceQA.label2answer.token.encoded.gz") as fin:
        for line in fin:
            pair = line.decode().strip().split("\t")
            idxs = pair[1].split(" ")
            idxs = [int(idx.replace("idx_", "")) for idx in idxs]
            label2answer[int(pair[0])] = idxs
    return label2answer
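
# Each line of the label2answer file is tab separated: an answer label,
# then the space separated, "idx_"-prefixed word indices of that answer
# sentence. A hypothetical example line, following the parsing above:
#   "7\tidx_12 idx_5 idx_90"  ->  label2answer[7] == [12, 5, 90]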

# padding token: index 0 maps to the string "<pad>" and an all-zero
# 300-d embedding (the GoogleNews word2vec dimensionality)
pad_idx = 0
pad_string = "<pad>"
pad_embed = np.zeros((300,))

insuranceqa_train_filename = "/InsuranceQA.question.anslabel.token.100.pool.solr.train.encoded.gz"
insuranceqa_test_filename = "/InsuranceQA.question.anslabel.token.100.pool.solr.test.encoded.gz"
insuranceQA_url = "https://github.com/shuzi/insuranceQA/archive/master.zip"
insuranceQA_cache_fp = download_dir + "insuranceQA_cache.pickle"
google_news_pretrain_embeddings_link = "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"


def get_idx2word(data_dir):
    ''' build a dict mapping word index -> word from the vocabulary file '''
    idx2word = dict()
    with open(data_dir + "vocabulary", encoding="utf-8") as vc_f:
        for line in vc_f:
            pair = line.strip().split("\t")
            idx = int(pair[0].replace("idx_", ""))
            idx2word[idx] = pair[1]
    # add padding string to idx2word lookup
    idx2word[pad_idx] = pad_string
    return idx2word


def get_train_raw(data_dir, data_filename):
    ''' deserialize a training (or test) data file
    args:
        data_dir: directory containing the data file
        data_filename: name of the gzipped data file
    return:
        train_raw: list of QnA triples, one per sample. Each triple has
            3 fields:
            0: the question sentence as word indices; use idx2word to
               decode and idx2vec to get embeddings.
            1: the answer labels, one per ground-truth answer sentence;
               use label2answer to decode.
            2: the top-K candidate answer labels, used as negative
               answers during training.
    '''
    train_raw = []
    import gzip
    with gzip.open(data_dir + data_filename) as fin:
        for line in fin:
            tpl = line.decode().strip().split("\t")
            question = [
                int(idx.replace("idx_", "")) for idx in tpl[1].split(" ")
            ]
            ans = [int(label) for label in tpl[2].split(" ")]
            candis = [int(label) for label in tpl[3].split(" ")]
            train_raw.append((question, ans, candis))
    return train_raw
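
# Expected raw line layout, as implied by the parsing above (the values
# here are hypothetical): field 0 is unused, field 1 is the encoded
# question, field 2 the ground-truth answer labels, field 3 the
# candidate pool:
#   "<unused>\tidx_12 idx_7\t3 9\t3 9 41 77"
#   -> (question=[12, 7], ans=[3, 9], candis=[3, 9, 41, 77])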


def limit_encode_train(train_raw, label2answer, idx2word, q_seq_limit,
                       ans_seq_limit, idx2vec):
    ''' prepare training data as embedded word-vector sequences, cropped or
    padded to the given sequence limits
    return:
        questions_encoded: np ndarray, shape
            (number of samples, seq length, vector size)
        poss_encoded: same layout, sequences for positive answers
        negs_encoded: same layout, sequences for negative answers
    '''
    questions = [question for question, answers, candis in train_raw]
    # choose 1 answer from the answer pool
    poss = [
        label2answer[random.choice(answers)]
        for question, answers, candis in train_raw
    ]
    # choose 1 candidate from the candidate pool
    negs = [
        label2answer[random.choice(candis)]
        for question, answers, candis in train_raw
    ]
    # filter out words not in idx2vec
    questions_filtered = [
        [idx for idx in q if idx in idx2vec] for q in questions
    ]
    poss_filtered = [[idx for idx in ans if idx in idx2vec] for ans in poss]
    negs_filtered = [[idx for idx in ans if idx in idx2vec] for ans in negs]
    # crop to the sequence limit and pad short sequences with pad_idx
    questions_crop = [
        q[:q_seq_limit] + [pad_idx] * max(0, q_seq_limit - len(q))
        for q in questions_filtered
    ]
    poss_crop = [
        ans[:ans_seq_limit] + [pad_idx] * max(0, ans_seq_limit - len(ans))
        for ans in poss_filtered
    ]
    negs_crop = [
        ans[:ans_seq_limit] + [pad_idx] * max(0, ans_seq_limit - len(ans))
        for ans in negs_filtered
    ]
    # encode: word idx to word vector
    questions_encoded = [[idx2vec[idx] for idx in q] for q in questions_crop]
    poss_encoded = [[idx2vec[idx] for idx in ans] for ans in poss_crop]
    negs_encoded = [[idx2vec[idx] for idx in ans] for ans in negs_crop]
    # stack into ndarrays
    questions_encoded = np.array(questions_encoded).astype(np.float32)
    poss_encoded = np.array(poss_encoded).astype(np.float32)
    negs_encoded = np.array(negs_encoded).astype(np.float32)
    return questions_encoded, poss_encoded, negs_encoded
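
# Shape sketch with hypothetical sizes: for 1000 training triples,
# q_seq_limit=20, ans_seq_limit=50 and 300-d embeddings, this returns
#   questions_encoded: (1000, 20, 300)
#   poss_encoded:      (1000, 50, 300)
#   negs_encoded:      (1000, 50, 300)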


def get_idx2vec_weights(wv, idx2word):
    ''' build a dict mapping word index -> pretrained word vector '''
    idx2vec = {k: wv[v] for k, v in idx2word.items() if v in wv}
    # add padding embedding (all zeros) to idx2vec lookup
    idx2vec[pad_idx] = pad_embed
    return idx2vec


def prepare_data(use_cache=True):
    ''' load the insuranceQA dataset and pretrained embeddings, caching the
    preprocessed result as a pickle under download_dir '''
    import pickle
    if not os.path.isfile(insuranceQA_cache_fp) or not use_cache:
        # no cache found, preprocess the data from scratch
        print("prepare data from scratch")
        # get pretrained word vectors
        from gensim.models.keyedvectors import KeyedVectors
        google_news_pretrain_fp = check_exist_or_download(
            google_news_pretrain_embeddings_link)
        wv = KeyedVectors.load_word2vec_format(google_news_pretrain_fp,
                                               binary=True)
        # prepare the insurance QA dataset
        data_zip = check_exist_or_download(insuranceQA_url)
        data_dir = unzip_data(download_dir, data_zip)
        label2answer = get_label2answer(data_dir)
        idx2word = get_idx2word(data_dir)
        idx2vec = get_idx2vec_weights(wv, idx2word)
        train_raw = get_train_raw(data_dir, insuranceqa_train_filename)
        test_raw = get_train_raw(data_dir, insuranceqa_test_filename)
        with open(insuranceQA_cache_fp, 'wb') as handle:
            pickle.dump((train_raw, test_raw, label2answer, idx2word, idx2vec),
                        handle,
                        protocol=pickle.HIGHEST_PROTOCOL)
    else:
        # load from the cached pickle
        with open(insuranceQA_cache_fp, 'rb') as handle:
            (train_raw, test_raw, label2answer, idx2word,
             idx2vec) = pickle.load(handle)
    return train_raw, test_raw, label2answer, idx2word, idx2vec


def limit_encode_eval(train_raw,
                      label2answer,
                      idx2word,
                      q_seq_limit,
                      ans_seq_limit,
                      idx2vec,
                      top_k_candi_limit=6):
    ''' prepare evaluation data as embedded word-vector sequences, cropped or
    padded to the given sequence limits
    return:
        questions_encoded: np ndarray, shape
            (number of samples, seq length, vector size)
        candi_pools_encoded: np ndarray, candidate answer pool per question,
            shape (number of samples, pool size, seq length, vector size)
        ans_count: list, number of ground-truth answers per question
    '''
    questions = [question for question, answers, candis in train_raw]
    # combine ground-truth and candidate answer labels
    candi_pools = [
        list(answers + candis)[:top_k_candi_limit]
        for question, answers, candis in train_raw
    ]
    assert all([len(pool) == top_k_candi_limit for pool in candi_pools])
    ans_count = [len(answers) for question, answers, candis in train_raw]
    assert all([c > 0 for c in ans_count])
    # decode answer labels into word-index sequences
    candi_pools_encoded = [[label2answer[candi_label]
                            for candi_label in pool]
                           for pool in candi_pools]
    # filter out words not in idx2vec
    questions_filtered = [
        [idx for idx in q if idx in idx2vec] for q in questions
    ]
    candi_pools_filtered = [[[idx
                              for idx in candi_encoded
                              if idx in idx2vec]
                             for candi_encoded in pool]
                            for pool in candi_pools_encoded]
    # crop to the sequence limit and pad short sequences with pad_idx
    questions_crop = [
        q[:q_seq_limit] + [pad_idx] * max(0, q_seq_limit - len(q))
        for q in questions_filtered
    ]
    candi_pools_crop = [[
        candi[:ans_seq_limit] + [pad_idx] * max(0, ans_seq_limit - len(candi))
        for candi in pool
    ]
                        for pool in candi_pools_filtered]
    # encode: word idx to word vector
    questions_encoded = [[idx2vec[idx] for idx in q] for q in questions_crop]
    candi_pools_encoded = [[[idx2vec[idx]
                             for idx in candi]
                            for candi in pool]
                           for pool in candi_pools_crop]
    questions_encoded = np.array(questions_encoded).astype(np.float32)
    candi_pools_encoded = np.array(candi_pools_encoded).astype(np.float32)
    # candi_pools_encoded shape:
    #   (number of QnA samples,
    #    number of candidates in the pool,
    #    number of word indices per candidate,
    #    300-d word embedding per word idx)
    # e.g. 10 QnA samples to test,
    #      5 candidate answers per question,
    #      8 words per answer,
    #      300-d vector per word
    return questions_encoded, candi_pools_encoded, ans_count
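

# Minimal usage sketch (not part of the original pipeline); the sequence
# limits below are illustrative values, not tuned ones.
if __name__ == "__main__":
    train_raw, test_raw, label2answer, idx2word, idx2vec = prepare_data()
    q, pos, neg = limit_encode_train(train_raw,
                                     label2answer,
                                     idx2word,
                                     q_seq_limit=20,
                                     ans_seq_limit=50,
                                     idx2vec=idx2vec)
    print("train shapes:", q.shape, pos.shape, neg.shape)
    q_eval, pools, ans_count = limit_encode_eval(test_raw,
                                                 label2answer,
                                                 idx2word,
                                                 q_seq_limit=20,
                                                 ans_seq_limit=50,
                                                 idx2vec=idx2vec)
    print("eval shapes:", q_eval.shape, pools.shape, len(ans_count))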