# generate_train.py
import streamlit as st
import pandas as pd
import numpy as np
# Word Embeddings
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
import builtins
# Text
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile
# Storing as objects via serialization
from tempfile import mkdtemp
import pickle
import joblib
# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="ticks", color_codes=True)
# Directory
import os
import yaml
import collections
import math
## LOADING OBJECTS
processed_inbound = pd.read_pickle("../objects/processed_inbound_extra.pkl")
processed = pd.read_pickle("../objects/processed.pkl")
# Reading back in intents
with open(r"../objects/intents.yml") as file:
    intents = yaml.load(file, Loader=yaml.FullLoader)
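# NOTE (assumption): these pickles are produced by the earlier preprocessing steps of this
# project - `processed` is expected to hold a "Processed Inbound" column of tokenized Tweets,
# and `processed_inbound` a Series of processed Tweet strings.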
def main():
    st.title("Training Data Generator Tool")

    """Making my idealized dataset - generating N Tweets similar to each artificial Tweet below.
    These will then be concatenated to the current inbound data so they can be included in the
    doc2vec training.
    """

    # Version 2 - I realized that keywords might get the job done, and it's less risky to
    # add more words for the association power because it's doc2vec
    ideal = {
        "battery": "battery power",
        "forgot_password": "password account login",
        "payment": "credit card payment pay",
        "update": "update upgrade",
        "info": "info information",
        # "lost_replace": "replace lost gone missing trade",
        "location": "nearest apple location store",
    }
    def add_extra(current_tokenized_data, extra_tweets):
        """Adding extra Tweets to the current tokenized data"""
        # Storing these extra Tweets in a Series to concatenate to the inbound data
        extra_tweets = pd.Series(extra_tweets)
        # Making string form
        print("Converting to string...")
        string_processed_data = current_tokenized_data.apply(" ".join)
        # Adding it to the data, updating processed_inbound
        string_processed_data = pd.concat(
            [string_processed_data, extra_tweets], axis=0
        )
        # We want a tokenized version
        tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
        # print('Tokenizing...')
        # string_processed_data.apply(tknzr.tokenize)
        return string_processed_data

    # Getting the lengthened data
    processed_inbound_extra = add_extra(
        processed["Processed Inbound"], list(ideal.values())
    )
    # Saving the updated processed inbound into a serialized file
    processed_inbound_extra.to_pickle("../objects/processed_inbound_extra.pkl")

    st.subheader("Processed Inbound Extra")
    st.dataframe(processed_inbound_extra)
    st.text(
        "As you can see, I appended the documents I want to find similar Tweets for to this "
        "dataframe. This has to happen before the data is doc2vec vectorized, because the "
        "doc2vec similarity function can only find similarity among Tweets that already "
        "exist in the vectorized data."
    )
    @st.cache
    def train_doc2vec(string_data, max_epochs, vec_size, alpha):
        # Tagging each document with an ID; the most memory-efficient tag is just its index
        tagged_data = [
            TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)])
            for i, _d in enumerate(string_data)
        ]
        # Instantiating my model
        model = Doc2Vec(
            size=vec_size, alpha=alpha, min_alpha=0.00025, min_count=1, dm=1
        )
        model.build_vocab(tagged_data)
        for epoch in range(max_epochs):
            print("iteration {0}".format(epoch))
            model.train(
                tagged_data, total_examples=model.corpus_count, epochs=model.iter
            )
            # Decrease the learning rate
            model.alpha -= 0.0002
            # Fix the learning rate, no decay
            model.min_alpha = model.alpha
        # Saving the model
        model.save("../models/d2v.model")
        print("Model Saved")
if st.button("Train doc2vec"):
train_doc2vec(processed_inbound_extra, max_epochs=100, vec_size=20, alpha=0.025)
# Loading in my model
model = Doc2Vec.load("../models/d2v.model")
# Storing my data into a list - this is the data I will cluster
inbound_d2v = np.array(
[model.docvecs[i] for i in range(processed_inbound_extra.shape[0])]
)
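    # This integer lookup works because each document was tagged with its (stringified) row
    # index in train_doc2vec, so document i can be retrieved by position/tag i.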
if st.button("Save vectorized doc2vec"):
# Saving
path = "../objects/inbound_d2v.pkl"
with open(path, "wb") as f:
pickle.dump(inbound_d2v, f)
st.text(f"Saved to {path}")
st.subheader("Doc2Vec vectorized data")
st.dataframe(inbound_d2v)
st.text(f"Shape: {inbound_d2v.shape}")
"""
Finding tags of ideal Tweets
"""
# Version 2
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
## Just tokenizing all the values of ideal' values to be able to be fed in to matching function
# intents_repr = dict(zip(ideal.keys(), [tknzr.tokenize(v) for v in ideal.values()]))
# Pythonic way
intents_repr = {k: tknzr.tokenize(v) for k, v in ideal.items()}
print(intents_repr)
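    # e.g. intents_repr == {"battery": ["battery", "power"],
    #                       "forgot_password": ["password", "account", "login"], ...}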
    # Saving intents_repr into YAML
    with open("../objects/intents_repr.yml", "w") as outfile:
        yaml.dump(intents_repr, outfile, default_flow_style=False)

    # Tags for my dictionary
    tags = []
    tokenized_processed_inbound = processed_inbound.apply(tknzr.tokenize)

    # Find the index locations of specific Tweets
    def report_index_loc(tweet, intent_name):
        """Takes the Tweet to find the index for and returns a report of that Tweet's index
        along with what the representative Tweet looks like"""
        try:
            tweets = []
            for i, j in enumerate(tokenized_processed_inbound):
                if j == tweet:
                    tweets.append((i, True))
                else:
                    tweets.append((i, False))
            index = []
            for i in tweets:
                if i[1]:
                    index.append(i[0])
            preview = processed_inbound.iloc[index]
            # Appending to the tags list for the dictionary below
            tags.append(str(index[0]))
        except IndexError:
            print("Index not in list, move on")
            return
        return intent_name, str(index[0]), preview

    # Reporting and storing indexes with the function
    st.text("TAGGED INDEXES TO LOOK FOR")
    for j, i in intents_repr.items():
        try:
            st.text("\n{} \nIndex: {}\nPreview: {}".format(*report_index_loc(i, j)))
        except Exception:
            st.text("Index ended")

    # Pythonic way of making a new dictionary from 2 lists
    intents_tags = dict(zip(intents_repr.keys(), tags))
    st.header("Intents Tags Dictionary")
    st.write(intents_tags)
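    # intents_tags maps each intent name to the doc2vec tag (the row index, as a string) of
    # its ideal Tweet, e.g. {"battery": "76000", ...} - the actual indices depend on the data.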
"""
ACTUALLY GENERATING MY TRAINING DATA
"""
## Getting top n tweets similar to the 0th Tweet
# This will return the a list of tuples (i,j) where i is the index and j is
# the cosine similarity to the tagged document index
# Storing all intents in this dataframe
train = pd.DataFrame()
# intent_indexes = {}
# 1. Adding intent content based on similarity
def generate_intent(target, itag):
similar_doc = model.docvecs.most_similar(itag, topn=target)
# Getting just the indexes
indexes = [int(i[0]) for i in similar_doc]
# intent_indexes[intent_name] = indexes
# Actually seeing the top 1000 Tweets similar to the 0th Tweet which seems to be about updates
# Adding just the values, not the index
# Tokenizing the output
return [
word_tokenize(tweet)
for tweet in list(processed_inbound.iloc[indexes].values)
]
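    # e.g. generate_intent(1000, intents_tags["battery"]) returns the 1000 tokenized inbound
    # Tweets whose doc2vec vectors are closest to the "battery" ideal Tweet.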
    # Updating train data
    for intent_name, itag in intents_tags.items():
        train[intent_name] = generate_intent(1000, itag)

    # 2. Manually added intents
    # These are the remainder intents
    manually_added_intents = {
        "speak_representative": [
            ["talk", "human", "please"],
            ["let", "me", "talk", "to", "apple", "support"],
            ["can", "i", "speak", "agent", "person"],
        ],
        "greeting": [
            ["hi"],
            ["hello"],
            ["whats", "up"],
            ["good", "morning"],
            ["good", "evening"],
            ["good", "night"],
        ],
        "goodbye": [["goodbye"], ["bye"], ["thank"], ["thanks"], ["done"]],
        "challenge_robot": [
            ["robot", "human"],
            ["are", "you", "robot"],
            ["who", "are", "you"],
        ],
    }

    # Inserting manually added intents into the data
    def insert_manually(target, prototype):
        """Repeating a prototype list of tokenized documents until the target length is reached"""
        factor = math.ceil(target / len(prototype))
        print(factor)
        content = prototype * factor
        return [content[i] for i in range(target)]
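    # e.g. insert_manually(5, [["hi"], ["hello"]]) -> [["hi"], ["hello"], ["hi"], ["hello"], ["hi"]]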
    # Updating training data
    for intent_name in manually_added_intents.keys():
        train[intent_name] = insert_manually(
            1000, [*manually_added_intents[intent_name]]
        )

    # 3. Adding in the hybrid intents
    hybrid_intents = {
        "update": (
            300,
            700,
            [
                ["want", "update"],
                ["update", "not", "working"],
                ["phone", "need", "update"],
            ],
            intents_tags["update"],
        ),
        "info": (
            800,
            200,
            [
                ["need", "information"],
                ["want", "to", "know", "about"],
                ["what", "are", "macbook", "stats"],
                ["any", "info", "next", "release", "?"],
            ],
            intents_tags["info"],
        ),
        "payment": (
            300,
            700,
            [
                ["payment", "not", "through"],
                ["iphone", "apple", "pay", "but", "not", "arrive"],
                ["how", "pay", "for", "this"],
                ["can", "i", "pay", "for", "this", "first"],
            ],
            intents_tags["payment"],
        ),
        "forgot_password": (
            600,
            400,
            [
                ["forgot", "my", "pass"],
                ["forgot", "my", "login", "details"],
                ["cannot", "log", "in", "password"],
                ["lost", "account", "recover", "password"],
            ],
            intents_tags["forgot_password"],
        ),
    }

    def insert_hybrid(manual_target, generated_target, prototype, itag):
        return insert_manually(manual_target, prototype) + list(
            generate_intent(generated_target, itag)
        )
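    # For each hybrid intent, the first number of utterances is cycled from the hand-written
    # prototypes and the rest are generated by doc2vec similarity,
    # e.g. "update" = 300 manual + 700 generated.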
    # Updating training data
    for intent_name, args in hybrid_intents.items():
        train[intent_name] = insert_hybrid(*args)

    # 4. Converting the wide dataframe to the long format that my NN model reads in the
    # next notebook - and wrangling
    neat_train = (
        pd.DataFrame(train.T.unstack())
        .reset_index()
        .iloc[:, 1:]
        .rename(columns={"level_1": "Intent", 0: "Utterance"})
    )
    # Reordering
    neat_train = neat_train[["Utterance", "Intent"]]
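    # neat_train now has one row per utterance: an "Utterance" column of token lists and an
    # "Intent" label column, with 1000 rows per intent.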
    # 5. Saving this raw training data into a serialized file
    neat_train.to_pickle("../objects/train.pkl")

    # Styling display
    show = (
        lambda x: x.style.set_properties(
            **{
                "background-color": "black",
                "color": "lawngreen",
                "border-color": "white",
            }
        )
        .applymap(lambda x: f"color: {'lawngreen' if isinstance(x, str) else 'red'}")
        .background_gradient(cmap="Blues")
    )

    st.header("Training data - Comparing different intents view")
    st.dataframe(show(train))
    st.header("Training data in format to feed into models")
    st.dataframe(show(neat_train))
"""
INTENT EVALUATION
"""
st.subheader("Looking at top words at each intent")
# Storing word rank table dataframes in this dict
wordranks = {}
# For visualizing top 10
def top10_bagofwords(data, output_name, title):
""" Taking as input the data and plots the top 10 words based on counts in this text data"""
bagofwords = CountVectorizer()
# Output will be a sparse matrix
inbound = bagofwords.fit_transform(data)
# Inspecting of often contractions and colloquial language is used
word_counts = np.array(np.sum(inbound, axis=0)).reshape((-1,))
words = np.array(bagofwords.get_feature_names())
words_df = pd.DataFrame({"word": words, "count": word_counts})
words_rank = words_df.sort_values(by="count", ascending=False)
wordranks[output_name] = words_rank
# words_rank.to_csv('words_rank.csv') # Storing it in a csv so I can inspect and go through it myself
# Visualizing top 10 words
plt.figure(figsize=(12, 6))
sns.barplot(
words_rank["word"][:10],
words_rank["count"][:10].astype(str),
palette="inferno",
)
plt.title(title)
# Saving
# plt.savefig(f'visualizations/next_ver/{output_name}.png')
st.pyplot()
# Doing my bucket evaluations here - seeing what each distinct bucket intent means
for i in train.columns:
top10_bagofwords(
train[i].apply(" ".join), f"bucket_eval/{i}", f"Top 10 Words in {i} Intent",
)
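    # NOTE (version assumption): `get_feature_names()` was removed in newer scikit-learn
    # (use `get_feature_names_out()`), and newer seaborn/Streamlit expect keyword x=/y=
    # arguments for barplot and an explicit figure for st.pyplot(); this block assumes the
    # older APIs the rest of the script was written against.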
if __name__ == "__main__":
main()