-
Notifications
You must be signed in to change notification settings - Fork 0
/
text_helper.py
101 lines (85 loc) · 3.05 KB
/
text_helper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# Text Helper Functions
# ---------------------------------------
#
# We pull out text helper functions to reduce redundant code
import string
import os
import collections
from tqdm import tqdm
import glove_loader
# Normalize text
def normalize_text(text):
# Lower case
text = text.lower()
# Remove punctuation
text = ''.join(c if c not in string.punctuation else ' '+c for c in text)
# Trim extra whitespace
text = ' '.join(text.split())
return text
# Build dictionary of words
def build_dictionary(sentences, vocabulary_size=60000):
    """Build a word -> index vocabulary from tokenized sentences.

    Args:
        sentences: iterable of tokenized sentences (lists of word strings).
        vocabulary_size: maximum number of entries in the returned dict,
            including the six reserved special tokens and '[UNK]'.

    Returns:
        Dict mapping word -> integer index. Indices 0-5 are reserved for
        '[PAD]', '[BEGIN]', '[EOS]', '[CLS]', '[SEP]', '[MASK]'; '[UNK]'
        gets index 6; remaining words are added in descending frequency.
    """
    # Flatten the list of tokenized sentences into one word list
    words = [word for sentence in sentences for word in sentence]
    # Seed with the unknown token (count -1 is a placeholder, never used)
    count = [['[UNK]', -1]]
    # Only the vocabulary_size most frequent words can ever be used, so
    # don't sort the entire tail.
    count.extend(collections.Counter(words).most_common(vocabulary_size))
    # Reserved special tokens occupy the first indices
    word_dict = {'[PAD]': 0, '[BEGIN]': 1, '[EOS]': 2, '[CLS]': 3, '[SEP]': 4, '[MASK]': 5}
    for word, word_count in tqdm(count):
        # Skip words already present (e.g. the corpus itself contains
        # '[UNK]' or a special token). The original re-assignment here
        # gave such a word the same index as the next new word.
        if word not in word_dict:
            word_dict[word] = len(word_dict)
        if len(word_dict) >= vocabulary_size:
            break
    return word_dict
def load_glove():
    """Return the pre-trained GloVe model via the project's loader module."""
    return glove_loader.load_glove()
# Turn text data into lists of integers from dictionary
# Turn text data into lists of integers from dictionary
def text_to_numbers(sentences, word_dict, glove=False):
    """Convert tokenized sentences to lists of integer word indices.

    Args:
        sentences: iterable of tokenized sentences (lists of word strings).
        word_dict: plain word -> index dict, or (when glove=True) an
            object exposing a `.vocab` mapping whose entries carry an
            `.index` attribute.
        glove: if True, resolve indices through the GloVe model's vocab;
            out-of-vocabulary words map to its 'unk' entry instead of
            '[UNK]'.

    Returns:
        List of lists of integer indices, one inner list per sentence.
    """
    data = []
    if glove:
        # NOTE(review): assumes a gensim KeyedVectors-style object where
        # vocab entries have an `.index` attribute — confirm against the
        # gensim version in use (this API was removed in gensim 4.x).
        word_dict = word_dict.vocab
    # Original bound an unused index via enumerate(); iterate directly so
    # tqdm can also report a total when sentences is sized.
    for sentence in tqdm(sentences):
        sentence_data = []
        # For each word, either use its index or the unknown-word index
        for word in sentence:
            if glove:
                if word in word_dict:
                    word_ix = word_dict[word].index
                else:
                    word_ix = word_dict['unk'].index
            else:
                if word in word_dict:
                    word_ix = word_dict[word]
                else:
                    word_ix = word_dict['[UNK]']
            sentence_data.append(word_ix)
        data.append(sentence_data)
    return data
# Turn text data into lists of integers from dictionary
# Turn lists of integers from the dictionary back into text
def numbers_to_text(sentences, word_dict, glove=False):
    """Inverse of text_to_numbers: map index sequences back to words.

    Args:
        sentences: iterable of sequences of integer word indices.
        word_dict: plain word -> index dict, or (when glove=True) an
            object exposing an `index2word` lookup.
        glove: if True, decode via the GloVe model's `index2word`;
            unknown indices decode to 'unk' instead of '[UNK]'.

    Returns:
        List of lists of word strings, one inner list per sentence.
    """
    if glove:
        # NOTE(review): if index2word is a list, `ix in lookup` tests
        # membership among the *words*, not valid indices — verify this
        # branch against the gensim object actually passed in.
        lookup = word_dict.index2word
        unknown_token = 'unk'
    else:
        # Invert word -> index into index -> word
        lookup = {index: token for token, index in word_dict.items()}
        unknown_token = '[UNK]'
    decoded = []
    for index_seq in sentences:
        tokens = [lookup[ix] if ix in lookup else unknown_token
                  for ix in index_seq]
        decoded.append(tokens)
    return decoded