forked from litian96/FedProx
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlanguage_utils.py
executable file
·118 lines (87 loc) · 2.93 KB
/
language_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
"""Utils for language models."""
import re
# ------------------------
# utils for shakespeare dataset
# Character vocabulary: a character's position in this string is the integer
# index used for it by letter_to_vec and word_to_indices below.
ALL_LETTERS = "\n !\"&'(),-.0123456789:;>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]abcdefghijklmnopqrstuvwxyz}"
# Number of characters in the vocabulary; also the one-hot vector length.
NUM_LETTERS = len(ALL_LETTERS)
def _one_hot(index, size):
'''returns one-hot vector with given size and value 1 at given index
'''
vec = [0 for _ in range(size)]
vec[int(index)] = 1
return vec
def letter_to_vec(letter):
    """Return the one-hot encoding of ``letter`` over ``ALL_LETTERS``.

    Args:
        letter: string to look up in ``ALL_LETTERS``
    Return:
        int list of length ``NUM_LETTERS`` holding a single 1

    Note: ``str.find`` yields -1 when ``letter`` is absent, which puts the 1
    in the last slot of the vector.
    """
    position = ALL_LETTERS.find(letter)
    encoded = [0] * NUM_LETTERS
    encoded[position] = 1
    return encoded
def word_to_indices(word):
    """Map every character of ``word`` to its position in ``ALL_LETTERS``.

    Args:
        word: string
    Return:
        int list with length len(word); a character that is not in
        ``ALL_LETTERS`` is represented by -1 (the ``str.find`` miss value)
    """
    return [ALL_LETTERS.find(char) for char in word]
# ------------------------
# utils for sent140 dataset
def split_line(line):
    """Tokenize a phrase into words and punctuation marks.

    A token is either a run of word characters/apostrophes or one of the
    single punctuation characters ``. , ! ? ;``; anything else is dropped.

    Args:
        line: string representing the phrase to be split
    Return:
        list of strings, one per token, in order of appearance
    """
    tokenizer = re.compile(r"[\w']+|[.,!?;]")
    return tokenizer.findall(line)
def _word_to_index(word, indd):
'''returns index of given word based on given lookup dictionary
returns the length of the lookup dictionary if word not found
Args:
word: string
indd: dictionary with string words as keys and int indices as values
'''
if word in indd:
return indd[word]
else:
return len(indd)
def line_to_indices(line, indd, max_words=25):
    """Convert a phrase into a fixed-length list of word indices.

    The phrase is tokenized with ``split_line``. If it has more than
    ``max_words`` tokens only the first ``max_words`` are kept; if it has
    fewer, the list is padded with the unknown-word index ``len(indd)`` until
    its length is ``max_words``.

    Args:
        line: string representing phrase/sequence of words
        indd: dictionary with string words as keys and int indices as values
        max_words: number of word indices in the returned list; values
            below 1 yield an empty list
    Return:
        indl: list of exactly max(max_words, 0) word indices
    """
    # Clamp so that a zero or negative max_words gives an empty list rather
    # than an untruncated one (or a slice that counts from the end).
    limit = max(max_words, 0)
    unknown_index = len(indd)

    indl = [_word_to_index(word, indd) for word in split_line(line)[:limit]]
    # Pad short phrases with the same index used for out-of-vocabulary words.
    indl.extend([unknown_index] * (limit - len(indl)))
    return indl
def bag_of_words(line, vocab):
    """Count how often each vocabulary word occurs in a phrase.

    Args:
        line: string representing phrase to be parsed
        vocab: dictionary with words as keys and indices as values
    Return:
        integer list of length len(vocab); position ``vocab[w]`` holds the
        number of times word ``w`` appears in ``line``. Tokens that are not
        in ``vocab`` are ignored.
    """
    counts = [0] * len(vocab)
    known_slots = (vocab[token] for token in split_line(line) if token in vocab)
    for slot in known_slots:
        counts[slot] += 1
    return counts