# build_vocab.py
# forked from yufengm/Adaptive

import os
import pickle
import string
from collections import Counter

import nltk

from data_utils import get_karpathy_split, get_refcoco_captions


class Vocabulary(object):
    """Simple vocabulary wrapper mapping words to integer ids."""

    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0
        self.unknown_token = '<unk>'

    def add_word(self, word):
        # Assign the next free id to unseen words; known words are ignored.
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __call__(self, word):
        # Out-of-vocabulary words fall back to the <unk> id.
        if word not in self.word2idx:
            return self.word2idx[self.unknown_token]
        return self.word2idx[word]

    def __len__(self):
        return len(self.word2idx)
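

# Illustrative usage of the wrapper above (a sketch, not part of the original
# script; the ids shown assume exactly this insertion order):
#
#   vocab = Vocabulary()
#   vocab.add_word(vocab.unknown_token)  # id 0
#   vocab.add_word('cat')                # id 1
#   vocab('cat')   # -> 1
#   vocab('dog')   # -> 0, unseen words map to the <unk> id
#   len(vocab)     # -> 2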


def build_vocab(caption_list, threshold):
    """Build a simple vocabulary wrapper from a list of caption strings."""
    vocab = Vocabulary()
    # Special tokens first, so they get stable ids 0-3.
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word(vocab.unknown_token)

    # Lowercase each caption, strip punctuation, and tokenize.
    tokens = []
    for c in caption_list:
        c = c.casefold()
        c = c.translate(str.maketrans('', '', string.punctuation))
        tokens += nltk.word_tokenize(c)

    # Keep only words that occur at least `threshold` times.
    counter = Counter(tokens)
    words = [w for w, cnt in counter.items() if cnt >= threshold]
    for w in words:
        vocab.add_word(w)
    return vocab
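

# Sketch of the thresholding behavior (assumes NLTK's 'punkt' tokenizer data
# is available, e.g. via nltk.download('punkt')):
#
#   build_vocab(['A cat.', 'a cat and a dog!'], threshold=2)
#
# After casefolding and punctuation stripping, the token counts are
# {'a': 3, 'cat': 2, 'and': 1, 'dog': 1}, so only 'a' and 'cat' enter the
# vocabulary; 'and' and 'dog' later resolve to the <unk> id. The special
# tokens <pad>/<start>/<end>/<unk> always occupy ids 0-3.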


def main(
    coco_threshold, refcoco_threshold,
    splits_path, caps_path,
    refcoco_path, refcocoplus_path,
    refcocog_path, out_dir
):
    # Create the vocab directory if it doesn't exist.
    if not os.path.isdir(out_dir):
        print('create vocab directory: {}'.format(out_dir))
        os.makedirs(out_dir)

    # COCO captions: build the vocabulary from the Karpathy training split.
    print('generate vocab for coco captions')
    caps_df = get_karpathy_split(splits_path=splits_path, caps_path=caps_path)
    train_caps = caps_df.loc[caps_df.split == 'train'].caption.to_list()
    vocab = build_vocab(train_caps, coco_threshold)
    out_path = os.path.join(out_dir, 'coco_vocab.pkl')
    with open(out_path, 'wb') as f:
        pickle.dump(vocab, f)
    print('saved vocab with size {} to {}.'.format(len(vocab), out_path))

    # RefCOCO, RefCOCO+ and RefCOCOg follow the same recipe, so handle them
    # in one loop instead of three copy-pasted blocks.
    for label, out_name, path in [
        ('refcoco', 'refcoco_vocab.pkl', refcoco_path),
        ('refcoco+', 'refcocoplus_vocab.pkl', refcocoplus_path),
        ('refcocog', 'refcocog_vocab.pkl', refcocog_path),
    ]:
        print('generate vocab for {}'.format(label))
        reg_df = get_refcoco_captions(path)
        train_caps = reg_df.loc[reg_df.split == 'train'].caption.to_list()
        vocab = build_vocab(train_caps, refcoco_threshold)
        out_path = os.path.join(out_dir, out_name)
        with open(out_path, 'wb') as f:
            pickle.dump(vocab, f)
        print('saved vocab with size {} to {}.'.format(len(vocab), out_path))


if __name__ == '__main__':
    main(
        coco_threshold=5,
        refcoco_threshold=3,
        splits_path='./data/splits/karpathy/caption_datasets/',
        caps_path='./data/captions/',
        refcoco_path='./data/refcoco/',
        refcocoplus_path='./data/refcoco+/',
        refcocog_path='./data/refcocog/',
        out_dir='/home/simeon/Dokumente/Code/Uni/Repos/Adaptive/data/'
    )
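

# Loading a saved vocabulary later (a minimal sketch; note that unpickling
# needs the Vocabulary class importable under the same module path that was
# active when dumping, and the .pkl path below depends on out_dir):
#
#   import pickle
#   from build_vocab import Vocabulary  # noqa: F401
#
#   with open('coco_vocab.pkl', 'rb') as f:
#       vocab = pickle.load(f)
#   # Encode some pre-tokenized caption `tokens` as ids:
#   ids = [vocab('<start>')] + [vocab(w) for w in tokens] + [vocab('<end>')]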