# split_dataset.py (forked from pedropro/TACO)
import os.path
import json
import argparse
import numpy as np
import random
import datetime as dt
import copy
parser = argparse.ArgumentParser(description='User args')
parser.add_argument('--dataset_dir', required=True, help='Path to dataset annotations')
parser.add_argument('--test_percentage', type=int, default=10, required=False, help='Percentage of images used for the testing set')
parser.add_argument('--val_percentage', type=int, default=10, required=False, help='Percentage of images used for the validation set')
parser.add_argument('--nr_trials', type=int, default=10, required=False, help='Number of splits')
args = parser.parse_args()
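# Example invocation (paths are illustrative; only --dataset_dir is required,
# the other flags fall back to the defaults defined above):
#   python split_dataset.py --dataset_dir ./data --test_percentage 10 --val_percentage 10 --nr_trials 10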
ann_input_path = args.dataset_dir + '/' + 'annotations.json'
# Load annotations
with open(ann_input_path, 'r') as f:
    dataset = json.loads(f.read())
anns = dataset['annotations']
scene_anns = dataset['scene_annotations']
imgs = dataset['images']
nr_images = len(imgs)
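# int(x + 0.5) rounds each count to the nearest whole image; e.g. with 1500 images,
# 10% test and 10% val give 150 test images, 150 val images and 1200 training images.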
nr_testing_images = int(nr_images*args.test_percentage*0.01+0.5)
nr_nontraining_images = int(nr_images*(args.test_percentage+args.val_percentage)*0.01+0.5)
for i in range(args.nr_trials):
    random.shuffle(imgs)

    # Add new datasets
    train_set = {
        'info': None,
        'images': [],
        'annotations': [],
        'scene_annotations': [],
        'licenses': [],
        'categories': [],
        'scene_categories': [],
    }
    train_set['info'] = dataset['info']
    train_set['categories'] = dataset['categories']
    train_set['scene_categories'] = dataset['scene_categories']

    val_set = copy.deepcopy(train_set)
    test_set = copy.deepcopy(train_set)

    test_set['images'] = imgs[0:nr_testing_images]
    val_set['images'] = imgs[nr_testing_images:nr_nontraining_images]
    train_set['images'] = imgs[nr_nontraining_images:nr_images]

    # Aux image ids to split annotations
    test_img_ids, val_img_ids, train_img_ids = [], [], []
    for img in test_set['images']:
        test_img_ids.append(img['id'])

    for img in val_set['images']:
        val_img_ids.append(img['id'])

    for img in train_set['images']:
        train_img_ids.append(img['id'])

    # Split instance annotations
    for ann in anns:
        if ann['image_id'] in test_img_ids:
            test_set['annotations'].append(ann)
        elif ann['image_id'] in val_img_ids:
            val_set['annotations'].append(ann)
        elif ann['image_id'] in train_img_ids:
            train_set['annotations'].append(ann)

    # Split scene tags
    for ann in scene_anns:
        if ann['image_id'] in test_img_ids:
            test_set['scene_annotations'].append(ann)
        elif ann['image_id'] in val_img_ids:
            val_set['scene_annotations'].append(ann)
        elif ann['image_id'] in train_img_ids:
            train_set['scene_annotations'].append(ann)

    # Write dataset splits
    ann_train_out_path = args.dataset_dir + '/' + 'annotations_' + str(i) + '_train.json'
    ann_val_out_path = args.dataset_dir + '/' + 'annotations_' + str(i) + '_val.json'
    ann_test_out_path = args.dataset_dir + '/' + 'annotations_' + str(i) + '_test.json'

    with open(ann_train_out_path, 'w+') as f:
        f.write(json.dumps(train_set))

    with open(ann_val_out_path, 'w+') as f:
        f.write(json.dumps(val_set))

    with open(ann_test_out_path, 'w+') as f:
        f.write(json.dumps(test_set))
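
# Each split file keeps the COCO-style top-level keys ('images', 'annotations',
# 'categories', ...), so it can presumably be loaded like the original annotations,
# e.g. with pycocotools (illustrative snippet, not part of this script):
#   from pycocotools.coco import COCO
#   coco_train = COCO('<dataset_dir>/annotations_0_train.json')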