-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess_data.py
131 lines (101 loc) · 3.72 KB
/
preprocess_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/python3
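"""Preprocess data.

Usage:
    python preprocess_data.py [--<task> [<mode> ...]] ...
    python preprocess_data.py --raw_data [<mode> ...]

A flag given without modes selects all modes. With no flags at all, every
task is processed in every mode. `--raw_data` cannot be combined with task
flags.
"""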
import argparse
from itertools import chain
from typing import List, Tuple, Dict, Set, Optional, Sequence

from constant import MODE_STR, NONE_MODE, MODE_ID
from constant import TASK_STR, TASK_ID
from constant import KNOWLEDGE_TASK, KNOWLEDGE_SUBTASKS
from dataset import RawData
from dataset.tidy_data import generate_tidy_data_file
# Command-line vocabulary.
# TASKS: one `--<task>` flag is registered per entry.
# MODES: the only values any flag accepts.
TASKS: List[str] = [*TASK_STR.values()]
MODES: List[str] = [*MODE_STR.values()]
# Name of the extra flag that asks for raw data only (no task).
RAW_DATA_ARG = 'raw_data'
def parse_cmd(
        argv: Optional[Sequence[str]] = None) -> Dict[str, List[str]]:
    """Parse commandline parameters.

    One optional flag is registered for every entry of `TASKS`, plus one
    for `RAW_DATA_ARG`. Each flag takes zero or more values, all of which
    must be members of `MODES`.

    Args:
        argv (Optional[Sequence[str]]): Argument strings to parse. When
            None (the default) argparse reads them from `sys.argv[1:]`,
            which is what the script entry point relies on.

    Returns:
        Dict[str, List[str]]: Parse result, keyed by flag name. A flag that
            was not given maps to None, a flag given without values maps
            to an empty list, otherwise to the list of given modes.
    """
    # Definition of argument parser.
    parser = argparse.ArgumentParser(
        description='Preprocess data. Specify multiple TASKS and modes at '
                    'the time may be faster than process them one by one.'
    )
    for task in TASKS:
        parser.add_argument(
            '--{}'.format(task),
            nargs='*',
            choices=MODES,
            help='{} task and its modes (train/valid/test).'.format(task)
        )
    parser.add_argument(
        '--{}'.format(RAW_DATA_ARG),
        nargs='*',
        choices=MODES,
        help='raw data of modes (train/valid/test).'
    )

    # Namespace -> Dict
    parse_res: Dict[str, List[str]] = vars(parser.parse_args(argv))
    return parse_res
def standardize_parse_result(
        cmd_args: Dict[str, List[str]]) -> Tuple[Dict[str, List[str]], Set[str]]:
    """Standardize parsing result.

    Note:
        A flag given without modes means all modes.
        For example: `--intention` means `--intention train valid test`.
        A task whose flag is absent gets NO modes, except when no flag at
        all is given: then every task gets every mode.
        `--raw_data` cannot be combined with task flags. When it is given,
        no task receives any mode and the total modes are the modes passed
        to `--raw_data` (all modes when none were passed).

    Args:
        cmd_args (Dict[str, List[str]]): Commandline arguments. A value of
            None means the flag was absent, an empty list means the flag
            was given without modes. The dict is modified in place: values
            are normalized and the `RAW_DATA_ARG` entry is removed.

    Returns:
        Tuple[Dict[str, List[str]], Set[str]]: Standardized parsing result
            (the same dict object that was passed in) and total modes.

    Raises:
        ValueError: If `--raw_data` is given together with any task flag.
    """
    raw_modes = cmd_args.get(RAW_DATA_ARG)
    raw_requested = raw_modes is not None

    # Validate before touching the dict so a rejected call leaves it intact.
    if raw_requested:
        for task, modes in cmd_args.items():
            if task != RAW_DATA_ARG and modes is not None:
                raise ValueError('raw_data and tasks are mutual exclusion.')
    cmd_args.pop(RAW_DATA_ARG, None)

    # Absent flag -> no modes; bare flag -> all modes. Each entry gets its
    # own copy so the module-level MODES list is never shared or mutated.
    for task, modes in cmd_args.items():
        if modes is None:
            cmd_args[task] = []
        elif not modes:
            cmd_args[task] = list(MODES)

    total_modes: Set[str]
    if raw_requested:
        # Honour the modes given to --raw_data; a bare flag means all modes.
        total_modes = set(raw_modes) if raw_modes else set(MODES)
    else:
        total_modes = set(chain.from_iterable(cmd_args.values()))

    # Special: no task is specified -> all task.
    if not total_modes:
        for task in TASKS:
            cmd_args[task] = list(MODES)
        total_modes = set(MODES)

    return cmd_args, total_modes
def main():
    """Run preprocessing for the tasks and modes chosen on the command line.

    Parses and standardizes the arguments, builds one `RawData` object for
    all required modes, then generates a tidy data file for each selected
    (task, mode) pair.
    """
    # Command line -> per-task mode lists plus the overall set of modes.
    selection, total_modes = standardize_parse_result(parse_cmd())
    print('Dataset will be processed: {}'.format(selection))
    print('Modes will be processed: {}'.format(total_modes))

    # OR the ids of all required modes together into a single value.
    mode_mask: int = NONE_MODE
    for mode_name in total_modes:
        mode_mask |= MODE_ID[mode_name]

    # Get necessary raw data.
    raw_data = RawData(mode_mask)

    # Generate tidy data file.
    for task_name, task_modes in selection.items():
        for mode_name in task_modes:
            task_id = TASK_ID[task_name]
            # The knowledge task is generated as its individual subtasks.
            if task_id == KNOWLEDGE_TASK:
                targets = KNOWLEDGE_SUBTASKS
            else:
                targets = (task_id,)
            for target in targets:
                generate_tidy_data_file(raw_data, target, MODE_ID[mode_name])
# Run the preprocessing only when this file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    main()