-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patheval.py
142 lines (119 loc) · 5.99 KB
/
eval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import argparse
import random, sys, os, time
from typing import Dict
from tqdm import tqdm
from parse_tree import ParseTree, ParseNode
from grammar import Grammar, Rule
from start import get_times, START
from lark import Lark
from oracle import CachingOracle, ExternalOracle
import string
"""
High-level command line to launch Arvada evaluation.
See __main__ dispatch at the bottom for usage.
"""
PRECISION_SIZE=1000
def main_internal(external_folder, log_file, random_guides=False):
"""
`external_folder`: the base folder for the benchmark, which contains:
- random-guides: dir of random guide examples
- guides: dir of minimal guide examples
- test_set: dir of held-out test examples
- parse_bench_name: the parser command (oracle). assume bench_name is the
base (i.e. without parent directories) name of external_folder
`log_file`: where to write results
`fast`: use internal caching oracle created with the Lark grammar, instead
of the external command
`random_guides`: learn from the guide examples in random-guides instead of guides
"""
import os
bench_name = os.path.basename(external_folder)
test_folder = os.path.join(external_folder, "test_set")
parser_command = os.path.join(external_folder, f"parse_{bench_name}")
main(parser_command, log_file, test_folder)
def main(oracle_cmd, log_file_name, test_examples_folder ):
oracle = ExternalOracle(oracle_cmd)
real_recall_set = []
for filename in os.listdir(test_examples_folder):
full_filename = os.path.join(test_examples_folder, filename)
test_raw = open(full_filename).read()
real_recall_set.append(test_raw)
# TODO: make an option to try
# Create the log file and write positive and negative examples to it
# Also write the initial starting grammar to the file
with open(log_file_name + ".eval", 'w+') as f:
start_time = time.time()
import pickle
learned_grammar = Grammar(START)
grammar_dict : Dict[str, Rule] = pickle.load(open(log_file_name + ".gramdict", "rb"))
for key, rule in grammar_dict.items():
learned_grammar.add_rule(rule)
try:
learned_grammar.parser()
print('\n\nInitial grammar loaded:\n%s' % str(learned_grammar), file=f)
except Exception as e:
print('\n\nLoaded grammar does not compile! %s' % str(e), file=f)
print(learned_grammar, file=f)
exit()
parser: Lark = learned_grammar.parser()
precision_set = learned_grammar.sample_positives(PRECISION_SIZE, 5)
num_precision_parsed = 0
print(f"Precision set (size {len(precision_set)}):", file=f)
print(f"Precision set (size {len(precision_set)}):")
print("Eval of precision:")
for example in tqdm(precision_set):
try:
oracle.parse(example, timeout=10)
print("Passed\n", example, file=f)
num_precision_parsed += 1
except Exception as e:
print("Failed", example, " <----- FAILURE", file=f)
continue
example_gen_time = time.time()
num_recall_parsed = 0
if real_recall_set is not None:
print(f"Recall set (size {len(real_recall_set)}):", file=f)
print(f"Recall set (size {len(real_recall_set)}):")
print("Recall eval:")
for example in tqdm(real_recall_set):
try:
parser.parse(example)
print(" ", example, file=f)
num_recall_parsed += 1
except Exception as e:
print(" ", example, " <----- FAILURE", file=f)
continue
print(
f'Recall: {num_recall_parsed / len(real_recall_set)}, Precision: {num_precision_parsed / len(precision_set)}',
file=f)
print(
f'Recall: {num_recall_parsed / len(real_recall_set)}, Precision: {num_precision_parsed / len(precision_set)}')
else:
print(
f'Recall: [no test set provided], Precision: {num_precision_parsed / len(precision_set)}',
file=f)
print(
f'Recall: [no test set provided], Precision: {num_precision_parsed / len(precision_set)}')
print(f'Example gen time: {example_gen_time - start_time}', file=f)
print(f'Scoring time: {time.time() - example_gen_time}', file=f)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
subparser = parser.add_subparsers(dest='mode', help='benchmark mode (probably external unless you match the internal format)')
internal_parser = subparser.add_parser('internal')
external_parser = subparser.add_parser('external')
internal_parser.add_argument('bench_folder', help='folder containing the benchmark', type=str)
internal_parser.add_argument('log_file', help='log file output from search.py', type=str)
external_parser.add_argument('oracle_cmd', help='the oracle command; should be invocable on a filename via `oracle_cmd filename`, and return a non-zero exit code on invalid inputs', type=str)
external_parser.add_argument('examples_dir', help='folder containing the test (recall) examples', type=str)
external_parser.add_argument('log_file', help='log file output from search.py', type=str)
external_parser.add_argument('-n', '--precision_set_size', help='size of precision set to sample from learned grammar (default 1000)', type=int, default=1000)
args = parser.parse_args()
if args.mode == 'internal':
main_internal(args.bench_folder, args.log_file)
elif args.mode == 'external':
if args.precision_set_size is not None:
PRECISION_SIZE = args.precision_set_size
main(args.oracle_cmd, args.log_file, args.examples_dir)
else:
parser.print_help()
exit(1)