stats-bleu.py (executable file, forked from alex-berard/seq2seq)
#!/usr/bin/env python3
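"""Bootstrap confidence interval for corpus-level BLEU.

Repeatedly resamples (hypothesis, reference) pairs with replacement,
scores each sample with corpus_bleu, and prints the range of scores
that remains after trimming the lowest and highest p/2 fraction.

Usage:
    stats-bleu.py SOURCE TARGET [--draws N] [--sample-size N] [-p P] [-i] [--max-size N]

SOURCE contains the hypotheses (one per line) and TARGET the references.
"""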
import argparse
import sys
import numpy as np
import re
from translate.evaluation import corpus_bleu, corpus_ter, corpus_wer
from collections import OrderedDict

parser = argparse.ArgumentParser()
parser.add_argument('source')
parser.add_argument('target')
parser.add_argument('--bleu', action='store_true')
#parser.add_argument('--ter', action='store_true')
#parser.add_argument('--wer', action='store_true')
#parser.add_argument('--all', '-a', action='store_true')
parser.add_argument('--max-size', type=int)
parser.add_argument('--case-insensitive', '-i', action='store_true')
parser.add_argument('--draws', type=int, default=1000)
parser.add_argument('--sample-size', type=int, default=0)
parser.add_argument('-p', type=float, default=0.05)

if __name__ == '__main__':
    args = parser.parse_args()

    with open(args.source) as src_file, open(args.target) as trg_file:
        if args.case_insensitive:
            hypotheses = [line.strip().lower() for line in src_file]
            references = [line.strip().lower() for line in trg_file]
        else:
            hypotheses = [line.strip() for line in src_file]
            references = [line.strip() for line in trg_file]

    if args.max_size is not None:
        hypotheses = hypotheses[:args.max_size]
        references = references[:args.max_size]

    if len(hypotheses) != len(references):
        sys.stderr.write("warning: source and target don't have the same length\n")
        size = min(len(hypotheses), len(references))
        hypotheses = hypotheses[:size]
        references = references[:size]

    if args.sample_size == 0:
        args.sample_size = len(hypotheses)

    hypotheses = np.array(hypotheses)
    references = np.array(references)

    bleu_scores = []
    for _ in range(args.draws):
        # draw a bootstrap sample of hypothesis/reference pairs (with replacement)
        # and score it with corpus-level BLEU
        indices = np.random.randint(len(hypotheses), size=args.sample_size)
        hypotheses_ = hypotheses[indices]
        references_ = references[indices]
        bleu, _ = corpus_bleu(hypotheses_, references_)
        bleu_scores.append(bleu)

    # drop the p/2 lowest and p/2 highest scores and report the remaining range,
    # i.e. an empirical (1 - p) confidence interval for the corpus BLEU score
    bleu_scores = sorted(bleu_scores)
    k = int(len(bleu_scores) * args.p) // 2
    bleu_scores = bleu_scores[k:len(bleu_scores) - k]
    print('[{:.3f}, {:.3f}]'.format(bleu_scores[0], bleu_scores[-1]))