-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patheval_span.py
144 lines (108 loc) · 4.58 KB
/
eval_span.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import re
import os
import json
import nltk.translate.bleu_score as bleu
from codebleu import calc_codebleu
from nltk.translate.bleu_score import SmoothingFunction
import editdistance
from utils import tokenize_for_bleu_eval, truncate, dpo_deduplicate_jsonl
import argparse
import logging
from tqdm import tqdm
def deduplicate_samples(samples, max_samples=3):
    """Filter generation samples down to at most ``max_samples`` unique,
    non-trivial completions.

    A sample is kept only when:
      * both ``completion`` and ``output`` (ground truth) are present,
      * the completion is not a substring of (or equal to) the ground truth,
      * the completion has not already been seen earlier in ``samples``.

    Args:
        samples: iterable of dicts carrying "completion" and "output" keys.
        max_samples: maximum number of samples to return (default 3).

    Returns:
        A list of at most ``max_samples`` sample dicts, in original order.
    """
    seen_completions = set()
    valid_samples = []
    for sample in samples:
        completion = sample.get("completion")
        ground_truth = sample.get("output")
        # Skip malformed records missing either field.
        if ground_truth is None or completion is None:
            continue
        # Skip trivial completions: a string equal to the ground truth is
        # also a substring of it, so one containment check covers both.
        if completion in ground_truth:
            continue
        if completion not in seen_completions:
            seen_completions.add(completion)
            valid_samples.append(sample)
    # Slicing already handles len(valid_samples) < max_samples safely, so the
    # original explicit length check was redundant.
    return valid_samples[:max_samples]
# --- CLI parsing and path configuration ---------------------------------
logging.basicConfig(level=logging.ERROR)

parser = argparse.ArgumentParser(description="Run model with specified parameters.")
parser.add_argument("--work_dir", type=str, required=True, help="Working directory")
parser.add_argument("--model_name", type=str, required=True, help="Model name")
parser.add_argument("--model_path", type=str, required=True, help="Model path")
parser.add_argument("--bleu_threshold", type=float, default=0.5, help="BLEU_threshold for filter")
# Generalized from a hard-coded `in_file = False`; the default (flag absent)
# preserves the original behavior of reading results.jsonl.
parser.add_argument("--in_file", action="store_true",
                    help="Evaluate the in-file variant (results_infile.jsonl)")
args = parser.parse_args()

print(f"Work Directory: {args.work_dir}")
print(f"Model Name: {args.model_name}")

work_dir = args.work_dir
model_name = args.model_name
model_path = args.model_path
bleu_threshold = args.bleu_threshold  # NOTE(review): parsed but never used below — confirm intent
in_file = args.in_file

# Output locations derived from the work dir and model name.
merge_file = os.path.join(work_dir, model_name, "span_results.jsonl")
final_file = os.path.join(work_dir, model_name, "span_final_results.jsonl")
save_dir = os.path.join(work_dir, model_name, "span")
# Two result variants exist; pick one based on the --in_file flag.
if in_file:
    result_file = os.path.join(save_dir, "results_infile.jsonl")
else:
    result_file = os.path.join(save_dir, "results.jsonl")
from transformers import AutoTokenizer

# The tokenizer is only used below to measure completion length in model tokens.
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)

# Aggregate metric accumulators over every evaluated sample.
tot_completion_token_len = 0  # total completion length in tokenizer tokens
exact_match = 0               # count of completion == ground-truth matches
bleu_score = 0                # sum of per-sample n-gram match scores
length = 0                    # number of samples evaluated
tot = 0                       # NOTE(review): never updated below — confirm it is still needed
distance = 0                  # sum of token-level edit distances
code_bleu = 0                 # sum of per-sample CodeBLEU scores

# Group samples by namespace + ground-truth output so that duplicates of the
# same target are later deduplicated together.
namespace_samples = {}
with open(result_file, "r") as f:
    # Iterate the file lazily instead of materializing it with readlines().
    for line in f:
        example = json.loads(line)
        # Concatenating raises TypeError when "namespace" is absent (None),
        # exactly as the original two-step version did.
        key = example.get("namespace") + example.get("output", "")
        namespace_samples.setdefault(key, []).append(example)
count = 0  # number of unique samples written to merge_file
# Open the merge file ONCE in append mode; the original re-opened it inside
# the namespace loop on every iteration, which is wasteful.
with open(merge_file, "a") as f_out:
    for namespace, samples in tqdm(namespace_samples.items()):
        unique_samples = deduplicate_samples(samples)
        # Metrics are computed over ALL samples; only the deduplicated
        # subset is persisted to merge_file below.
        for example in samples:
            length += 1
            code = example["completion"]
            ground_truth = example["output"]
            tot_completion_token_len += len(
                tokenizer(code, truncation=False, add_special_tokens=False)["input_ids"]
            )
            # Exact-match flag is both accumulated and stored on the sample.
            example["exact_match"] = 1 if code == ground_truth else 0
            exact_match += example["exact_match"]
            # CodeBLEU with weights (1, 0, 0, 0) isolates the n-gram match
            # component; both it and the combined codebleu score are recorded.
            result = calc_codebleu(
                [ground_truth], [code], lang="python",
                weights=(1, 0, 0, 0), tokenizer=tokenize_for_bleu_eval,
            )
            example["ngram_match_score"] = result["ngram_match_score"]
            bleu_score += result["ngram_match_score"]
            example["codebleu"] = result["codebleu"]
            code_bleu += result["codebleu"]
            # Token-level edit distance between completion and ground truth.
            code_tokens = tokenize_for_bleu_eval(code)
            ground_truth_tokens = tokenize_for_bleu_eval(ground_truth)
            example["editdistance"] = editdistance.eval(code_tokens, ground_truth_tokens)
            distance += example["editdistance"]
        for example in unique_samples:
            count += 1
            f_out.write(json.dumps(example) + "\n")
# Report corpus-level averages. Guard against an empty result file so the
# script prints a clear message instead of crashing with ZeroDivisionError.
if length == 0:
    print("No samples were evaluated; nothing to report.")
else:
    exact_match_score = exact_match / length
    bleu_avg_score = bleu_score / length
    code_bleu_avg = code_bleu / length
    avg_edit_distance = distance / length
    avg_token_len = tot_completion_token_len / length
    print(f"Exact Match: {exact_match_score:.4f}")
    print(f"BLEU Score: {bleu_avg_score:.4f}")
    print(f"Code BLEU: {code_bleu_avg:.4f}")
    print(f"Average Edit Distance: {avg_edit_distance:.4f}")
    print(f"Average Token Length: {avg_token_len:.4f}")
print("All Saved Unique samples:", count)
# Optional post-processing step, currently disabled:
# dpo_deduplicate_jsonl(merge_file, final_file)
# print("Deduplicated results saved to", merge_file, final_file)
#docker exec -it lhy_Deveval python /workspace/liuhuanyu/RepoCoder/new_eval/eval.py \
#--work_dir="/workspace/liuhuanyu/RepoCoder/Results-function-level" \
#--model_name="aixcoder-7b-base-psm"