-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patheval_line.py
157 lines (117 loc) · 4.86 KB
/
eval_line.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import re
import os
import json
import nltk.translate.bleu_score as bleu
from codebleu import calc_codebleu
from nltk.translate.bleu_score import SmoothingFunction
import editdistance
from utils import tokenize_for_bleu_eval, truncate, dpo_deduplicate_jsonl
import argparse
import logging
from tqdm import tqdm
def deduplicate_samples(samples, max_samples=3):
    """Filter and deduplicate model completion samples.

    Drops samples whose completion is missing, equal to the ground-truth
    output, or contained in it; keeps the first occurrence of each distinct
    completion, up to ``max_samples`` results.

    Args:
        samples: iterable of dicts with "completion" and "output" keys.
        max_samples: maximum number of samples to return (default 3).

    Returns:
        A list of at most ``max_samples`` unique, filtered sample dicts,
        in their original order.
    """
    seen_completions = set()
    valid_samples = []
    for sample in samples:
        completion = sample.get("completion")
        ground_truth = sample.get("output")
        # Filtering: both fields must exist, and the completion must differ
        # from (and not be a substring of) the ground truth.
        if ground_truth is None or completion is None:
            continue
        if completion == ground_truth or completion in ground_truth:
            continue
        # Deduplicate on completion text, keeping the first occurrence.
        if completion not in seen_completions:
            seen_completions.add(completion)
            valid_samples.append(sample)
        # Stop early once enough unique samples are collected; equivalent to
        # the original's trailing slice, but without scanning the remainder.
        if len(valid_samples) == max_samples:
            break
    return valid_samples
# Evaluation driver: scores single-line completions (exact match, unigram
# BLEU, CodeBLEU, token edit distance) and writes a deduplicated subset of
# samples per namespace to a merged jsonl file.
logging.basicConfig(level=logging.ERROR)

# Command-line argument parsing.
parser = argparse.ArgumentParser(description="Run model with specified parameters.")
parser.add_argument("--work_dir", type=str, required=True, help="Working directory")
parser.add_argument("--model_name", type=str, required=True, help="Model name")
parser.add_argument("--model_path", type=str, required=True, help="Model path")
parser.add_argument("--bleu_threshold", type=float, default=0.5, help="BLEU_threshold for filter")
args = parser.parse_args()

# Echo parsed arguments (debugging aid).
print(f"Work Directory: {args.work_dir}")
print(f"Model Name: {args.model_name}")

work_dir = args.work_dir
model_name = args.model_name
model_path = args.model_path
bleu_threshold = args.bleu_threshold  # NOTE(review): parsed but unused here — presumably consumed by a later filtering stage; confirm.
in_file = False

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)

merge_file = os.path.join(work_dir, model_name, "line_results.jsonl")
final_file = os.path.join(work_dir, model_name, "line_final_results.jsonl")  # NOTE(review): unused in this script; kept for downstream tooling.
save_dir = os.path.join(work_dir, model_name, "line")
if in_file:
    result_file = os.path.join(save_dir, "results_infile.jsonl")
else:
    result_file = os.path.join(save_dir, "results.jsonl")

# Aggregate metric accumulators.
tot_completion_token_len = 0
exact_match = 0
bleu_score = 0
length = 0
distance = 0
code_bleu = 0

# Group samples by namespace plus ground-truth output, so samples that share
# both land in the same bucket for deduplication.
namespace_samples = {}
with open(result_file, "r") as f:
    for line in f:
        example = json.loads(line)
        key = example.get("namespace") + example.get("output", "")
        namespace_samples.setdefault(key, []).append(example)

# Score every sample, and persist the deduplicated subset of each namespace.
count = 0
# Open the merge file once for the whole run (previously re-opened in append
# mode for every namespace); note "a" means repeated runs accumulate lines.
with open(merge_file, "a") as f_out:
    for namespace, samples in tqdm(namespace_samples.items()):
        unique_samples = deduplicate_samples(samples)
        for example in samples:
            length += 1
            code = example["completion"]
            tot_completion_token_len += len(tokenizer(example["completion"], truncation=False, add_special_tokens=False)['input_ids'])
            ground_truth = example["output"]
            if code == ground_truth:
                exact_match += 1
                example["exact_match"] = 1
            else:
                example["exact_match"] = 0
            # Unigram-only BLEU (weights (1,0,0,0)) and CodeBLEU vs. the reference.
            result = calc_codebleu([ground_truth], [code], lang="python", weights=(1, 0, 0, 0), tokenizer=tokenize_for_bleu_eval)
            example["ngram_match_score"] = result["ngram_match_score"]
            bleu_score += result["ngram_match_score"]
            example["codebleu"] = result["codebleu"]
            code_bleu += result["codebleu"]
            # Token-level edit distance between completion and reference.
            code_tokens = tokenize_for_bleu_eval(code)
            ground_truth_tokens = tokenize_for_bleu_eval(ground_truth)
            example["editdistance"] = editdistance.eval(code_tokens, ground_truth_tokens)
            distance += example["editdistance"]
        for example in unique_samples:
            count += 1
            f_out.write(json.dumps(example) + "\n")

# Final averaged metrics; fail with a clear message on an empty result file
# instead of an opaque ZeroDivisionError.
if length == 0:
    raise SystemExit(f"No samples found in {result_file}")
exact_match_score = exact_match / length
bleu_avg_score = bleu_score / length
code_bleu_avg = code_bleu / length
avg_edit_distance = distance / length
avg_token_len = tot_completion_token_len / length
print(f"Exact Match: {exact_match_score:.4f}")
print(f"BLEU Score: {bleu_avg_score:.4f}")
print(f"Code BLEU: {code_bleu_avg:.4f}")
print(f"Average Edit Distance: {avg_edit_distance:.4f}")
print(f"Average Token Length: {avg_token_len:.4f}")
print("All Saved Unique samples:", count)

# docker exec -it lhy_Deveval python /workspace/liuhuanyu/RepoCoder/new_eval/eval_line.py \
# --work_dir="/workspace/liuhuanyu/RepoCoder/Results_5" \
# --model_path="/workspace/zhuhao/models/aixcoder/1207_single_line/checkpoint-682" \
# --model_name="1207_single_line-checkpoint-682"