-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsklearn_utils.py
106 lines (92 loc) · 3.82 KB
/
sklearn_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/env python
# -*-coding:utf-8 -*-
"""
@Project : Study
@File : sklearn_utils.py
@Time : 2024-04-10 13:54
@Author : Wu Xiaomin <>
@Version : 1.0
@License : (C)Copyright 2024, Wu Xiaomin
@Desc : Utility helpers for scikit-learn workflows
"""
import inspect
import os
from collections import defaultdict
from pathlib import Path
import pandas as pd
from joblib import load
from openpyxl import load_workbook
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
def get_var_name(var):
    """Return the local-variable name bound to ``var`` two frames up the stack.

    The frame inspected is the caller of the function that called
    ``get_var_name`` (frame 0 is this function, frame 1 its caller, frame 2
    the frame whose locals are searched).

    Args:
        var: Object to look up; candidates are matched by identity (``is``).

    Returns:
        The first name in the inspected frame's locals that refers to ``var``.
        A local literally named ``"var"`` is skipped.

    Raises:
        ValueError: If no local in the inspected frame refers to ``var``, or
            if the call stack is too shallow to have such a frame.
    """
    frame = inspect.currentframe()
    try:
        # Walk two frames up via f_back instead of materialising the whole
        # stack with inspect.getouterframes(), which resolves source context
        # for every frame just to pick one of them.
        for _ in range(2):
            if frame is None:
                break
            frame = frame.f_back
        if frame is not None:
            for name, value in frame.f_locals.items():
                if value is var and name != "var":
                    return name
        raise ValueError(f"Variable {var} not found in local scope.")
    finally:
        # Drop the frame reference so no frame <-> local reference cycle
        # outlives this call.
        del frame
class ClassifierZoo(object):
    """Train and evaluate a group of classifiers, writing Excel reports.

    Configuration is a plain dict (``self.config``) seeded with defaults and
    overridden by keyword arguments passed to the constructor:

    * ``dump_path``: directory the Excel reports are written to.
    * ``scoring``: scorer name handed to ``cross_val_score``.
    * ``cv``: cross-validation setting handed to ``cross_val_score``.
    """

    def __init__(self, **kwargs):
        """Create an empty zoo; ``kwargs`` override the default config keys."""
        self.classifiers = []
        self.config = {
            "dump_path": "./",
            # Classification scorers: accuracy, f1, f1_micro, f1_macro (the
            # latter two are the multi-class variants of f1_score),
            # precision, recall, roc_auc.
            "scoring": "f1_micro",
            "cv": 5,
        }
        self.config.update(kwargs)
        # Column name -> list of per-classifier values; rows accumulate
        # across successive train() calls.
        self.train_report = defaultdict(list)

    def add_classifier(self, classifier):
        """Append ``classifier`` to the list of managed classifiers."""
        self.classifiers.append(classifier)

    @classmethod
    def from_model_path(cls, model_path: os.PathLike, **kwargs):
        """Build a zoo from every ``*.pkl`` file directly inside ``model_path``.

        Args:
            model_path: Directory containing the serialized models.
            **kwargs: Forwarded to the constructor as config overrides.

        Returns:
            A new instance holding one classifier per ``*.pkl`` file.
        """
        instance = cls(**kwargs)
        # NOTE(security): joblib.load unpickles arbitrary objects; only point
        # this at model directories you trust.
        # Sorted so the classifier order does not depend on directory order.
        for model_file in sorted(Path(model_path).glob("*.pkl")):
            instance.add_classifier(load(model_file))
        return instance

    @staticmethod
    def _fallback_name(index, classifier):
        """Return a display name for a classifier with no caller-side variable."""
        return f"{type(classifier).__name__}_{index}"

    def _report_path(self, filename):
        """Return ``dump_path / filename``, creating the dump directory if needed."""
        dump_dir = Path(self.config["dump_path"])
        dump_dir.mkdir(parents=True, exist_ok=True)
        return dump_dir / filename

    def train(self, X, y):
        """Fit every classifier on ``(X, y)`` and record cross-validation scores.

        For each classifier the mean and standard deviation of the
        ``cross_val_score`` results are printed and appended to
        ``self.train_report``; the accumulated report is then written to
        ``<dump_path>/train_report.xlsx``.
        """
        scoring = self.config["scoring"]
        for index, clf in enumerate(self.classifiers):
            # get_var_name must be called directly from this method: it
            # inspects the frame two levels above itself, i.e. our caller.
            try:
                clf_name = get_var_name(clf)
            except ValueError:
                # E.g. classifiers loaded by from_model_path are never bound
                # to a variable in the caller's frame.
                clf_name = self._fallback_name(index, clf)
            clf.fit(X, y)
            scores = cross_val_score(clf, X, y, scoring=scoring, cv=self.config["cv"])
            mean_score = scores.mean()
            std_score = scores.std()
            print(f"train {clf_name}, score_metric: {scoring}")
            print(mean_score)
            print(std_score)
            self.train_report["classifier"].append(clf_name)
            self.train_report[scoring + "_mean"].append(mean_score)
            self.train_report[scoring + "_std"].append(std_score)
        print(f"train {len(self.classifiers)} classifiers complete.")
        pd.DataFrame(self.train_report).to_excel(self._report_path("train_report.xlsx"))

    def test(self, X_test, y_test):
        """Write one classification-report sheet per classifier.

        Each classifier predicts on ``X_test``; its ``classification_report``
        is stored as a sheet (named after the classifier) in
        ``<dump_path>/test_report.xlsx``.
        """
        # A Path (not a str) because summay_excel_report calls .exists() on it.
        report_path = self._report_path("test_report.xlsx")
        for index, clf in enumerate(self.classifiers):
            # Direct call for the same frame-depth reason as in train().
            try:
                clf_name = get_var_name(clf)
            except ValueError:
                clf_name = self._fallback_name(index, clf)
            y_pred = clf.predict(X_test)
            report = classification_report(y_test, y_pred, output_dict=True)
            # summay_excel_report writes with index=False, so move the row
            # labels out of the index into a regular column to keep them.
            table = pd.DataFrame(report).T.rename_axis("label").reset_index()
            summay_excel_report(table, report_path, clf_name)
def summay_excel_report(df: pd.DataFrame, filename: os.PathLike, sheet: str, adjust: bool = True) -> None:
    """Write ``df`` to the sheet ``sheet`` of the Excel workbook ``filename``.

    If the workbook already exists the sheet is added to it, replacing any
    sheet of the same name; otherwise a new workbook is created. The frame is
    written without its index.

    Args:
        df: Table to write.
        filename: Workbook path; a ``str`` or any path-like object.
        sheet: Name of the target sheet.
        adjust: When true, column widths of the workbook are re-fitted after
            writing.
    """
    # Coerce first: plain strings (and generic os.PathLike objects) have no
    # .exists() method.
    path = Path(filename)
    if path.exists():
        # Append mode keeps the other sheets already present in the workbook.
        writer_options = {"mode": "a", "if_sheet_exists": "replace"}
    else:
        writer_options = {}
    with pd.ExcelWriter(path, engine="openpyxl", **writer_options) as writer:
        df.to_excel(writer, sheet_name=sheet, index=False)
    if adjust:
        adjust_column_width(path)
def adjust_column_width(file_name: os.PathLike) -> None:
    """Resize every column of every sheet in a workbook to fit its content.

    The workbook is loaded, each column's width is set to
    ``(longest_cell_text + 2) * 1.2``, and the workbook is saved back to the
    same path.

    Args:
        file_name: Path of the workbook to modify in place.
    """
    wb = load_workbook(filename=file_name)
    for sheet_name in wb.sheetnames:
        ws = wb[sheet_name]
        for column in ws.columns:
            # Measure the text form of each value so numbers and dates count
            # too; empty cells (None) contribute nothing.
            longest = max(
                (len(str(cell.value)) for cell in column if cell.value is not None),
                default=0,
            )
            column_letter = column[0].column_letter
            ws.column_dimensions[column_letter].width = (longest + 2) * 1.2
    wb.save(filename=file_name)