forked from thepycoder/sarcasm_detector
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain_sklearn.py
108 lines (91 loc) · 3.38 KB
/
train_sklearn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import os
import time
from pathlib import Path
from uuid import uuid4
import joblib
from clearml import Dataset, Task
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from utils import plot_confusion_matrix
class SklearnTrainer():
    """Train a TF-IDF + logistic-regression sarcasm classifier, tracked with ClearML.

    The constructor registers a ClearML task, records the hyperparameters, and
    builds an sklearn Pipeline. Call :meth:`train` to fetch the dataset, fit,
    evaluate, and persist the fitted pipeline to disk.
    """

    def __init__(self, model='LinearRegression', seed=42, subset_size=0):
        """Set up experiment tracking and build the pipeline.

        :param model: model identifier. ``'LinearRegression'`` is the historical
            name and (like ``'LogisticRegression'``) builds a LogisticRegression;
            any other value raises ValueError.
        :param seed: random seed used for the train/test split (and forwarded to
            the model config).
        :param subset_size: if non-zero, truncate the training split to this
            many rows (useful for quick debugging runs).
        :raises ValueError: if ``model`` is not a recognized identifier.
        """
        # NOTE: Task.init contacts the ClearML backend; output_uri=True uploads
        # model artifacts to the default output destination.
        self.task = Task.init(
            project_name="sarcasm_detector",
            task_name="Sklearn Training",
            output_uri=True
        )
        self.task.set_parameter("model", model)
        self.task.set_parameter("seed", seed)
        self.task.set_parameter("subset_size", subset_size)
        self.seed = seed
        self.model = model
        self.subset_size = subset_size
        self.pipeline = self.create_pipeline()

    def create_pipeline(self):
        """Return a Pipeline of a word/bigram TF-IDF vectorizer and a classifier.

        :raises ValueError: if ``self.model`` is not a recognized identifier.
        """
        # Vectorizer: unigrams + bigrams, vocabulary capped at 50k terms,
        # terms must appear in at least 2 documents.
        vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=50000, min_df=2)
        # Model. 'LinearRegression' is kept for backward compatibility: this
        # branch has always built a (multinomial) logistic regression,
        # a.k.a. softmax classifier.
        if self.model in ("LinearRegression", "LogisticRegression"):
            cfg = {
                "C": 1,
                "n_jobs": 4,
                "solver": 'lbfgs',
                # Tied to the run seed instead of the previous hard-coded 17;
                # the lbfgs solver does not consume random_state, so results
                # are unaffected.
                "random_state": self.seed,
                "verbose": 1
            }
            # connect() makes the config editable/trackable in the ClearML UI.
            self.task.connect(cfg)
            model = LogisticRegression(
                **cfg
            )
        else:
            # Fail fast: the original code set model=None here, which only
            # crashed later inside Pipeline.fit with an opaque error.
            raise ValueError(f"Unknown model type: {self.model!r}")
        # Pipeline
        return Pipeline([('vectorizer', vectorizer), ('model', model)])

    def get_data(self):
        """Fetch the ClearML dataset and return train/test texts and labels.

        :returns: tuple ``(train_texts, train_labels, test_texts, test_labels)``.
        """
        local_dataset_path = Path(Dataset.get(
            dataset_project="sarcasm_detector",
            dataset_name="sarcasm_dataset",
            alias="sarcasm_dataset"
        ).get_local_copy())
        # sorted(): os.listdir order is filesystem-dependent, which would make
        # the shuffled split below non-reproducible across machines even with
        # a fixed seed.
        dataset = load_dataset(
            "csv",
            data_files=[str(local_dataset_path / csv_path)
                        for csv_path in sorted(os.listdir(local_dataset_path))
                        ],
            split="all"
        )
        dataset = dataset.train_test_split(
            test_size=0.2,
            shuffle=True,
            seed=self.seed
        )
        if self.subset_size:
            dataset['train'] = dataset['train'].select(range(self.subset_size))
        # Drop rows with empty/missing comment text. NOTE: this runs after the
        # subset selection, so the effective training size may be slightly
        # below subset_size.
        dataset = dataset.filter(lambda x: bool(x['comment']))
        return (dataset['train']['comment'],
                dataset['train']['label'],
                dataset['test']['comment'],
                dataset['test']['label'])

    def train(self):
        """Fit the pipeline, log metrics to ClearML, and save the model to disk."""
        train, y_train, test, y_test = self.get_data()
        start_training = time.time()
        self.pipeline.fit(train, y_train)
        self.task.get_logger().report_single_value("train_runtime", time.time() - start_training)
        y_pred = self.pipeline.predict(test)
        self.task.get_logger().report_single_value("Accuracy", accuracy_score(y_test, y_pred))
        # Logged as a ClearML plot by the project-local helper.
        plot_confusion_matrix(
            y_test,
            y_pred,
            ["NORMAL", "SARCASTIC"],
            figsize=(8, 8),
            title=f"{self.model} Confusion Matrix"
        )
        os.makedirs("my_awesome_model", exist_ok=True)
        # uuid4 in the filename prevents successive runs from overwriting
        # each other's artifacts.
        joblib.dump(self.pipeline, f"my_awesome_model/sklearn_classifier_{uuid4()}.joblib")
if __name__ == '__main__':
    # Quick debugging run: train on a 1000-row subset of the training split.
    trainer = SklearnTrainer(subset_size=1000)
    trainer.train()