model.py
import json
import os
import os.path

import joblib
import numpy as np
import pandas as pd

from vf_portalytics.tool import rm_file_or_dir


class PredictionModel(object):
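    """Persist and apply a prediction model together with its preprocessing.

    The wrapped estimator only needs a ``predict`` method. ``features`` maps
    each input column to a collection of transform flags: 'C' marks a
    categorical column (label encoded and, when ``one_hot_encode`` is set,
    one-hot encoded) and 'log' applies a log transform. ``target`` maps the
    target column to its transforms, which ``predict`` inverts on the way
    out. Metadata is stored as ``<id>.meta`` (JSON) and the estimator as
    ``<id>.pkl`` (joblib) under ``path``.
    """
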
    def __init__(self, id, path=None, one_hot_encode=None):
        if id is None:
            raise ValueError('Model Id cannot be None')
        self.id = id
        # file paths
        self.path = path or '/srv/model'
        self.meta_path = os.path.join(self.path, str(self.id) + '.meta')
        self.model_path = os.path.join(self.path, str(self.id) + '.pkl')
        # try to load metadata
        self._load_metadata()
        if one_hot_encode is not None:
            self.one_hot_encode = one_hot_encode
        # try to load model
        self._load_model()

    def _load_metadata(self):
        """Load metadata from disk."""
        if os.path.exists(self.meta_path):
            with open(self.meta_path, 'r') as meta_file:
                model_data = json.load(meta_file)
        else:
            model_data = {}
        self.target = model_data.get('target', None)
        self.features = model_data.get('features', {})
        self.labels = model_data.get('labels', {})
        self.one_hot_encode = model_data.get('one_hot_encode', True)
        self.ordered_column_list = model_data.get('ordered_column_list', [])

    def _save_metadata(self):
        """Save metadata to disk."""
        model_data = {'features': self.features,
                      'target': self.target,
                      'labels': self.labels,
                      'one_hot_encode': self.one_hot_encode,
                      'ordered_column_list': self.ordered_column_list}
        with open(self.meta_path, 'w') as meta_file:
            json.dump(model_data, meta_file)

    def _load_model(self):
        """Load model from disk."""
        if os.path.exists(self.model_path):
            self.model = joblib.load(self.model_path)
        else:
            self.model = None

    def _save_model(self, compress_level=3):
        """Save model to disk."""
        if self.model is not None:
            joblib.dump(self.model, self.model_path, compress=compress_level)

    def save(self, compress_level=3):
        """Save model and metadata to disk."""
        self._save_metadata()
        self._save_model(compress_level=compress_level)

    def create_train_df(self, df) -> pd.DataFrame:
        """Create a training data frame from the input data frame."""
        return self.pre_processing(df, train_mode=True)

    def create_test_df(self, df) -> pd.DataFrame:
        """Create a test data frame from the input data frame."""
        return self.pre_processing(df, train_mode=False)

    def predict(self, df) -> pd.Series:
        """Pre-process the input, predict the target and back-transform the result."""
        if self.model is None:
            raise ValueError('No model is available for prediction')
        df = self.pre_processing(df)
        prediction = self.model.predict(df)
        prediction = self._post_processing(prediction)
        return prediction

    def pre_processing(self, df, train_mode=False, silent_mode=False) -> pd.DataFrame:
        """
        Pre-process the input data frame.

        Input:
            df: pd.DataFrame, the input data frame
            train_mode: bool, whether the data frame is for training
            silent_mode: bool, whether to suppress warnings
        """
        # check the columns against the features and targets
        if not self.features:
            raise ValueError('No features defined')
        if train_mode:
            missing_columns = [x for x in self.features.keys() if x not in df]
            if missing_columns:
                raise KeyError('Missing feature columns: ' + ', '.join(missing_columns))
        df = df[[x for x in self.features.keys() if x in df]].copy()
        # handle categorical features
        categorical_features = [col for col, transformations in self.features.items() if 'C' in transformations]
        if categorical_features:
            output = self._preprocess_categoricals(categorical_features, df, silent_mode, train_mode)
            if self.one_hot_encode:
                df = pd.concat(output, axis=1)
        self._transform_columns(df)
        col_list = self._order_columns(df, train_mode)
        self._fill_columns(categorical_features, col_list, df)
        # set the columns in order
        df = df[col_list]
        return df

    def _order_columns(self, df, train_mode):
        """Order the columns of the data frame."""
        if train_mode:
            col_list = sorted([x for x in df.columns if x not in self.target])
            self.ordered_column_list = col_list
        else:
            col_list = self.ordered_column_list
        return col_list

    def _transform_columns(self, df):
        """Apply transformations to the columns of the data frame."""
        for column, transforms in self.features.items():
            if not transforms:
                continue
            for transform in transforms:
                if transform == 'log':
                    df[column] = np.log(df[column])
                elif transform != 'C':
                    raise KeyError('Unknown transform option: ' + transform)

    def _fill_columns(self, categorical_features, col_list, df):
        """Fill in any expected columns that are missing from the data frame."""
        for col in col_list:
            if col not in df:
                if self.one_hot_encode and \
                        any(1 for feature_col in categorical_features if col.startswith(feature_col)):
                    # one-hot encoded features should be set to 0
                    df[col] = 0
                elif col in categorical_features:
                    # categorical features that are not one-hot encoded should be set to -1
                    df[col] = -1
                else:
                    df[col] = 0.0

    def _preprocess_categoricals(self, categorical_features, df, silent_mode, train_mode):
        """Pre-process the categorical features of the data frame."""
        if train_mode:
            # reset labels
            self.labels = {}
        output = [df]
        for col in categorical_features:
            if train_mode:
                # refresh label encoding for this column
                self.labels[col] = list(df[col].value_counts().index)  # labels sorted by frequency
                if not silent_mode and len(self.labels[col]) <= 2:
                    print('Column ' + col + ' has fewer unique values than expected for a categorical ' +
                          'feature (' + ', '.join([str(x) for x in self.labels[col]]) + ')')
            elif col not in df:
                # skip it if we're not training and it's missing
                if not silent_mode:
                    print('Missing categorical feature: ' + col)
                continue
            # apply label encoding; unseen categories map to one past the last known label
            lookup = {x: i for i, x in enumerate(self.labels[col])}
            missing = len(lookup)
            df[col] = df[col].apply(lambda x: lookup.get(x, missing))
            if self.one_hot_encode:
                # apply one-hot encoding
                output.append(pd.get_dummies(df[col], prefix=col))
                del df[col]
        return output

    def _post_processing(self, ser) -> pd.Series:
        """
        Post-process the prediction series by undoing the target transforms.

        Input:
            ser: pd.Series, the prediction series
        """
        if not self.target:
            return ser
        for col, transforms in self.target.items():
            for transform in transforms:
                ser = self._back_transformation(transform, ser)
        return ser

    @staticmethod
    def _back_transformation(transform, input_ser) -> pd.Series:
        """
        Apply the inverse of a transformation to the input series.

        Input:
            transform: str, the transformation to invert
            input_ser: pd.Series, the input series
        """
        if transform == 'log':
            return np.exp(input_ser)
        # transforms without an inverse pass through unchanged
        return input_ser

    def delete(self):
        """Delete the model and metadata files."""
        rm_file_or_dir(self.meta_path)
        rm_file_or_dir(self.model_path)