
Add the possibility to use regression algorithms
asavinov committed Dec 15, 2024
1 parent 7c50eeb commit 66837d5
Showing 4 changed files with 35 additions and 12 deletions.
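
The change is driven by a new is_regression flag that the SVM training and prediction functions in common/classifiers.py read from the "train" section of the model configuration (defaulting to False). Below is a minimal sketch of a model entry enabling the regression path; only the "params" and "train" keys are actually read by the code changed here, the surrounding layout is an assumption for illustration.

# Hypothetical model entry; only "params" and "train" appear in this diff.
model_config = {
    "params": {                  # passed as keyword arguments to SVR(**args) or SVC(**args)
        "C": 1.0,
        "kernel": "rbf",
    },
    "train": {
        "is_scale": True,        # existing option: scale features before fitting
        "is_regression": True,   # new option: train an SVR regressor instead of an SVC classifier
    },
}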
19 changes: 14 additions & 5 deletions common/classifiers.py
@@ -12,7 +12,7 @@
 from sklearn.metrics import f1_score

 from sklearn.linear_model import LogisticRegression, SGDClassifier
-from sklearn.svm import SVC
+from sklearn.svm import SVC, SVR

 import lightgbm as lgbm

@@ -425,6 +425,7 @@ def train_svc(df_X, df_y, model_config: dict):
     Train model with the specified hyper-parameters and return this model (and scaler if any).
     """
     is_scale = model_config.get("train", {}).get("is_scale", True)
+    is_regression = model_config.get("train", {}).get("is_regression", False)

     #
     # Prepare data
@@ -443,8 +443,11 @@
     # Create model
     #
     args = model_config.get("params").copy()
-    args['probability'] = True  # Required if we are going to use predict_proba()
-    model = SVC(**args)
+    if is_regression:
+        model = SVR(**args)
+    else:
+        args['probability'] = True  # Required if we are going to use predict_proba()
+        model = SVC(**args)

     #
     # Train
@@ -459,6 +463,8 @@ def predict_svc(models: tuple, df_X_test, model_config: dict):
     Use the model(s) to make predictions for the test data.
     The first model is a prediction model and the second model (optional) is a scaler.
     """
+    is_regression = model_config.get("train", {}).get("is_regression", False)
+
     #
     # Double column set if required
     #
@@ -482,8 +488,11 @@ def predict_svc(models: tuple, df_X_test, model_config: dict):
     df_X_test_nonans = df_X_test.dropna()  # Drop nans, possibly create gaps in index
     nonans_index = df_X_test_nonans.index

-    y_test_hat_nonans = models[0].predict_proba(df_X_test_nonans.values)  # It returns pairs or probas for 0 and 1
-    y_test_hat_nonans = y_test_hat_nonans[:, 1]  # Or y_test_hat.flatten()
+    if is_regression:
+        y_test_hat_nonans = models[0].predict(df_X_test_nonans.values)
+    else:
+        y_test_hat_nonans = models[0].predict_proba(df_X_test_nonans.values)  # It returns pairs or probas for 0 and 1
+        y_test_hat_nonans = y_test_hat_nonans[:, 1]  # Or y_test_hat.flatten()
     y_test_hat_nonans = pd.Series(data=y_test_hat_nonans, index=nonans_index)  # Attach indexes with gaps

     df_ret = pd.DataFrame(index=input_index)  # Create empty dataframe with original index
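Taken together, the changes let the same pair of functions serve classification and regression. Below is a rough usage sketch, under two assumptions that are not shown in this diff: train_svc returns the fitted model together with an optional scaler as a tuple (implied by the models[0] access in predict_svc), and predict_svc returns predictions aligned with the input index.

import pandas as pd
from common.classifiers import train_svc, predict_svc

# Toy data; in the project these frames come from the feature pipeline.
df_X = pd.DataFrame({"f1": [0.1, 0.2, 0.3, 0.4, 0.5], "f2": [1.0, 0.9, 0.8, 0.7, 0.6]})
df_y = pd.Series([0.15, 0.25, 0.35, 0.45, 0.55])  # continuous target -> regression

model_config = {
    "params": {"C": 1.0, "kernel": "rbf"},
    "train": {"is_scale": True, "is_regression": True},
}

models = train_svc(df_X, df_y, model_config)        # assumed to return (model, scaler)
df_y_hat = predict_svc(models, df_X, model_config)  # with is_regression=True this uses predict(), not predict_proba()
print(df_y_hat)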
14 changes: 12 additions & 2 deletions common/generators.py
@@ -161,7 +161,12 @@ def predict_feature_set(df, fs, config, models: dict):

         # For each new score, compare it with the label true values
         if label in df:
-            scores[score_column_name] = compute_scores(df[label], df_y_hat)
+            df_y = df[label]
+            if df_y.dtype == "float64" and df_y_hat.dtype == "float64":
+                # TODO Regression scores
+                scores[score_column_name] = dict(rmse=0.0, mae=0.0, mse=0.0, mape=0.0, r2=0.0)
+            else:
+                scores[score_column_name] = compute_scores(df_y, df_y_hat)  # Classification scores

     return out_df, features, scores

@@ -226,9 +231,14 @@ def train_feature_set(df, fs, config):
             print(f"ERROR: Unknown algorithm type {algo_type}. Check algorithm list.")
             return

-        scores[score_column_name] = compute_scores(df_y, df_y_hat)
         out_df[score_column_name] = df_y_hat

+        if df_y.dtype == "float64" and df_y_hat.dtype == "float64":
+            # TODO Regression scores
+            scores[score_column_name] = dict(rmse=0.0, mae=0.0, mse=0.0, mape=0.0, r2=0.0)
+        else:
+            scores[score_column_name] = compute_scores(df_y, df_y_hat)  # Classification scores
+
     return out_df, models, scores


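The regression branch above only stores placeholder zeros behind a TODO. As a sketch of how those metrics could later be computed (this helper is not part of the commit and its name is hypothetical), standard scikit-learn metrics cover all five keys:

import numpy as np
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    mean_absolute_percentage_error,  # available in scikit-learn 0.24+
    r2_score,
)

def compute_scores_regression(y_true, y_hat):
    # Mirrors the keys of the placeholder dict used in the regression branch above
    mse = mean_squared_error(y_true, y_hat)
    return dict(
        rmse=float(np.sqrt(mse)),
        mae=float(mean_absolute_error(y_true, y_hat)),
        mse=float(mse),
        mape=float(mean_absolute_percentage_error(y_true, y_hat)),
        r2=float(r2_score(y_true, y_hat)),
    )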
10 changes: 7 additions & 3 deletions scripts/predict_rolling.py
@@ -134,8 +134,8 @@ def main(config_file):
     df = df[out_columns + [x for x in all_features if x not in out_columns]]

     for label in labels:
-        # "category" NN does not work without this (note that we assume a classification task here)
-        df[label] = df[label].astype(int)
+        if np.issubdtype(df[label].dtype, bool):
+            df[label] = df[label].astype(int)  # For classification tasks we want to use integers

     df.replace([np.inf, -np.inf], np.nan, inplace=True)
     #in_df = in_df.dropna(subset=labels)
@@ -306,7 +306,11 @@ def main(config_file):

         print(f"Using {len(df_scores)} non-nan rows for scoring.")

-        score = compute_scores(y_true, y_predicted)
+        if y_true.dtype == "float64" and y_predicted.dtype == "float64":
+            # TODO Regression scores
+            score = dict(rmse=0.0, mae=0.0, mse=0.0, mape=0.0, r2=0.0)
+        else:
+            score = compute_scores(y_true, y_predicted)  # Classification scores

         score_lines.append(f"{score_column_name}, {score.get('auc'):.3f}, {score.get('ap'):.3f}, {score.get('f1'):.3f}, {score.get('precision'):.3f}, {score.get('recall'):.3f}")

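Note that the score_lines entry above still formats classification metrics only (auc, ap, f1, precision, recall), which the regression placeholder dict does not provide. If real regression scores are added later, the reporting could branch on the available keys; a hypothetical helper, not part of this commit:

def format_score_line(score_column_name: str, score: dict) -> str:
    # Regression scores (once the TODO above is implemented)
    if "rmse" in score:
        return (f"{score_column_name}, rmse={score['rmse']:.3f}, "
                f"mae={score['mae']:.3f}, r2={score['r2']:.3f}")
    # Classification scores, matching the existing score_lines format
    return (f"{score_column_name}, {score['auc']:.3f}, {score['ap']:.3f}, "
            f"{score['f1']:.3f}, {score['precision']:.3f}, {score['recall']:.3f}")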
4 changes: 2 additions & 2 deletions scripts/train.py
@@ -80,8 +80,8 @@ def main(config_file):
     df = df[out_columns + [x for x in all_features if x not in out_columns]]

     for label in labels:
-        # "category" NN does not work without this (note that we assume a classification task here)
-        df[label] = df[label].astype(int)
+        if np.issubdtype(df[label].dtype, bool):
+            df[label] = df[label].astype(int)  # For classification tasks we want to use integers

     # Remove the tail data for which no (correct) labels are available
     # The reason is that these labels are computed from future values which are not available and hence labels might be wrong
