
Add the possibility to use regression algorithms
asavinov committed Dec 15, 2024
1 parent 7c50eeb commit 66837d5
Showing 4 changed files with 35 additions and 12 deletions.
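
The change is driven by a new is_regression flag that the SVM training and prediction functions in common/classifiers.py read from the "train" section of the model configuration (defaulting to False). Below is a minimal sketch of a model entry enabling the regression path; only the "params" and "train" keys are actually read by the code changed here, the surrounding layout is an assumption for illustration.

# Hypothetical model entry; only "params" and "train" appear in this diff.
model_config = {
    "params": {                  # passed as keyword arguments to SVR(**args) or SVC(**args)
        "C": 1.0,
        "kernel": "rbf",
    },
    "train": {
        "is_scale": True,        # existing option: scale features before fitting
        "is_regression": True,   # new option: train an SVR regressor instead of an SVC classifier
    },
}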
19 changes: 14 additions & 5 deletions common/classifiers.py
@@ -12,7 +12,7 @@
 from sklearn.metrics import f1_score

 from sklearn.linear_model import LogisticRegression, SGDClassifier
-from sklearn.svm import SVC
+from sklearn.svm import SVC, SVR

 import lightgbm as lgbm

@@ -425,6 +425,7 @@ def train_svc(df_X, df_y, model_config: dict):
     Train model with the specified hyper-parameters and return this model (and scaler if any).
     """
     is_scale = model_config.get("train", {}).get("is_scale", True)
+    is_regression = model_config.get("train", {}).get("is_regression", False)

     #
     # Prepare data
@@ -443,8 +443,11 @@
     # Create model
     #
     args = model_config.get("params").copy()
-    args['probability'] = True  # Required if we are going to use predict_proba()
-    model = SVC(**args)
+    if is_regression:
+        model = SVR(**args)
+    else:
+        args['probability'] = True  # Required if we are going to use predict_proba()
+        model = SVC(**args)

     #
     # Train
@@ -459,6 +463,8 @@ def predict_svc(models: tuple, df_X_test, model_config: dict):
     Use the model(s) to make predictions for the test data.
     The first model is a prediction model and the second model (optional) is a scaler.
     """
+    is_regression = model_config.get("train", {}).get("is_regression", False)
+
     #
     # Double column set if required
     #
@@ -482,8 +488,11 @@ def predict_svc(models: tuple, df_X_test, model_config: dict):
     df_X_test_nonans = df_X_test.dropna()  # Drop nans, possibly create gaps in index
     nonans_index = df_X_test_nonans.index

-    y_test_hat_nonans = models[0].predict_proba(df_X_test_nonans.values)  # It returns pairs or probas for 0 and 1
-    y_test_hat_nonans = y_test_hat_nonans[:, 1]  # Or y_test_hat.flatten()
+    if is_regression:
+        y_test_hat_nonans = models[0].predict(df_X_test_nonans.values)
+    else:
+        y_test_hat_nonans = models[0].predict_proba(df_X_test_nonans.values)  # It returns pairs or probas for 0 and 1
+        y_test_hat_nonans = y_test_hat_nonans[:, 1]  # Or y_test_hat.flatten()
     y_test_hat_nonans = pd.Series(data=y_test_hat_nonans, index=nonans_index)  # Attach indexes with gaps

     df_ret = pd.DataFrame(index=input_index)  # Create empty dataframe with original index
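Taken together, the changes let the same pair of functions serve classification and regression. Below is a rough usage sketch, under two assumptions that are not shown in this diff: train_svc returns the fitted model together with an optional scaler as a tuple (implied by the models[0] access in predict_svc), and predict_svc returns predictions aligned with the input index.

import pandas as pd
from common.classifiers import train_svc, predict_svc

# Toy data; in the project these frames come from the feature pipeline.
df_X = pd.DataFrame({"f1": [0.1, 0.2, 0.3, 0.4, 0.5], "f2": [1.0, 0.9, 0.8, 0.7, 0.6]})
df_y = pd.Series([0.15, 0.25, 0.35, 0.45, 0.55])  # continuous target -> regression

model_config = {
    "params": {"C": 1.0, "kernel": "rbf"},
    "train": {"is_scale": True, "is_regression": True},
}

models = train_svc(df_X, df_y, model_config)        # assumed to return (model, scaler)
df_y_hat = predict_svc(models, df_X, model_config)  # with is_regression=True this uses predict(), not predict_proba()
print(df_y_hat)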
14 changes: 12 additions & 2 deletions common/generators.py
@@ -161,7 +161,12 @@ def predict_feature_set(df, fs, config, models: dict):

         # For each new score, compare it with the label true values
         if label in df:
-            scores[score_column_name] = compute_scores(df[label], df_y_hat)
+            df_y = df[label]
+            if df_y.dtype == "float64" and df_y_hat.dtype == "float64":
+                # TODO Regression scores
+                scores[score_column_name] = dict(rmse=0.0, mae=0.0, mse=0.0, mape=0.0, r2=0.0)
+            else:
+                scores[score_column_name] = compute_scores(df_y, df_y_hat)  # Classification scores

     return out_df, features, scores

@@ -226,9 +231,14 @@ def train_feature_set(df, fs, config):
             print(f"ERROR: Unknown algorithm type {algo_type}. Check algorithm list.")
             return

-        scores[score_column_name] = compute_scores(df_y, df_y_hat)
         out_df[score_column_name] = df_y_hat

+        if df_y.dtype == "float64" and df_y_hat.dtype == "float64":
+            # TODO Regression scores
+            scores[score_column_name] = dict(rmse=0.0, mae=0.0, mse=0.0, mape=0.0, r2=0.0)
+        else:
+            scores[score_column_name] = compute_scores(df_y, df_y_hat)  # Classification scores
+
     return out_df, models, scores


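The regression branch above only stores placeholder zeros behind a TODO. As a sketch of how those metrics could later be computed (this helper is not part of the commit and its name is hypothetical), standard scikit-learn metrics cover all five keys:

import numpy as np
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    mean_absolute_percentage_error,  # available in scikit-learn 0.24+
    r2_score,
)

def compute_scores_regression(y_true, y_hat):
    # Mirrors the keys of the placeholder dict used in the regression branch above
    mse = mean_squared_error(y_true, y_hat)
    return dict(
        rmse=float(np.sqrt(mse)),
        mae=float(mean_absolute_error(y_true, y_hat)),
        mse=float(mse),
        mape=float(mean_absolute_percentage_error(y_true, y_hat)),
        r2=float(r2_score(y_true, y_hat)),
    )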
10 changes: 7 additions & 3 deletions scripts/predict_rolling.py
@@ -134,8 +134,8 @@ def main(config_file):
     df = df[out_columns + [x for x in all_features if x not in out_columns]]

     for label in labels:
-        # "category" NN does not work without this (note that we assume a classification task here)
-        df[label] = df[label].astype(int)
+        if np.issubdtype(df[label].dtype, bool):
+            df[label] = df[label].astype(int)  # For classification tasks we want to use integers

     df.replace([np.inf, -np.inf], np.nan, inplace=True)
     #in_df = in_df.dropna(subset=labels)
@@ -306,7 +306,11 @@ def main(config_file):

         print(f"Using {len(df_scores)} non-nan rows for scoring.")

-        score = compute_scores(y_true, y_predicted)
+        if y_true.dtype == "float64" and y_predicted.dtype == "float64":
+            # TODO Regression scores
+            score = dict(rmse=0.0, mae=0.0, mse=0.0, mape=0.0, r2=0.0)
+        else:
+            score = compute_scores(y_true, y_predicted)  # Classification scores

         score_lines.append(f"{score_column_name}, {score.get('auc'):.3f}, {score.get('ap'):.3f}, {score.get('f1'):.3f}, {score.get('precision'):.3f}, {score.get('recall'):.3f}")

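Note that the score_lines entry above still formats classification metrics only (auc, ap, f1, precision, recall), which the regression placeholder dict does not provide. If real regression scores are added later, the reporting could branch on the available keys; a hypothetical helper, not part of this commit:

def format_score_line(score_column_name: str, score: dict) -> str:
    # Regression scores (once the TODO above is implemented)
    if "rmse" in score:
        return (f"{score_column_name}, rmse={score['rmse']:.3f}, "
                f"mae={score['mae']:.3f}, r2={score['r2']:.3f}")
    # Classification scores, matching the existing score_lines format
    return (f"{score_column_name}, {score['auc']:.3f}, {score['ap']:.3f}, "
            f"{score['f1']:.3f}, {score['precision']:.3f}, {score['recall']:.3f}")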
4 changes: 2 additions & 2 deletions scripts/train.py
@@ -80,8 +80,8 @@ def main(config_file):
     df = df[out_columns + [x for x in all_features if x not in out_columns]]

     for label in labels:
-        # "category" NN does not work without this (note that we assume a classification task here)
-        df[label] = df[label].astype(int)
+        if np.issubdtype(df[label].dtype, bool):
+            df[label] = df[label].astype(int)  # For classification tasks we want to use integers

     # Remove the tail data for which no (correct) labels are available
     # The reason is that these labels are computed from future values which are not available and hence labels might be wrong
