vishalbelsare · pull · Oct 23, 2022 · Oct 23, 2022 · Oct 23, 2022 · Oct 23, 2022
diff --git a/README.md b/README.md
@@ -19,7 +19,7 @@ If you want to evaluate our baselines, too, please install with
 ```bash
 pip install tabpfn[baselines]
 ```
-To run the autogluon baseline please create a separate environment and install autogluon==0.4.0, installation in the same environment as our other baselines is not possible.
+To run the autogluon and autosklearn baselines, please create a separate environment and install autosklearn / autogluon==0.4.0; installation in the same environment as our other baselines is not possible.
 
 ## Getting started
 

diff --git a/requirements.txt b/requirements.txt
@@ -9,7 +9,7 @@ tqdm==4.62.1
 numpy==1.21.2
 openml==0.12.2
 catboost==0.26.1
-auto-sklearn==0.14.5
+# auto-sklearn==0.14.5
 hyperopt==0.2.5
 configspace==0.4.21
-# autogluon==0.4.0
+# autogluon==0.4.0
diff --git a/tabpfn/priors/utils.py b/tabpfn/priors/utils.py
@@ -23,7 +23,7 @@ def __init__(self, num_steps, **get_batch_kwargs):
             # The stuff outside the or is set as class attribute before instantiation.
             self.num_features = get_batch_kwargs.get('num_features') or self.num_features
             self.epoch_count = 0
-            print('DataLoader.__dict__', self.__dict__)
+            #print('DataLoader.__dict__', self.__dict__)
 
         @staticmethod
         def gbm(*args, eval_pos_seq_len_sampler, **kwargs):

diff --git a/tabpfn/scripts/model_builder.py b/tabpfn/scripts/model_builder.py
@@ -42,7 +42,7 @@ def get_gpu_memory():
 
 def load_model(path, filename, device, eval_positions, verbose):
     # TODO: This function only restores evaluation functionality but training canät be continued. It is also not flexible.
-    print('Loading....')
+    # print('Loading....')
     model_state, optimizer_state, config_sample = torch.load(
         os.path.join(path, filename), map_location='cpu')
     if ('differentiable_hyperparameters' in config_sample
@@ -238,7 +238,7 @@ def new_get_batch(batch_size, seq_len, num_features, hyperparameters
     config['eval_positions'] = [int(config['bptt'] * 0.95)] if config['bptt_extra_samples'] is None else [int(config['bptt'])]
 
     epochs = 0 if not should_train else config['epochs']
-    print('MODEL BUILDER', model_proto, extra_kwargs['get_batch'])
+    #print('MODEL BUILDER', model_proto, extra_kwargs['get_batch'])
     model = train(model_proto.DataLoader
                   , loss
                   , encoder
@@ -274,4 +274,4 @@ def new_get_batch(batch_size, seq_len, num_features, hyperparameters
                   , verbose=verbose_train,
                   weight_decay=config.get('weight_decay', 0.0))
 
-    return model
+    return model
diff --git a/tabpfn/scripts/tabular_baselines.py b/tabpfn/scripts/tabular_baselines.py
@@ -39,10 +39,8 @@
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.preprocessing import MinMaxScaler
 
-import autosklearn.classification
-
 CV = 5
-MULTITHREAD = 1 # Number of threads baselines are able to use at most
+MULTITHREAD = -1 # Number of threads baselines are able to use at most
 param_grid, param_grid_hyperopt = {}, {}
 
 def get_scoring_direction(metric_used):
@@ -77,6 +75,7 @@ def get_scoring_string(metric_used, multiclass=True, usage="sklearn_cv"):
         elif usage == 'tabnet':
             return 'logloss' if multiclass else 'auc'
         elif usage == 'autosklearn':
+            import autosklearn.classification
             if multiclass:
                 return autosklearn.metrics.log_loss # roc_auc only works for binary, use logloss instead
             else:
@@ -99,12 +98,14 @@ def get_scoring_string(metric_used, multiclass=True, usage="sklearn_cv"):
         elif usage == 'tabnet':
             return 'logloss'
         elif usage == 'autosklearn':
+            import autosklearn.classification
             return autosklearn.metrics.log_loss
         elif usage == 'catboost':
             return 'MultiClass' # Effectively LogLoss
         return 'logloss'
     elif metric_used.__name__ == tabular_metrics.r2_metric.__name__:
         if usage == 'autosklearn':
+            import autosklearn.classification
             return autosklearn.metrics.r2
         elif usage == 'sklearn_cv':
             return 'r2' # tabular_metrics.neg_r2
@@ -118,6 +119,7 @@ def get_scoring_string(metric_used, multiclass=True, usage="sklearn_cv"):
             return 'r2'
     elif metric_used.__name__ == tabular_metrics.root_mean_squared_error_metric.__name__:
         if usage == 'autosklearn':
+            import autosklearn.classification
             return autosklearn.metrics.root_mean_squared_error
         elif usage == 'sklearn_cv':
             return 'neg_root_mean_squared_error' # tabular_metrics.neg_r2
@@ -131,6 +133,7 @@ def get_scoring_string(metric_used, multiclass=True, usage="sklearn_cv"):
             return 'neg_root_mean_squared_error'
     elif metric_used.__name__ == tabular_metrics.mean_absolute_error_metric.__name__:
         if usage == 'autosklearn':
+            import autosklearn.classification
             return autosklearn.metrics.mean_absolute_error
         elif usage == 'sklearn_cv':
             return 'neg_mean_absolute_error' # tabular_metrics.neg_r2
@@ -919,7 +922,7 @@ def stop(trial):
         fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time),
         space=param_grid_hyperopt['ridge'],
         algo=rand.suggest,
-        rstate=np.random.default_rng(int(y[:].sum()) % 10000),
+        rstate=np.random.RandomState(int(y[:].sum()) % 10000),
         early_stop_fn=stop,
         # The seed is deterministic but varies for each dataset and each split of it
         max_evals=10000)
@@ -1001,7 +1004,7 @@ def stop(trial):
         fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time),
         space=param_grid_hyperopt['lightgbm'],
         algo=rand.suggest,
-        rstate=np.random.default_rng(int(y[:].sum()) % 10000),
+        rstate=np.random.RandomState(int(y[:].sum()) % 10000),
         early_stop_fn=stop,
         # The seed is deterministic but varies for each dataset and each split of it
         max_evals=10000)
@@ -1028,7 +1031,7 @@ def logistic_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=30
                                              , cat_features=cat_features)
 
     def clf_(**params):
-        return LogisticRegression(solver='saga', tol=1e-4, n_jobs=1, **params)
+        return LogisticRegression(solver='saga', tol=1e-4, n_jobs=MULTITHREAD, **params)
 
     start_time = time.time()
 
@@ -1039,7 +1042,7 @@ def stop(trial):
         fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time),
         space=param_grid_hyperopt['logistic'],
         algo=rand.suggest,
-        rstate=np.random.default_rng(int(y[:].sum()) % 10000),
+        rstate=np.random.RandomState(int(y[:].sum()) % 10000),
         early_stop_fn=stop,
         # The seed is deterministic but varies for each dataset and each split of it
         max_evals=1000)
@@ -1053,6 +1056,52 @@ def stop(trial):
 
     return metric, pred, best
 
+
+## Random Forest
+# Search space from
+# https://www.kaggle.com/code/emanueleamcappella/random-forest-hyperparameters-tuning/notebook
+param_grid_hyperopt['random_forest'] = {'n_estimators': hp.randint('n_estimators', 20, 200),
+               'max_features': hp.choice('max_features', ['auto', 'sqrt']),  # NOTE(review): 'auto' is deprecated in newer scikit-learn releases - confirm against the pinned version
+               'max_depth': hp.randint('max_depth', 1, 45),
+               'min_samples_split': hp.choice('min_samples_split', [5, 10])}
+def random_forest_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300):
+    """Tune a random forest via hyperopt random search, refit it on (x, y) and score it on the test split.
+
+    Returns (metric, pred, best): the test metric, the test predictions (class probabilities when
+    metric_used is a classification metric) and the selected hyperparameters. max_time is in seconds.
+    """
+    from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+    x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y,
+                                             one_hot=False, impute=True, standardize=False,
+                                             cat_features=cat_features)
+
+    def clf_(**params):
+        if is_classification(metric_used):
+            return RandomForestClassifier(n_jobs=MULTITHREAD, **params)
+        return RandomForestRegressor(n_jobs=MULTITHREAD, **params)
+
+    start_time = time.time()
+    def stop(trial):
+        return time.time() - start_time > max_time, []
+    best = fmin(
+        fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time),
+        space=param_grid_hyperopt['random_forest'],
+        algo=rand.suggest,
+        rstate=np.random.RandomState(int(y[:].sum()) % 10000),
+        early_stop_fn=stop,
+        # The seed is deterministic but varies for each dataset and each split of it
+        max_evals=10000)
+    best = space_eval(param_grid_hyperopt['random_forest'], best)
+    # Refit on the full training split with the best configuration found by the search.
+    clf = clf_(**best)
+    clf.fit(x, y)
+    if is_classification(metric_used):
+        pred = clf.predict_proba(test_x)
+    else:
+        pred = clf.predict(test_x)
+    metric = metric_used(test_y, pred)
+    return metric, pred, best
+
 ## KNN
 param_grid_hyperopt['knn'] = {'n_neighbors': hp.randint('n_neighbors', 1,16)
                               }
@@ -1063,8 +1112,8 @@ def knn_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300):
 
     def clf_(**params):
         if is_classification(metric_used):
-            return neighbors.KNeighborsClassifier(n_jobs=1, **params)
-        return neighbors.KNeighborsRegressor(n_jobs=1, **params)
+            return neighbors.KNeighborsClassifier(n_jobs=MULTITHREAD, **params)
+        return neighbors.KNeighborsRegressor(n_jobs=MULTITHREAD, **params)
 
     start_time = time.time()
 
@@ -1075,7 +1124,7 @@ def stop(trial):
         fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time),
         space=param_grid_hyperopt['knn'],
         algo=rand.suggest,
-        rstate=np.random.default_rng(int(y[:].sum()) % 10000),
+        rstate=np.random.RandomState(int(y[:].sum()) % 10000),
         early_stop_fn=stop,
         # The seed is deterministic but varies for each dataset and each split of it
         max_evals=200)
@@ -1117,7 +1166,7 @@ def stop(trial):
         fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time),
         space=param_grid_hyperopt['gp'],
         algo=rand.suggest,
-        rstate=np.random.default_rng(int(y[:].sum()) % 10000),
+        rstate=np.random.RandomState(int(y[:].sum()) % 10000),
         early_stop_fn=stop,
         # The seed is deterministic but varies for each dataset and each split of it
         max_evals=1000)
@@ -1192,7 +1241,7 @@ def stop(trial):
         fn=lambda params: tabnet_eval_f(params, clf_, x, y, metric_used, start_time, max_time),
         space=param_grid_hyperopt['tabnet'],
         algo=rand.suggest,
-        rstate=np.random.default_rng(int(y[:].sum()) % 10000),
+        rstate=np.random.RandomState(int(y[:].sum()) % 10000),
         early_stop_fn=stop,
         max_evals=1000)
     best = space_eval(param_grid_hyperopt['tabnet'], best)
@@ -1270,7 +1319,7 @@ def stop(trial):
         fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time),
         space=param_grid_hyperopt['catboost'],
         algo=rand.suggest,
-        rstate=np.random.default_rng(int(y[:].sum()) % 10000),
+        rstate=np.random.RandomState(int(y[:].sum()) % 10000),
         early_stop_fn=stop,
         # The seed is deterministic but varies for each dataset and each split of it
         max_evals=1000)
@@ -1336,7 +1385,7 @@ def stop(trial):
         fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time),
         space=param_grid_hyperopt['xgb'],
         algo=rand.suggest,
-        rstate=np.random.default_rng(int(y[:].sum()) % 10000),
+        rstate=np.random.RandomState(int(y[:].sum()) % 10000),
         early_stop_fn=stop,
         # The seed is deterministic but varies for each dataset and each split of it
         max_evals=1000)
@@ -1370,7 +1419,7 @@ def warn(*args, **kwargs):
     x, y, test_x, test_y = x.cpu(), y.cpu(), test_x.cpu(), test_y.cpu()
     x, test_x = torch.nan_to_num(x), torch.nan_to_num(test_x)
 
-    clf = RidgeClassifier(n_jobs=1)
+    clf = RidgeClassifier(n_jobs=MULTITHREAD)
 
     # create a dictionary of all values we want to test for n_neighbors
     # use gridsearch to test all values for n_neighbors
@@ -1430,6 +1479,7 @@ def get_model():
 
 clf_dict = {'gp': gp_metric
 , 'transformer': transformer_metric
+, 'random_forest': random_forest_metric
                 , 'knn': knn_metric
                 , 'catboost': catboost_metric
                 , 'tabnet': tabnet_metric

diff --git a/tabpfn/scripts/transformer_prediction_interface.py b/tabpfn/scripts/transformer_prediction_interface.py
@@ -87,7 +87,7 @@ def check_file(e):
         raise Exception('No checkpoint found at '+str(model_path))
 
 
-    print(f'Loading {model_file}')
+    #print(f'Loading {model_file}')
 
     model, c = load_model(base_path, model_file, device, eval_positions=[], verbose=False)
 

diff --git a/tabpfn/train.py b/tabpfn/train.py
@@ -51,8 +51,9 @@ def eval_pos_seq_len_sampler():
     dl = priordataloader_class(num_steps=steps_per_epoch, batch_size=batch_size, eval_pos_seq_len_sampler=eval_pos_seq_len_sampler, seq_len_maximum=bptt+(bptt_extra_samples if bptt_extra_samples else 0), device=device, **extra_prior_kwargs_dict)
 
     encoder = encoder_generator(dl.num_features, emsize)
-    style_def = dl.get_test_batch()[0][0] # the style in batch of the form ((style, x, y), target, single_eval_pos)
-    print(f'Style definition of first 3 examples: {style_def[:3] if style_def is not None else None}')
+    #style_def = dl.get_test_batch()[0][0] # the style in batch of the form ((style, x, y), target, single_eval_pos)
+    style_def = None
+    #print(f'Style definition of first 3 examples: {style_def[:3] if style_def is not None else None}')
     style_encoder = style_encoder_generator(style_def.shape[1], emsize) if (style_def is not None) else None
     if isinstance(criterion, nn.GaussianNLLLoss):
         n_out = 2

diff --git a/tabpfn/utils.py b/tabpfn/utils.py
@@ -236,7 +236,7 @@ def print(*args, **kwargs):
 
 
 def init_dist(device):
-    print('init dist')
+    #print('init dist')
     if 'LOCAL_RANK' in os.environ:
         # launched with torch.distributed.launch
         rank = int(os.environ["LOCAL_RANK"])
@@ -268,7 +268,7 @@ def init_dist(device):
 
         return True, rank, f'cuda:{rank}'
     else:
-        print('Not using distributed')
+        #print('Not using distributed')
         # will not change any of the behavior of print, but allows putting the force=True in the print calls
         print_on_master_only(True)
         return False, 0, device