Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[pull] main from automl:main #1

Merged
merged 10 commits into from
Oct 23, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ If you want to evaluate our baselines, too, please install with
```bash
pip install tabpfn[baselines]
```
To run the autogluon baseline please create a separate environment and install autogluon==0.4.0, installation in the same environment as our other baselines is not possible.
To run the autogluon and autosklearn baselines, please create a separate environment and install autosklearn / autogluon==0.4.0; installation in the same environment as our other baselines is not possible.

## Getting started

Expand Down
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ tqdm==4.62.1
numpy==1.21.2
openml==0.12.2
catboost==0.26.1
auto-sklearn==0.14.5
# auto-sklearn==0.14.5
hyperopt==0.2.5
configspace==0.4.21
# autogluon==0.4.0
# autogluon==0.4.0
2 changes: 1 addition & 1 deletion tabpfn/priors/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def __init__(self, num_steps, **get_batch_kwargs):
# The stuff outside the or is set as class attribute before instantiation.
self.num_features = get_batch_kwargs.get('num_features') or self.num_features
self.epoch_count = 0
print('DataLoader.__dict__', self.__dict__)
#print('DataLoader.__dict__', self.__dict__)

@staticmethod
def gbm(*args, eval_pos_seq_len_sampler, **kwargs):
Expand Down
6 changes: 3 additions & 3 deletions tabpfn/scripts/model_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def get_gpu_memory():

def load_model(path, filename, device, eval_positions, verbose):
# TODO: This function only restores evaluation functionality but training can't be continued. It is also not flexible.
print('Loading....')
# print('Loading....')
model_state, optimizer_state, config_sample = torch.load(
os.path.join(path, filename), map_location='cpu')
if ('differentiable_hyperparameters' in config_sample
Expand Down Expand Up @@ -238,7 +238,7 @@ def new_get_batch(batch_size, seq_len, num_features, hyperparameters
config['eval_positions'] = [int(config['bptt'] * 0.95)] if config['bptt_extra_samples'] is None else [int(config['bptt'])]

epochs = 0 if not should_train else config['epochs']
print('MODEL BUILDER', model_proto, extra_kwargs['get_batch'])
#print('MODEL BUILDER', model_proto, extra_kwargs['get_batch'])
model = train(model_proto.DataLoader
, loss
, encoder
Expand Down Expand Up @@ -274,4 +274,4 @@ def new_get_batch(batch_size, seq_len, num_features, hyperparameters
, verbose=verbose_train,
weight_decay=config.get('weight_decay', 0.0))

return model
return model
80 changes: 65 additions & 15 deletions tabpfn/scripts/tabular_baselines.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,8 @@
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

import autosklearn.classification

CV = 5
MULTITHREAD = 1 # Number of threads baselines are able to use at most
MULTITHREAD = -1 # Number of threads baselines are able to use at most
param_grid, param_grid_hyperopt = {}, {}

def get_scoring_direction(metric_used):
Expand Down Expand Up @@ -77,6 +75,7 @@ def get_scoring_string(metric_used, multiclass=True, usage="sklearn_cv"):
elif usage == 'tabnet':
return 'logloss' if multiclass else 'auc'
elif usage == 'autosklearn':
import autosklearn.classification
if multiclass:
return autosklearn.metrics.log_loss # roc_auc only works for binary, use logloss instead
else:
Expand All @@ -99,12 +98,14 @@ def get_scoring_string(metric_used, multiclass=True, usage="sklearn_cv"):
elif usage == 'tabnet':
return 'logloss'
elif usage == 'autosklearn':
import autosklearn.classification
return autosklearn.metrics.log_loss
elif usage == 'catboost':
return 'MultiClass' # Effectively LogLoss
return 'logloss'
elif metric_used.__name__ == tabular_metrics.r2_metric.__name__:
if usage == 'autosklearn':
import autosklearn.classification
return autosklearn.metrics.r2
elif usage == 'sklearn_cv':
return 'r2' # tabular_metrics.neg_r2
Expand All @@ -118,6 +119,7 @@ def get_scoring_string(metric_used, multiclass=True, usage="sklearn_cv"):
return 'r2'
elif metric_used.__name__ == tabular_metrics.root_mean_squared_error_metric.__name__:
if usage == 'autosklearn':
import autosklearn.classification
return autosklearn.metrics.root_mean_squared_error
elif usage == 'sklearn_cv':
return 'neg_root_mean_squared_error' # tabular_metrics.neg_r2
Expand All @@ -131,6 +133,7 @@ def get_scoring_string(metric_used, multiclass=True, usage="sklearn_cv"):
return 'neg_root_mean_squared_error'
elif metric_used.__name__ == tabular_metrics.mean_absolute_error_metric.__name__:
if usage == 'autosklearn':
import autosklearn.classification
return autosklearn.metrics.mean_absolute_error
elif usage == 'sklearn_cv':
return 'neg_mean_absolute_error' # tabular_metrics.neg_r2
Expand Down Expand Up @@ -919,7 +922,7 @@ def stop(trial):
fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time),
space=param_grid_hyperopt['ridge'],
algo=rand.suggest,
rstate=np.random.default_rng(int(y[:].sum()) % 10000),
rstate=np.random.RandomState(int(y[:].sum()) % 10000),
early_stop_fn=stop,
# The seed is deterministic but varies for each dataset and each split of it
max_evals=10000)
Expand Down Expand Up @@ -1001,7 +1004,7 @@ def stop(trial):
fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time),
space=param_grid_hyperopt['lightgbm'],
algo=rand.suggest,
rstate=np.random.default_rng(int(y[:].sum()) % 10000),
rstate=np.random.RandomState(int(y[:].sum()) % 10000),
early_stop_fn=stop,
# The seed is deterministic but varies for each dataset and each split of it
max_evals=10000)
Expand All @@ -1028,7 +1031,7 @@ def logistic_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=30
, cat_features=cat_features)

def clf_(**params):
return LogisticRegression(solver='saga', tol=1e-4, n_jobs=1, **params)
return LogisticRegression(solver='saga', tol=1e-4, n_jobs=MULTITHREAD, **params)

start_time = time.time()

Expand All @@ -1039,7 +1042,7 @@ def stop(trial):
fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time),
space=param_grid_hyperopt['logistic'],
algo=rand.suggest,
rstate=np.random.default_rng(int(y[:].sum()) % 10000),
rstate=np.random.RandomState(int(y[:].sum()) % 10000),
early_stop_fn=stop,
# The seed is deterministic but varies for each dataset and each split of it
max_evals=1000)
Expand All @@ -1053,6 +1056,52 @@ def stop(trial):

return metric, pred, best


## Random Forest
# Search space from
# https://www.kaggle.com/code/emanueleamcappella/random-forest-hyperparameters-tuning/notebook
# Hyperopt search space for the random forest baseline. It is registered under
# the 'random_forest' key and consumed by random_forest_metric via fmin/space_eval.
# NOTE(review): 'auto' for max_features is presumably deprecated in newer
# scikit-learn releases — confirm against the pinned version.
param_grid_hyperopt['random_forest'] = {'n_estimators': hp.randint('n_estimators', 20, 200),
'max_features': hp.choice('max_features', ['auto', 'sqrt']),
'max_depth': hp.randint('max_depth', 1, 45),
'min_samples_split': hp.choice('min_samples_split', [5, 10])}
def random_forest_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300):
    """Tune a random forest with hyperopt random search and score it on the test split.

    The training data is imputed (no one-hot encoding, no standardization) via
    ``preprocess_impute``; hyperparameters are drawn from
    ``param_grid_hyperopt['random_forest']`` until either ``max_time`` seconds
    have elapsed or 10000 evaluations were run. The best configuration is then
    refit on the full training data and evaluated with ``metric_used``.

    Args:
        x, y: Training features and targets.
        test_x, test_y: Held-out features and targets.
        cat_features: Categorical feature specification forwarded to
            ``preprocess_impute``.
        metric_used: Metric callable invoked as ``metric_used(test_y, pred)``;
            it also decides (through ``is_classification``) whether a
            classifier or a regressor is built.
        max_time: Time budget in seconds for the hyperparameter search.

    Returns:
        Tuple ``(metric, pred, best)``: the test score, the test predictions
        (class probabilities for classification, point predictions otherwise)
        and the selected hyperparameter dict.
    """
    from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

    x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y,
                                             one_hot=False, impute=True, standardize=False,
                                             cat_features=cat_features)

    def clf_(**params):
        # Build the estimator matching the task type. Regression metrics need
        # a regressor: a classifier cannot be fit on continuous targets.
        if is_classification(metric_used):
            return RandomForestClassifier(n_jobs=MULTITHREAD, **params)
        return RandomForestRegressor(n_jobs=MULTITHREAD, **params)

    start_time = time.time()

    def stop(trial):
        # hyperopt early-stop callback: (should_stop, extra_args).
        return time.time() - start_time > max_time, []

    best = fmin(
        fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time),
        space=param_grid_hyperopt['random_forest'],
        algo=rand.suggest,
        # The seed is deterministic but varies for each dataset and each split of it
        rstate=np.random.RandomState(int(y[:].sum()) % 10000),
        early_stop_fn=stop,
        max_evals=10000)
    # fmin returns raw search-space indices/values; map them back to the
    # actual hyperparameter values before instantiating the final model.
    best = space_eval(param_grid_hyperopt['random_forest'], best)

    clf = clf_(**best)
    clf.fit(x, y)

    if is_classification(metric_used):
        pred = clf.predict_proba(test_x)
    else:
        pred = clf.predict(test_x)
    metric = metric_used(test_y, pred)

    return metric, pred, best

## KNN
param_grid_hyperopt['knn'] = {'n_neighbors': hp.randint('n_neighbors', 1,16)
}
Expand All @@ -1063,8 +1112,8 @@ def knn_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300):

def clf_(**params):
if is_classification(metric_used):
return neighbors.KNeighborsClassifier(n_jobs=1, **params)
return neighbors.KNeighborsRegressor(n_jobs=1, **params)
return neighbors.KNeighborsClassifier(n_jobs=MULTITHREAD, **params)
return neighbors.KNeighborsRegressor(n_jobs=MULTITHREAD, **params)

start_time = time.time()

Expand All @@ -1075,7 +1124,7 @@ def stop(trial):
fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time),
space=param_grid_hyperopt['knn'],
algo=rand.suggest,
rstate=np.random.default_rng(int(y[:].sum()) % 10000),
rstate=np.random.RandomState(int(y[:].sum()) % 10000),
early_stop_fn=stop,
# The seed is deterministic but varies for each dataset and each split of it
max_evals=200)
Expand Down Expand Up @@ -1117,7 +1166,7 @@ def stop(trial):
fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time),
space=param_grid_hyperopt['gp'],
algo=rand.suggest,
rstate=np.random.default_rng(int(y[:].sum()) % 10000),
rstate=np.random.RandomState(int(y[:].sum()) % 10000),
early_stop_fn=stop,
# The seed is deterministic but varies for each dataset and each split of it
max_evals=1000)
Expand Down Expand Up @@ -1192,7 +1241,7 @@ def stop(trial):
fn=lambda params: tabnet_eval_f(params, clf_, x, y, metric_used, start_time, max_time),
space=param_grid_hyperopt['tabnet'],
algo=rand.suggest,
rstate=np.random.default_rng(int(y[:].sum()) % 10000),
rstate=np.random.RandomState(int(y[:].sum()) % 10000),
early_stop_fn=stop,
max_evals=1000)
best = space_eval(param_grid_hyperopt['tabnet'], best)
Expand Down Expand Up @@ -1270,7 +1319,7 @@ def stop(trial):
fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time),
space=param_grid_hyperopt['catboost'],
algo=rand.suggest,
rstate=np.random.default_rng(int(y[:].sum()) % 10000),
rstate=np.random.RandomState(int(y[:].sum()) % 10000),
early_stop_fn=stop,
# The seed is deterministic but varies for each dataset and each split of it
max_evals=1000)
Expand Down Expand Up @@ -1336,7 +1385,7 @@ def stop(trial):
fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time),
space=param_grid_hyperopt['xgb'],
algo=rand.suggest,
rstate=np.random.default_rng(int(y[:].sum()) % 10000),
rstate=np.random.RandomState(int(y[:].sum()) % 10000),
early_stop_fn=stop,
# The seed is deterministic but varies for each dataset and each split of it
max_evals=1000)
Expand Down Expand Up @@ -1370,7 +1419,7 @@ def warn(*args, **kwargs):
x, y, test_x, test_y = x.cpu(), y.cpu(), test_x.cpu(), test_y.cpu()
x, test_x = torch.nan_to_num(x), torch.nan_to_num(test_x)

clf = RidgeClassifier(n_jobs=1)
clf = RidgeClassifier(n_jobs=MULTITHREAD)

# create a dictionary of all values we want to test for n_neighbors
# use gridsearch to test all values for n_neighbors
Expand Down Expand Up @@ -1430,6 +1479,7 @@ def get_model():

clf_dict = {'gp': gp_metric
, 'transformer': transformer_metric
, 'random_forest': random_forest_metric
, 'knn': knn_metric
, 'catboost': catboost_metric
, 'tabnet': tabnet_metric
Expand Down
2 changes: 1 addition & 1 deletion tabpfn/scripts/transformer_prediction_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def check_file(e):
raise Exception('No checkpoint found at '+str(model_path))


print(f'Loading {model_file}')
#print(f'Loading {model_file}')

model, c = load_model(base_path, model_file, device, eval_positions=[], verbose=False)

Expand Down
5 changes: 3 additions & 2 deletions tabpfn/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,9 @@ def eval_pos_seq_len_sampler():
dl = priordataloader_class(num_steps=steps_per_epoch, batch_size=batch_size, eval_pos_seq_len_sampler=eval_pos_seq_len_sampler, seq_len_maximum=bptt+(bptt_extra_samples if bptt_extra_samples else 0), device=device, **extra_prior_kwargs_dict)

encoder = encoder_generator(dl.num_features, emsize)
style_def = dl.get_test_batch()[0][0] # the style in batch of the form ((style, x, y), target, single_eval_pos)
print(f'Style definition of first 3 examples: {style_def[:3] if style_def is not None else None}')
#style_def = dl.get_test_batch()[0][0] # the style in batch of the form ((style, x, y), target, single_eval_pos)
style_def = None
#print(f'Style definition of first 3 examples: {style_def[:3] if style_def is not None else None}')
style_encoder = style_encoder_generator(style_def.shape[1], emsize) if (style_def is not None) else None
if isinstance(criterion, nn.GaussianNLLLoss):
n_out = 2
Expand Down
4 changes: 2 additions & 2 deletions tabpfn/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ def print(*args, **kwargs):


def init_dist(device):
print('init dist')
#print('init dist')
if 'LOCAL_RANK' in os.environ:
# launched with torch.distributed.launch
rank = int(os.environ["LOCAL_RANK"])
Expand Down Expand Up @@ -268,7 +268,7 @@ def init_dist(device):

return True, rank, f'cuda:{rank}'
else:
print('Not using distributed')
#print('Not using distributed')
# will not change any of the behavior of print, but allows putting the force=True in the print calls
print_on_master_only(True)
return False, 0, device
Expand Down