Commit 4f11efe
added PhysioNet dataset.
dwromero committed Nov 12, 2021
1 parent dc84dce commit 4f11efe
Showing 18 changed files with 295 additions and 5 deletions.
Binary file modified __pycache__/config.cpython-37.pyc
Binary file modified __pycache__/dataset.cpython-37.pyc
Binary file modified __pycache__/model.cpython-37.pyc
Binary file modified __pycache__/tester.cpython-37.pyc
Binary file modified __pycache__/trainer.cpython-37.pyc
2 changes: 2 additions & 0 deletions config.py
@@ -63,6 +63,8 @@ def get_config():
path="",
# This parameter is automatically derived from the other parameters of the run. It specifies
# the path where the network parameters will be saved / loaded from.
report_auc=False,
# If True, the test routine additionally computes and reports the ROC AUC.
max_epochs_no_improvement=100,
# Maximum number of epochs without validation improvement (early-stopping patience).
# --------------------------
# Parameters of TCNs / BFCNNs
cnn_kernel_size=0,
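The report_auc flag gates the AUC computation added to tester.py below. max_epochs_no_improvement is presumably an early-stopping patience; trainer.py is only updated as a compiled .pyc in this commit, so its actual use is not visible. A self-contained sketch of the assumed patience loop (stand-in metric, not the committed trainer code):

import random

max_epochs_no_improvement = 100  # mirrors the new config entry
best_val, epochs_without_improvement = float("-inf"), 0
for epoch in range(1000):
    val_metric = random.random()  # stand-in for a real validation metric
    if val_metric > best_val:
        best_val, epochs_without_improvement = val_metric, 0
    else:
        epochs_without_improvement += 1
    if epochs_without_improvement >= max_epochs_no_improvement:
        print("early stop at epoch {}, best val {:.3f}".format(epoch, best_val))
        break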
4 changes: 3 additions & 1 deletion dataset.py
@@ -6,6 +6,7 @@
CIFAR10,
SpeechCommands,
CharTrajectories,
PhysioNet,
)

import ml_collections
@@ -28,6 +29,7 @@ def dataset_constructor(
"CIFAR10": CIFAR10,
"SpeechCommands": SpeechCommands,
"CharTrajectories": CharTrajectories,
"PhysioNet": PhysioNet,
}[config.dataset]

training_set = dataset(
@@ -48,7 +50,7 @@
else config.sr_test, # Test set can be sampled differently.
dropped_rate=config.drop_rate,
)
- if config.dataset in ["SpeechCommands", "CharTrajectories"]:
+ if config.dataset in ["SpeechCommands", "CharTrajectories", "PhysioNet"]:
validation_set = dataset(
partition="val",
seq_length=config.seq_length,
1 change: 1 addition & 0 deletions datasets/__init__.py
@@ -9,4 +9,5 @@
load_data,
save_data,
)
from .physionet import PhysioNet
from .char_trajectories import CharTrajectories
Binary file modified datasets/__pycache__/__init__.cpython-37.pyc
Binary file modified datasets/__pycache__/char_trajectories.cpython-37.pyc
Binary file added datasets/__pycache__/physionet.cpython-37.pyc
182 changes: 182 additions & 0 deletions datasets/physionet.py
@@ -0,0 +1,182 @@
"""
Adapted from https://github.com/patrick-kidger/NeuralCDE/blob/758d3a7134e3a691013e5cc6b7f68f277e9e6b69/experiments/datasets/speech_commands.py
"""
import os
import pathlib
import urllib.request
import zipfile
import torch
import csv
import math
import torchaudio

from .utils import normalise_data, split_data, load_data, save_data, subsample


class PhysioNet(torch.utils.data.TensorDataset):
def __init__(
self,
partition: str,
**kwargs,
):
self.root = pathlib.Path("./data")
self.base_loc = self.root / "sepsis"
data_loc = self.base_loc / "preprocessed_data"

if os.path.exists(self.base_loc):
pass
else:
self.download()
train_X, val_X, test_X, train_y, val_y, test_y = self._process_data()
if not os.path.exists(data_loc):
os.mkdir(data_loc)
save_data(
data_loc,
train_X=train_X,
val_X=val_X,
test_X=test_X,
train_y=train_y,
val_y=val_y,
test_y=test_y,
)

X, y = self.load_data(data_loc, partition)

super(PhysioNet, self).__init__(X, y)


def download(self):
loc_Azip = self.base_loc / 'training_setA.zip'
loc_Bzip = self.base_loc / 'training_setB.zip'

if not os.path.exists(loc_Azip):
if not os.path.exists(self.base_loc):
os.mkdir(self.base_loc)
urllib.request.urlretrieve('https://archive.physionet.org/users/shared/challenge-2019/training_setA.zip',
str(loc_Azip))
urllib.request.urlretrieve('https://archive.physionet.org/users/shared/challenge-2019/training_setB.zip',
str(loc_Bzip))

with zipfile.ZipFile(loc_Azip, 'r') as f:
f.extractall(str(self.base_loc))
with zipfile.ZipFile(loc_Bzip, 'r') as f:
f.extractall(str(self.base_loc))
for folder in ('training', 'training_setB'):
for filename in os.listdir(self.base_loc / folder):
if os.path.exists(self.base_loc / filename):
raise RuntimeError
os.rename(self.base_loc / folder / filename, self.base_loc / filename)

def _process_data(self):
X_times = []
X_static = []
y = []
for filename in os.listdir(self.base_loc):
if filename.endswith('.psv'):
with open(self.base_loc / filename) as file:
time = []
label = 0.0
reader = csv.reader(file, delimiter='|')
reader = iter(reader)
next(reader) # first line is headings
prev_iculos = 0
for line in reader:
assert len(line) == 41
*time_values, age, gender, unit1, unit2, hospadmtime, iculos, sepsislabel = line
iculos = int(iculos)
if iculos > 72: # keep at most the first three days
break
for iculos_ in range(prev_iculos + 1, iculos):
# fill skipped ICULOS hours with fully-missing (NaN) rows
time.append([float('nan') for value in time_values])
prev_iculos = iculos
time.append([float(value) for value in time_values])
label = max(label, float(sepsislabel))
unit1 = float(unit1)
unit2 = float(unit2)
unit1_obs = not math.isnan(unit1)
unit2_obs = not math.isnan(unit2)
if not unit1_obs:
unit1 = 0.
if not unit2_obs:
unit2 = 0.
hospadmtime = float(hospadmtime)
if math.isnan(hospadmtime):
hospadmtime = 0. # this only happens for one record
static = [float(age), float(gender), unit1, unit2, hospadmtime]
static += [unit1_obs, unit2_obs]
if len(time) > 2:
X_times.append(time)
X_static.append(static)
y.append(label)
final_indices = []
for time in X_times:
final_indices.append(len(time) - 1)
maxlen = max(final_indices) + 1
# pad every record with NaN rows up to the common maximum length
for time in X_times:
for _ in range(maxlen - len(time)):
time.append([float('nan') for value in time_values])

X_times = torch.tensor(X_times)
X_static = torch.tensor(X_static)
y = torch.tensor(y).long()

# Normalize data
X_times = normalise_data(X_times, y)

# Append observation-intensity channels: a running count of observed (non-NaN) values per channel.
augmented_X_times = []
intensity = ~torch.isnan(X_times) # of size (batch, stream, channels)
intensity = intensity.to(X_times.dtype).cumsum(dim=1)
augmented_X_times.append(intensity)
augmented_X_times.append(X_times)
X_times = torch.cat(augmented_X_times, dim=2)

# Fill the remaining (unobserved) NaN entries with zeros.
X_times = torch.where(~torch.isnan(X_times), X_times, torch.Tensor([0.0]))

train_X_times, val_X_times, test_X_times = split_data(X_times, y)
train_y, val_y, test_y = split_data(y, y)

# Normalise the continuous static features; keep the two boolean observed flags as-is.
X_static_ = X_static[:, :-2]
X_static_ = normalise_data(X_static_, y)
X_static = torch.cat([X_static_, X_static[:, -2:]], dim=1).unsqueeze(1).repeat(1, X_times.shape[1], 1)

train_X_static, val_X_static, test_X_static = split_data(X_static, y)

# Concatenate
train_X = torch.cat([train_X_times, train_X_static], dim=-1).transpose(-2, -1)
val_X = torch.cat([val_X_times, val_X_static], dim=-1).transpose(-2, -1)
test_X = torch.cat([test_X_times, test_X_static], dim=-1).transpose(-2, -1)

return (
train_X,
val_X,
test_X,
train_y,
val_y,
test_y,
)

@staticmethod
def load_data(data_loc, partition):

tensors = load_data(data_loc)
if partition == "train":
X = tensors["train_X"]
y = tensors["train_y"]
elif partition == "val":
X = tensors["val_X"]
y = tensors["val_y"]
elif partition == "test":
X = tensors["test_X"]
y = tensors["test_y"]
else:
raise NotImplementedError("the partition {} is not implemented.".format(partition))

return X, y
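Since PhysioNet subclasses torch.utils.data.TensorDataset, the new class can be smoke-tested on its own. A sketch, assuming ./data is writable and the PhysioNet 2019 archives are reachable (the first call downloads and preprocesses them):

import torch
from datasets import PhysioNet

train_set = PhysioNet(partition="train")
loader = torch.utils.data.DataLoader(train_set, batch_size=32, shuffle=True)
X, y = next(iter(loader))
print(X.shape)  # (32, 75, max_len): channels-first after the final transpose
print(y.shape)  # (32,): binary sepsis labels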
17 changes: 17 additions & 0 deletions model.py
@@ -22,6 +22,8 @@ def get_model(config):
in_channels = 20
else:
in_channels = 1
elif config.dataset in ["PhysioNet"]:
in_channels = 75
else:
raise NotImplementedError("Dataset {} not found.".format(config.dataset))

@@ -151,6 +153,21 @@ def get_model(config):
weight_dropout=config.weight_dropout,
pool=config.pool,
),
"PhysioNet_CKCNN": lambda: models.seqImg_CKCNN(
in_channels=in_channels,
out_channels=2,
hidden_channels=config.no_hidden,
num_blocks=config.no_blocks,
kernelnet_hidden_channels=config.kernelnet_no_hidden,
kernelnet_activation_function=config.kernelnet_activation_function,
kernelnet_norm_type=config.kernelnet_norm_type,
dim_linear=1,
bias=True,
omega_0=config.kernelnet_omega_0,
dropout=config.dropout,
weight_dropout=config.weight_dropout,
pool=config.pool,
),
"CharTrajectories_CKCNN": lambda: models.seqImg_CKCNN(
in_channels=in_channels,
out_channels=20,
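The in_channels = 75 above follows from the preprocessing in datasets/physionet.py: each .psv row has 41 columns, of which 7 are unpacked separately, leaving 34 time-series channels; the intensity augmentation doubles these, and 7 static features are appended:

time_channels = 41 - 7                             # 34 per-hour measurement channels
static_channels = 5 + 2                            # age, gender, unit1, unit2, hospadmtime + 2 observed flags
in_channels = 2 * time_channels + static_channels  # intensity channels double the series
assert in_channels == 75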
6 changes: 6 additions & 0 deletions notebooks/.ipynb_checkpoints/Untitled-checkpoint.ipynb
@@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}
6 changes: 6 additions & 0 deletions notebooks/Untitled.ipynb
@@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}
4 changes: 3 additions & 1 deletion run_experiment.py
@@ -67,11 +67,13 @@ def main(_):
config["device"] = (
"cuda:0" if (config.device == "cuda" and torch.cuda.is_available()) else "cpu"
)
- model = get_model(config)

# Define transforms and create dataloaders
dataloaders, test_loader = dataset.get_dataset(config, num_workers=4)

+ # Define model
+ model = get_model(config)

# WandB – wandb.watch() automatically fetches all layer dimensions, gradients, model parameters and logs them automatically to your dashboard.
# Using log="all" log histograms of parameter values in addition to gradients
# wandb.watch(model, log="all", log_freq=200) # -> There was a wandb bug that made runs in Sweeps crash
22 changes: 21 additions & 1 deletion tester.py
@@ -4,6 +4,8 @@

import wandb

import sklearn.metrics

# project
import probspec_routines as ps_routines

@@ -17,6 +19,7 @@ def test(model, test_loader, config):
"CIFAR10": _test_classif,
"SpeechCommands": _test_classif,
"CharTrajectories": _test_classif,
"PhysioNet": _test_classif,
}[config.dataset]

test_acc = test_function(model, test_loader, config)
@@ -36,6 +39,10 @@ def _test_classif(model, test_loader, config):
correct = 0
total = 0

true_y_cpus = []
pred_y_cpus = []
auc = 0

with torch.no_grad():
# Iterate through data
for inputs, labels in test_loader:
@@ -53,11 +60,24 @@
total += labels.size(0)
correct += (predicted == labels).sum().item()

# Save for AUC
if config.report_auc:
true_y_cpus.append(labels.detach().cpu())
pred_y_cpus.append(predicted.detach().cpu())

# Print results
test_acc = correct / total
print(
"Accuracy of the network on the {} test samples: {}".format(
total, (100 * test_acc)
)
)
- return test_acc

if config.report_auc:
true_y_cpus = torch.cat(true_y_cpus, dim=0)
pred_y_cpus = torch.cat(pred_y_cpus, dim=0)

auc = sklearn.metrics.roc_auc_score(true_y_cpus, pred_y_cpus)
print(f"AUC: {auc}")

+ return test_acc, auc
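For reference, sklearn.metrics.roc_auc_score takes 1-D label and score arrays (CPU tensors work via NumPy conversion); a minimal example with values from the scikit-learn documentation:

import torch
import sklearn.metrics

true_y = torch.tensor([0, 0, 1, 1])
scores = torch.tensor([0.1, 0.4, 0.35, 0.8])  # e.g. positive-class probabilities
print(sklearn.metrics.roc_auc_score(true_y, scores))  # 0.75

Note that the diff above feeds hard class predictions into roc_auc_score, which yields an AUC computed from a single decision threshold; passing class probabilities instead would trace the full ROC curve.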