Commit 4f11efe
added PhysioNet dataset.
dwromero committed Nov 12, 2021
1 parent dc84dce commit 4f11efe
Showing 18 changed files with 295 additions and 5 deletions.
Binary file modified __pycache__/config.cpython-37.pyc
Binary file modified __pycache__/dataset.cpython-37.pyc
Binary file modified __pycache__/model.cpython-37.pyc
Binary file modified __pycache__/tester.cpython-37.pyc
Binary file modified __pycache__/trainer.cpython-37.pyc
2 changes: 2 additions & 0 deletions config.py
@@ -63,6 +63,8 @@ def get_config():
path="",
# This parameter is automatically derived from the other parameters of the run. It specifies
# the path where the network parameters will be saved / loaded from.
report_auc=False,
# If True, the test routine additionally computes and reports the ROC AUC.
max_epochs_no_improvement=100,
# Maximum number of epochs without validation improvement (early-stopping patience).
# --------------------------
# Parameters of TCNs / BFCNNs
cnn_kernel_size=0,
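The report_auc flag gates the AUC computation added to tester.py below. max_epochs_no_improvement is presumably an early-stopping patience; trainer.py is only updated as a compiled .pyc in this commit, so its actual use is not visible. A self-contained sketch of the assumed patience loop (stand-in metric, not the committed trainer code):

import random

max_epochs_no_improvement = 100  # mirrors the new config entry
best_val, epochs_without_improvement = float("-inf"), 0
for epoch in range(1000):
    val_metric = random.random()  # stand-in for a real validation metric
    if val_metric > best_val:
        best_val, epochs_without_improvement = val_metric, 0
    else:
        epochs_without_improvement += 1
    if epochs_without_improvement >= max_epochs_no_improvement:
        print("early stop at epoch {}, best val {:.3f}".format(epoch, best_val))
        break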
4 changes: 3 additions & 1 deletion dataset.py
@@ -6,6 +6,7 @@
CIFAR10,
SpeechCommands,
CharTrajectories,
PhysioNet,
)

import ml_collections
@@ -28,6 +29,7 @@ def dataset_constructor(
"CIFAR10": CIFAR10,
"SpeechCommands": SpeechCommands,
"CharTrajectories": CharTrajectories,
"PhysioNet": PhysioNet,
}[config.dataset]

training_set = dataset(
@@ -48,7 +50,7 @@
else config.sr_test, # Test set can be sampled differently.
dropped_rate=config.drop_rate,
)
- if config.dataset in ["SpeechCommands", "CharTrajectories"]:
+ if config.dataset in ["SpeechCommands", "CharTrajectories", "PhysioNet"]:
validation_set = dataset(
partition="val",
seq_length=config.seq_length,
1 change: 1 addition & 0 deletions datasets/__init__.py
@@ -9,4 +9,5 @@
load_data,
save_data,
)
from .physionet import PhysioNet
from .char_trajectories import CharTrajectories
Binary file modified datasets/__pycache__/__init__.cpython-37.pyc
Binary file modified datasets/__pycache__/char_trajectories.cpython-37.pyc
Binary file added datasets/__pycache__/physionet.cpython-37.pyc
182 changes: 182 additions & 0 deletions datasets/physionet.py
@@ -0,0 +1,182 @@
"""
Adapted from https://github.com/patrick-kidger/NeuralCDE/blob/758d3a7134e3a691013e5cc6b7f68f277e9e6b69/experiments/datasets/speech_commands.py
"""
import os
import pathlib
import urllib.request
import zipfile
import torch
import csv
import math
import torchaudio

from .utils import normalise_data, split_data, load_data, save_data, subsample


class PhysioNet(torch.utils.data.TensorDataset):
def __init__(
self,
partition: str,
**kwargs,
):
self.root = pathlib.Path("./data")
self.base_loc = self.root / "sepsis"
data_loc = self.base_loc / "preprocessed_data"

if os.path.exists(self.base_loc):
pass
else:
self.download()
train_X, val_X, test_X, train_y, val_y, test_y = self._process_data()
if not os.path.exists(data_loc):
os.mkdir(data_loc)
save_data(
data_loc,
train_X=train_X,
val_X=val_X,
test_X=test_X,
train_y=train_y,
val_y=val_y,
test_y=test_y,
)

X, y = self.load_data(data_loc, partition)

super(PhysioNet, self).__init__(X, y)


def download(self):
loc_Azip = self.base_loc / 'training_setA.zip'
loc_Bzip = self.base_loc / 'training_setB.zip'

if not os.path.exists(loc_Azip):
if not os.path.exists(self.base_loc):
os.mkdir(self.base_loc)
urllib.request.urlretrieve('https://archive.physionet.org/users/shared/challenge-2019/training_setA.zip',
str(loc_Azip))
urllib.request.urlretrieve('https://archive.physionet.org/users/shared/challenge-2019/training_setB.zip',
str(loc_Bzip))

with zipfile.ZipFile(loc_Azip, 'r') as f:
f.extractall(str(self.base_loc))
with zipfile.ZipFile(loc_Bzip, 'r') as f:
f.extractall(str(self.base_loc))
for folder in ('training', 'training_setB'):
for filename in os.listdir(self.base_loc / folder):
if os.path.exists(self.base_loc / filename):
raise RuntimeError
os.rename(self.base_loc / folder / filename, self.base_loc / filename)

def _process_data(self):
X_times = []
X_static = []
y = []
for filename in os.listdir(self.base_loc):
if filename.endswith('.psv'):
with open(self.base_loc / filename) as file:
time = []
label = 0.0
reader = csv.reader(file, delimiter='|')
reader = iter(reader)
next(reader) # first line is headings
prev_iculos = 0
for line in reader:
assert len(line) == 41
*time_values, age, gender, unit1, unit2, hospadmtime, iculos, sepsislabel = line
iculos = int(iculos)
if iculos > 72: # keep at most the first three days
break
for iculos_ in range(prev_iculos + 1, iculos):
# fill skipped ICULOS hours with fully-missing (NaN) rows
time.append([float('nan') for value in time_values])
prev_iculos = iculos
time.append([float(value) for value in time_values])
label = max(label, float(sepsislabel))
unit1 = float(unit1)
unit2 = float(unit2)
unit1_obs = not math.isnan(unit1)
unit2_obs = not math.isnan(unit2)
if not unit1_obs:
unit1 = 0.
if not unit2_obs:
unit2 = 0.
hospadmtime = float(hospadmtime)
if math.isnan(hospadmtime):
hospadmtime = 0. # this only happens for one record
static = [float(age), float(gender), unit1, unit2, hospadmtime]
static += [unit1_obs, unit2_obs]
if len(time) > 2:
X_times.append(time)
X_static.append(static)
y.append(label)
final_indices = []
for time in X_times:
final_indices.append(len(time) - 1)
maxlen = max(final_indices) + 1
# pad every record with NaN rows up to the common maximum length
for time in X_times:
for _ in range(maxlen - len(time)):
time.append([float('nan') for value in time_values])

X_times = torch.tensor(X_times)
X_static = torch.tensor(X_static)
y = torch.tensor(y).long()

# Normalize data
X_times = normalise_data(X_times, y)

# Append observation-intensity channels: a running count of observed (non-NaN) values per channel.
augmented_X_times = []
intensity = ~torch.isnan(X_times) # of size (batch, stream, channels)
intensity = intensity.to(X_times.dtype).cumsum(dim=1)
augmented_X_times.append(intensity)
augmented_X_times.append(X_times)
X_times = torch.cat(augmented_X_times, dim=2)

# Fill the remaining (unobserved) NaN entries with zeros.
X_times = torch.where(~torch.isnan(X_times), X_times, torch.Tensor([0.0]))

train_X_times, val_X_times, test_X_times = split_data(X_times, y)
train_y, val_y, test_y = split_data(y, y)

# Normalise the continuous static features; keep the two boolean observed flags as-is.
X_static_ = X_static[:, :-2]
X_static_ = normalise_data(X_static_, y)
X_static = torch.cat([X_static_, X_static[:, -2:]], dim=1).unsqueeze(1).repeat(1, X_times.shape[1], 1)

train_X_static, val_X_static, test_X_static = split_data(X_static, y)

# Concatenate
train_X = torch.cat([train_X_times, train_X_static], dim=-1).transpose(-2, -1)
val_X = torch.cat([val_X_times, val_X_static], dim=-1).transpose(-2, -1)
test_X = torch.cat([test_X_times, test_X_static], dim=-1).transpose(-2, -1)

return (
train_X,
val_X,
test_X,
train_y,
val_y,
test_y,
)

@staticmethod
def load_data(data_loc, partition):

tensors = load_data(data_loc)
if partition == "train":
X = tensors["train_X"]
y = tensors["train_y"]
elif partition == "val":
X = tensors["val_X"]
y = tensors["val_y"]
elif partition == "test":
X = tensors["test_X"]
y = tensors["test_y"]
else:
raise NotImplementedError("the partition {} is not implemented.".format(partition))

return X, y
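Since PhysioNet subclasses torch.utils.data.TensorDataset, the new class can be smoke-tested on its own. A sketch, assuming ./data is writable and the PhysioNet 2019 archives are reachable (the first call downloads and preprocesses them):

import torch
from datasets import PhysioNet

train_set = PhysioNet(partition="train")
loader = torch.utils.data.DataLoader(train_set, batch_size=32, shuffle=True)
X, y = next(iter(loader))
print(X.shape)  # (32, 75, max_len): channels-first after the final transpose
print(y.shape)  # (32,): binary sepsis labels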
17 changes: 17 additions & 0 deletions model.py
@@ -22,6 +22,8 @@ def get_model(config):
in_channels = 20
else:
in_channels = 1
elif config.dataset in ["PhysioNet"]:
in_channels = 75
else:
raise NotImplementedError("Dataset {} not found.".format(config.dataset))

@@ -151,6 +153,21 @@ def get_model(config):
weight_dropout=config.weight_dropout,
pool=config.pool,
),
"PhysioNet_CKCNN": lambda: models.seqImg_CKCNN(
in_channels=in_channels,
out_channels=2,
hidden_channels=config.no_hidden,
num_blocks=config.no_blocks,
kernelnet_hidden_channels=config.kernelnet_no_hidden,
kernelnet_activation_function=config.kernelnet_activation_function,
kernelnet_norm_type=config.kernelnet_norm_type,
dim_linear=1,
bias=True,
omega_0=config.kernelnet_omega_0,
dropout=config.dropout,
weight_dropout=config.weight_dropout,
pool=config.pool,
),
"CharTrajectories_CKCNN": lambda: models.seqImg_CKCNN(
in_channels=in_channels,
out_channels=20,
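The in_channels = 75 above follows from the preprocessing in datasets/physionet.py: each .psv row has 41 columns, of which 7 are unpacked separately, leaving 34 time-series channels; the intensity augmentation doubles these, and 7 static features are appended:

time_channels = 41 - 7                             # 34 per-hour measurement channels
static_channels = 5 + 2                            # age, gender, unit1, unit2, hospadmtime + 2 observed flags
in_channels = 2 * time_channels + static_channels  # intensity channels double the series
assert in_channels == 75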
6 changes: 6 additions & 0 deletions notebooks/.ipynb_checkpoints/Untitled-checkpoint.ipynb
@@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}
6 changes: 6 additions & 0 deletions notebooks/Untitled.ipynb
@@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}
4 changes: 3 additions & 1 deletion run_experiment.py
@@ -67,11 +67,13 @@ def main(_):
config["device"] = (
"cuda:0" if (config.device == "cuda" and torch.cuda.is_available()) else "cpu"
)
- model = get_model(config)

# Define transforms and create dataloaders
dataloaders, test_loader = dataset.get_dataset(config, num_workers=4)

+ # Define model
+ model = get_model(config)

# WandB – wandb.watch() automatically fetches all layer dimensions, gradients, model parameters and logs them automatically to your dashboard.
# Using log="all" log histograms of parameter values in addition to gradients
# wandb.watch(model, log="all", log_freq=200) # -> There was a wandb bug that made runs in Sweeps crash
22 changes: 21 additions & 1 deletion tester.py
@@ -4,6 +4,8 @@

import wandb

import sklearn.metrics

# project
import probspec_routines as ps_routines

@@ -17,6 +19,7 @@ def test(model, test_loader, config):
"CIFAR10": _test_classif,
"SpeechCommands": _test_classif,
"CharTrajectories": _test_classif,
"PhysioNet": _test_classif,
}[config.dataset]

test_acc = test_function(model, test_loader, config)
@@ -36,6 +39,10 @@ def _test_classif(model, test_loader, config):
correct = 0
total = 0

true_y_cpus = []
pred_y_cpus = []
auc = 0

with torch.no_grad():
# Iterate through data
for inputs, labels in test_loader:
@@ -53,11 +60,24 @@
total += labels.size(0)
correct += (predicted == labels).sum().item()

# Save for AUC
if config.report_auc:
true_y_cpus.append(labels.detach().cpu())
pred_y_cpus.append(predicted.detach().cpu())

# Print results
test_acc = correct / total
print(
"Accuracy of the network on the {} test samples: {}".format(
total, (100 * test_acc)
)
)
- return test_acc

if config.report_auc:
true_y_cpus = torch.cat(true_y_cpus, dim=0)
pred_y_cpus = torch.cat(pred_y_cpus, dim=0)

auc = sklearn.metrics.roc_auc_score(true_y_cpus, pred_y_cpus)
print(f"AUC: {auc}")

+ return test_acc, auc
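For reference, sklearn.metrics.roc_auc_score takes 1-D label and score arrays (CPU tensors work via NumPy conversion); a minimal example with values from the scikit-learn documentation:

import torch
import sklearn.metrics

true_y = torch.tensor([0, 0, 1, 1])
scores = torch.tensor([0.1, 0.4, 0.35, 0.8])  # e.g. positive-class probabilities
print(sklearn.metrics.roc_auc_score(true_y, scores))  # 0.75

Note that the diff above feeds hard class predictions into roc_auc_score, which yields an AUC computed from a single decision threshold; passing class probabilities instead would trace the full ROC curve.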