Commit acd69cd (initial commit, 0 parents)
Showing 23 changed files with 2,602 additions and 0 deletions.
.gitignore
@@ -0,0 +1,28 @@
__pycache__/
*.p
*.h5
*.hdf5
logs*
.ipynb_checkpoints
*.mp4
/data
.vscode

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2019 Markus Frey

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md
@@ -0,0 +1,45 @@
[![license](https://img.shields.io/github/license/mashape/apistatus.svg)](https://github.com/CYHSM/DeepInsight/blob/master/LICENSE.md)
![py36 status](https://img.shields.io/badge/python3.6-supported-green.svg)

## DeepInsight: A general framework for interpreting wide-band neural activity

DeepInsight is a toolbox for the analysis and interpretation of wide-band neural activity and can be applied to unsorted neural data. This means the traditional spike-sorting step can be omitted and the raw data used directly as input, providing a more objective way of measuring decoding performance.
![Model Architecture](media/model_architecture.png)

## Example Usage
```python
import deepinsight

# Load your electrophysiological or calcium-imaging data
(raw_data, raw_timestamps, output, output_timestamps, info) = deepinsight.util.tetrode.read_tetrode_data(fp_raw_file)

# Transform raw data to frequency domain
deepinsight.preprocess.preprocess_input(fp_deepinsight, raw_data, sampling_rate=info['sampling_rate'], channels=info['channels'])

# Prepare outputs
deepinsight.util.tetrode.preprocess_output(fp_deepinsight, raw_timestamps, output, output_timestamps, sampling_rate=info['sampling_rate'])

# Train the model
deepinsight.train.run_from_path(fp_deepinsight, loss_functions, loss_weights)

# Get loss and shuffled loss for influence plot
losses, output_predictions, indices = deepinsight.analyse.get_model_loss(fp_deepinsight, stepsize=10)
shuffled_losses = deepinsight.analyse.get_shuffled_model_loss(fp_deepinsight, axis=1, stepsize=10)

# Plot influence across behaviours
deepinsight.visualize.plot_residuals(fp_deepinsight, frequency_spacing=2)
```

See also the [jupyter notebook](notebooks/deepinsight_example_usage.ipynb) for a full example of decoding behaviours from tetrode CA1 recordings.

The following video shows the performance of the model trained on position (left), head direction (top right) and speed (bottom right):
![Model Performance](media/decoding_error.gif)

## Installation
For now, install DeepInsight with the following command:
```
pip install -e git+https://github.com/CYHSM/DeepInsight.git
```
A full pip installation and Colab integration will be available soon.
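The influence step in the README above yields `losses` (one loss per evaluated sample) and `shuffled_losses` (one loss series per shuffled frequency band). Below is a minimal sketch of one plausible way to collapse the two into per-band influence scores; the array shapes and the relative-loss definition are assumptions for illustration, not taken from this commit:

```python
import numpy as np

# Assumed shapes: losses (N, n_outputs), shuffled_losses (n_bands, N, n_outputs)
losses = np.random.rand(1000, 3)               # placeholder for real decoding losses
shuffled_losses = np.random.rand(16, 1000, 3)  # placeholder for shuffled losses

# Hypothetical influence: relative increase in mean loss when a band is shuffled
baseline = losses.mean(axis=0)                                    # (n_outputs,)
influence = (shuffled_losses.mean(axis=1) - baseline) / baseline  # (n_bands, n_outputs)
print(influence.shape)  # (16, 3): one score per frequency band and behaviour
```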
deepinsight/__init__.py
@@ -0,0 +1,6 @@
from . import util
from . import preprocess
from . import architecture
from . import train
from . import analyse
from . import visualize
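Because `deepinsight/__init__.py` pulls in every submodule, a single top-level import is enough to reach the whole pipeline, which is why the README can call `deepinsight.util.tetrode`, `deepinsight.train` and so on directly. A small sketch (module purposes inferred from their names and the README, not documented in this commit):

```python
import deepinsight

# All submodules are available after one import, as wired up above
deepinsight.preprocess  # raw data -> frequency domain
deepinsight.train       # model training
deepinsight.analyse     # losses and influence scores
deepinsight.visualize   # plots such as plot_residuals
```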
deepinsight/analyse.py
@@ -0,0 +1,245 @@
""" | ||
DeepInsight Toolbox | ||
© Markus Frey | ||
https://github.com/CYHSM/DeepInsight | ||
Licensed under MIT License | ||
""" | ||
import os | ||
|
||
import keras.backend as K | ||
import numpy as np | ||
import h5py | ||
|
||
from . import util | ||
|
||
|
||
def get_model_loss(fp_hdf_out, stepsize=1, shuffles=None):
    """
    Loops across cross-validated models and calculates loss and predictions for the full experiment length

    Parameters
    ----------
    fp_hdf_out : str
        File path to HDF5 file
    stepsize : int, optional
        Determines how many samples will be evaluated. 1 -> N samples evaluated,
        2 -> N/2 samples evaluated, etc..., by default 1
    shuffles : dict, optional
        If wavelets should be shuffled, important for calculating influence scores, by default None

    Returns
    -------
    losses : (N,1) array_like
        Loss between predicted and ground truth observation
    predictions : dict
        Dictionary with predictions for each behaviour, each item in dict has size (N, Z) with Z the dimensions of the sample (e.g. Z_position=2, Z_speed=1, ...)
    indices : (N,1) array_like
        Indices which were evaluated, important when taking stepsize unequal to 1
    """
    dirname = os.path.dirname(fp_hdf_out)
    filename = os.path.basename(fp_hdf_out)[0:-3]
    cv_results = []
    (_, _, _, opts) = util.hdf5.load_model_with_opts(dirname + '/models/' + filename + '_model_{}.h5'.format(0))
    loss_names = opts['loss_names']
    time_shift = opts['model_timesteps']
    for k in range(0, opts['num_cvs']):
        K.clear_session()
        # Find model for this cross-validation split
        model_path = dirname + '/models/' + filename + '_model_{}.h5'.format(k)
        # Load model and generators
        print('Evaluating model {}'.format(model_path))
        (model, training_generator, testing_generator, opts) = util.hdf5.load_model_with_opts(model_path)
        # -----------------------------------------------------------------------------------------------
        print('Getting loss, predictions and saliencies')
        if shuffles is not None:
            testing_generator = shuffle_wavelets(training_generator, testing_generator, shuffles)
        losses, predictions, indices = calculate_losses_from_generator(
            testing_generator, model, verbose=1, stepsize=stepsize)
        # -----------------------------------------------------------------------------------------------
        cv_results.append((losses, predictions, indices))
    cv_results = np.array(cv_results)
    # Reshape cv_results
    losses = np.concatenate(cv_results[:, 0], axis=0)
    predictions = {k: [] for k in loss_names}
    for out in cv_results[:, 1]:
        for p, name in zip(out, loss_names):
            predictions[name].append(p)
    for key, item in predictions.items():
        # Keep only the last timestep of each window; pad the start when every sample was evaluated
        tmp_output = np.concatenate(predictions[key], axis=0)[:, -1, :]
        if stepsize == 1:
            tmp_output = np.array([np.pad(l, [time_shift, 0], mode='constant', constant_values=[l[0], 0])
                                   for l in tmp_output.transpose()]).transpose()
        predictions[key] = tmp_output
    indices = np.concatenate(cv_results[:, 2], axis=0)
    # We only take the last timestep for decoding, so the decoder does not see any part of the future
    indices = indices + time_shift
    losses = losses[:, :, -1]
    if stepsize == 1:
        losses = np.array([np.pad(l, [time_shift, 0], mode='constant', constant_values=[l[0], 0])
                           for l in losses.transpose()]).transpose()
        indices = np.arange(0, losses.shape[0])
    # Also save to HDF5
    hdf5_file = h5py.File(fp_hdf_out, mode='a')
    for key, item in predictions.items():
        util.hdf5.create_or_update(hdf5_file, dataset_name="analysis/predictions/{}".format(key),
                                   dataset_shape=item.shape, dataset_type=np.float32, dataset_value=item)
    util.hdf5.create_or_update(hdf5_file, dataset_name="analysis/losses",
                               dataset_shape=losses.shape, dataset_type=np.float32, dataset_value=losses)
    util.hdf5.create_or_update(hdf5_file, dataset_name="analysis/indices",
                               dataset_shape=indices.shape, dataset_type=np.int64, dataset_value=indices)
    hdf5_file.close()

    return losses, predictions, indices

def get_shuffled_model_loss(fp_hdf_out, stepsize=1, axis=0):
    """
    Shuffles the wavelets and recalculates the error

    Parameters
    ----------
    fp_hdf_out : str
        File path to HDF5 file
    stepsize : int, optional
        Determines how many samples will be evaluated. 1 -> N samples evaluated,
        2 -> N/2 samples evaluated, etc..., by default 1
    axis : int, optional
        Which axis to shuffle

    Returns
    -------
    shuffled_losses : (N,1) array_like
        Loss between predicted and ground truth observation for shuffled wavelets on specified axis
    """
    if axis == 0:
        raise ValueError('Shuffling across time dimension (axis=0) not supported yet.')
    hdf5_file = h5py.File(fp_hdf_out, mode='r')
    tmp_wavelets_shape = hdf5_file['inputs/wavelets'].shape
    hdf5_file.close()
    shuffled_losses = []
    for s in range(0, tmp_wavelets_shape[axis]):
        if axis == 1:
            losses, _, _ = get_model_loss(fp_hdf_out, stepsize=stepsize, shuffles={'f': s})
        elif axis == 2:
            losses, _, _ = get_model_loss(fp_hdf_out, stepsize=stepsize, shuffles={'c': s})
        shuffled_losses.append(losses)
    shuffled_losses = np.array(shuffled_losses)
    # Also save to HDF5
    hdf5_file = h5py.File(fp_hdf_out, mode='a')
    util.hdf5.create_or_update(hdf5_file, dataset_name="analysis/influence/shuffled_losses",
                               dataset_shape=shuffled_losses.shape, dataset_type=np.float32, dataset_value=shuffled_losses)
    hdf5_file.close()

    return shuffled_losses

def calculate_losses_from_generator(tg, model, num_steps=None, stepsize=1, verbose=0):
    """
    Keras evaluate_generator only returns a scalar loss (mean), while predict_generator only returns the predictions but not the real labels
    TODO Make it batch size independent

    Parameters
    ----------
    tg : object
        Data generator
    model : object
        Keras model
    num_steps : int, optional
        How many steps should be evaluated, by default None (runs through full experiment)
    stepsize : int, optional
        Determines how many samples will be evaluated. 1 -> N samples evaluated,
        2 -> N/2 samples evaluated, etc..., by default 1
    verbose : int, optional
        Verbosity level

    Returns
    -------
    losses : (N,1) array_like
        Loss between predicted and ground truth observation
    predictions : dict
        Dictionary with predictions for each behaviour, each item in dict has size (N, Z) with Z the dimensions of the sample (e.g. Z_position=2, Z_speed=1, ...)
    indices : (N,1) array_like
        Indices which were evaluated, important when taking stepsize unequal to 1
    """
    # 0.) Parse inputs
    if num_steps is None:
        num_steps = len(tg)

    # 1.) Make a copy and adjust attributes
    tmp_dict = tg.__dict__.copy()
    if tg.batch_size != 1:
        tg.batch_size = 1
        tg.random_batches = False
        tg.shuffle = False
        tg.sample_size = tg.model_timesteps * tg.batch_size

    # 2.) Get output tensors
    sess = K.get_session()
    (_, test_out) = tg.__getitem__(0)
    real_tensor, calc_tensors = K.placeholder(), []
    for output_index in range(0, len(test_out)):
        prediction_tensor = model.outputs[output_index]
        loss_tensor = model.loss_functions[output_index](real_tensor, prediction_tensor)
        calc_tensors.append((prediction_tensor, loss_tensor))

    # 3.) Predict
    losses, predictions, indices = [], [], []
    for i in range(0, num_steps, stepsize):
        (in_tg, out_tg) = tg.__getitem__(i)
        indices.append(tg.cv_indices[i])
        loss, prediction = [], []
        for o in range(0, len(out_tg)):
            evaluated = sess.run(calc_tensors[o], feed_dict={model.input: in_tg, real_tensor: out_tg[o]})
            prediction.append(evaluated[0][0, ...])
            loss.append(evaluated[1][0, ...])  # Get rid of batch dimensions
        predictions.append(prediction)
        losses.append(loss)
        if verbose > 0 and not i % 50:
            print('{} / {}'.format(i, num_steps), end='\r')
    if verbose > 0:
        print('Performed {} evaluation steps'.format(num_steps // stepsize))
    losses, predictions, indices = np.array(losses), swap_listaxes(predictions), np.array(indices)
    tg.__dict__.update(tmp_dict)

    return losses, predictions, indices

def shuffle_wavelets(training_generator, testing_generator, shuffles):
    """
    Shuffles the wavelets of the training generator and assigns them to the testing generator

    Parameters
    ----------
    training_generator : object
        Data generator for training data
    testing_generator : object
        Data generator for testing data
    shuffles : dict
        Indicates which axis to shuffle and which index in the selected dimension, e.g. {'f': 5} shuffles frequency band 5

    Returns
    -------
    testing_generator : object
        Data generator for testing data with shuffled wavelets
    """
    rolled_wavelets = training_generator.wavelets.copy()
    for key, item in shuffles.items():
        if key == 'f':
            np.random.shuffle(rolled_wavelets[:, item, :])  # In place
        elif key == 'c':
            np.random.shuffle(rolled_wavelets[:, :, item])  # In place
        elif key == 't':
            np.random.shuffle(rolled_wavelets[item, :, :])  # In place
    testing_generator.wavelets = rolled_wavelets
    return testing_generator

def swap_listaxes(list_in):
    """
    Swaps the two outer levels of a nested list, e.g. from (steps, outputs) to (outputs, steps)
    """
    list_out = []
    for o in range(0, len(list_in[0])):
        list_out.append(np.array([out[o] for out in list_in]))
    return list_out
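Both `get_model_loss` and `get_shuffled_model_loss` also persist their results into the experiment's HDF5 file under `analysis/`. Below is a minimal sketch of reading them back with h5py; the dataset paths come from the code above, while the file path and the behaviour names are placeholders:

```python
import h5py

fp_deepinsight = 'experiment.h5'  # placeholder path to the preprocessed HDF5 file
with h5py.File(fp_deepinsight, mode='r') as f:
    losses = f['analysis/losses'][()]                              # written by get_model_loss
    indices = f['analysis/indices'][()]                            # evaluated sample indices
    shuffled_losses = f['analysis/influence/shuffled_losses'][()]  # written by get_shuffled_model_loss
    # One dataset per behaviour, e.g. position, head_direction, speed
    predictions = {key: f['analysis/predictions/{}'.format(key)][()]
                   for key in f['analysis/predictions']}
print(losses.shape, shuffled_losses.shape, sorted(predictions))
```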