Skip to content

Commit

Permalink
Write documentation (jankrepl#26)
Browse files Browse the repository at this point in the history
  • Loading branch information
jankrepl authored May 21, 2020
1 parent 255e533 commit 422d4f3
Show file tree
Hide file tree
Showing 39 changed files with 2,667 additions and 137 deletions.
4 changes: 3 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,15 @@ python:
- "3.7"

install:
- pip install .[dev]
- pip install .[dev,docs]

script:
- pip freeze
- pytest
- flake8 deepdow
- flake8 tests
- pydocstyle deepdow
- sphinx-build -b doctest -D "autodoc_mock_imports=[]" docs/ temp_doctest

after_success:
- codecov
275 changes: 244 additions & 31 deletions deepdow/data.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""Collection of functions related to data."""
from functools import partial

import numpy as np
import torch


Expand Down Expand Up @@ -41,6 +40,196 @@ def scale_features(X, approach='standard'):
return X_scaled


class Compose:
    """Meta transform chaining several transforms together, inspired by torchvision.

    Parameters
    ----------
    transforms : list
        List of callables that represent transforms to be composed.
    """

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, X_sample, y_sample, timestamps_sample, asset_names):
        """Apply all transforms sequentially, feeding each one the output of the previous.

        Parameters
        ----------
        X_sample : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)`.
        y_sample : torch.Tensor
            Target vector of shape `(n_channels, horizon, n_assets)`.
        timestamps_sample : datetime
            Time stamp of the sample.
        asset_names
            Asset names corresponding to the last channel of `X_sample` and `y_sample`.

        Returns
        -------
        X_sample_new : torch.Tensor
            Transformed version of `X_sample`.
        y_sample_new : torch.Tensor
            Transformed version of `y_sample`.
        timestamps_sample_new : datetime
            Transformed version of `timestamps_sample`.
        asset_names_new
            Transformed version of `asset_names`.
        """
        # Thread the full 4-tuple through every transform in order.
        sample = (X_sample, y_sample, timestamps_sample, asset_names)
        for transform in self.transforms:
            sample = transform(*sample)

        return sample


class Dropout:
    """Set random elements of the input to zero with probability p.

    Parameters
    ----------
    p : float
        Probability of setting an element to zero.
    training : bool
        If False, then dropout is disabled no matter what `p` is.
    """

    def __init__(self, p=0.2, training=True):
        self.p = p
        self.training = training

    def __call__(self, X_sample, y_sample, timestamps_sample, asset_names):
        """Zero out random entries of the features; targets and metadata pass through.

        Parameters
        ----------
        X_sample : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)`.
        y_sample : torch.Tensor
            Target vector of shape `(n_channels, horizon, n_assets)`.
        timestamps_sample : datetime
            Time stamp of the sample.
        asset_names
            Asset names corresponding to the last channel of `X_sample` and `y_sample`.

        Returns
        -------
        X_sample_new : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)` with some elements set to zero.
        y_sample : torch.Tensor
            Same as input.
        timestamps_sample : datetime
            Same as input.
        asset_names
            Same as input.
        """
        # Delegate to torch's functional dropout (handles scaling by 1/(1-p) in training mode).
        dropped = torch.nn.functional.dropout(X_sample,
                                              p=self.p,
                                              training=self.training)

        return dropped, y_sample, timestamps_sample, asset_names


class Multiply:
    """Transform multiplying the feature tensor X with a constant.

    Parameters
    ----------
    c : float
        Constant that the features are multiplied by.
    """

    def __init__(self, c=100):
        self.c = c

    def __call__(self, X_sample, y_sample, timestamps_sample, asset_names):
        """Scale the features by `self.c`; targets and metadata pass through unchanged.

        Parameters
        ----------
        X_sample : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)`.
        y_sample : torch.Tensor
            Target vector of shape `(n_channels, horizon, n_assets)`.
        timestamps_sample : datetime
            Time stamp of the sample.
        asset_names
            Asset names corresponding to the last channel of `X_sample` and `y_sample`.

        Returns
        -------
        X_sample_new : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)` multiplied by the constant `self.c`.
        y_sample : torch.Tensor
            Same as input.
        timestamps_sample : datetime
            Same as input.
        asset_names
            Same as input.
        """
        X_sample_new = self.c * X_sample

        return X_sample_new, y_sample, timestamps_sample, asset_names


class Noise:
    """Add Gaussian noise to each of the channels.

    Random (Gaussian) noise is added to the original features X. One can control the standard
    deviation of the noise via the `frac` parameter. Mathematically,
    `std(X_noise) = std(X) * frac` for each channel.

    Parameters
    ----------
    frac : float
        Ratio between the standard deviation of the added noise and that of each channel of X.
    """

    def __init__(self, frac=0.2):
        self.frac = frac

    def __call__(self, X_sample, y_sample, timestamps_sample, asset_names):
        """Add per-channel scaled Gaussian noise to the features; the rest passes through.

        Parameters
        ----------
        X_sample : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)`.
        y_sample : torch.Tensor
            Target vector of shape `(n_channels, horizon, n_assets)`.
        timestamps_sample : datetime
            Time stamp of the sample.
        asset_names
            Asset names corresponding to the last channel of `X_sample` and `y_sample`.

        Returns
        -------
        X_sample_new : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)` with added noise.
        y_sample : torch.Tensor
            Same as input.
        timestamps_sample : datetime
            Same as input.
        asset_names
            Same as input.
        """
        # Per-channel std over the (lookback, asset) dimensions scales the unit Gaussian noise.
        noise = self.frac * X_sample.std([1, 2], keepdim=True) * torch.randn_like(X_sample)

        return X_sample + noise, y_sample, timestamps_sample, asset_names


class InRAMDataset(torch.utils.data.Dataset):
"""Dataset that lives entirely in RAM.
Expand All @@ -57,9 +246,12 @@ class InRAMDataset(torch.utils.data.Dataset):
asset_names : None or array-like
If not None then of shape `(n_assets, )` representing the names of assets.
transform : None or callable
If provided, then a callable that transforms a single sample.
"""

def __init__(self, X, y, timestamps=None, asset_names=None):
def __init__(self, X, y, timestamps=None, asset_names=None, transform=None):
"""Construct."""
# checks
if len(X) != len(y):
Expand All @@ -75,6 +267,7 @@ def __init__(self, X, y, timestamps=None, asset_names=None):
self.y = y
self.timestamps = list(range(len(X))) if timestamps is None else timestamps
self.asset_names = ['a_{}'.format(i) for i in range(X.shape[-1])] if asset_names is None else asset_names
self.transform = transform

# utility
self.n_channels, self.lookback, self.n_assets = X.shape[1:]
Expand All @@ -89,11 +282,18 @@ def __getitem__(self, ix):
X_sample = torch.from_numpy(self.X[ix])
y_sample = torch.from_numpy(self.y[ix])
timestamps_sample = self.timestamps[ix]
asset_names = self.asset_names

if self.transform:
X_sample, y_sample, timestamps_sample, asset_names = self.transform(X_sample,
y_sample,
timestamps_sample,
asset_names)

return X_sample, y_sample, timestamps_sample, self.asset_names
return X_sample, y_sample, timestamps_sample, asset_names


def collate_uniform(batch, n_assets_range=(5, 10), lookback_range=(1, 20), horizon_range=(3, 15), asset_ixs=None,
def collate_uniform(batch, n_assets_range=(5, 10), lookback_range=(2, 20), horizon_range=(3, 15), asset_ixs=None,
random_state=None, scaler=None):
"""Create batch of samples.
Expand Down Expand Up @@ -144,10 +344,10 @@ def collate_uniform(batch, n_assets_range=(5, 10), lookback_range=(1, 20), horiz
if asset_ixs is None and not n_assets_range[1] > n_assets_range[0] >= 1:
raise ValueError('Incorrect number of assets range.')

if not lookback_range[1] > lookback_range[0] >= 1:
if not lookback_range[1] > lookback_range[0] >= 2:
raise ValueError('Incorrect lookback range.')

if not horizon_range[1] > horizon_range[0] >= 1:
if not horizon_range[1] > horizon_range[0] >= 2:
raise ValueError('Incorrect horizon range.')

if random_state is not None:
Expand All @@ -159,7 +359,7 @@ def collate_uniform(batch, n_assets_range=(5, 10), lookback_range=(1, 20), horiz
# sample assets
if asset_ixs is None:
n_assets = torch.randint(low=n_assets_range[0], high=min(n_assets_max + 1, n_assets_range[1]), size=(1,))[0]
asset_ixs = torch.multinomial(torch.from_numpy(np.array(range(n_assets_max), dtype='float')), n_assets.item())
asset_ixs = torch.multinomial(torch.ones(n_assets_max), n_assets.item(), replacement=False)
else:
pass

Expand Down Expand Up @@ -195,35 +395,43 @@ class FlexibleDataLoader(torch.utils.data.DataLoader):
List of indices to consider from the provided `dataset` which is inherently ordered. If None then considering
all the samples.
n_assets_range : tuple
Minimum and maximum (only left included) number of assets that are randomly subselected. Ignored if `asset_ixs`
specified.
n_assets_range : tuple or None
Only used if `asset_ixs` is None. Minimum and maximum (only left included) number of assets that are randomly
subselected.
lookback_range : tuple
Minimum and maximum (only left included) of the lookback that is randomly selected.
lookback_range : tuple or None
Minimum and maximum (only left included) of the lookback that is uniformly sampled. If not specified then using
`(2, dataset.lookback + 1)` which is the biggest range.
horizon_range : tuple
Minimum and maximum (only left included) of the horizon that is randomly selected.
Minimum and maximum (only left included) of the horizon that is uniformly sampled. If not specified then using
`(2, dataset.horizon + 1)` which is the biggest range.
asset_ixs : None or list
If None, then `n_assets` sampled randomly. If ``list`` then it represents the indices of desired assets - no
randomness and `n_assets_range` is not used.
If None, and `n_assets_range` specified then `n_assets` sampled randomly based on `n_assets_range`.
If ``list`` then it represents the indices of desired assets - no randomness.
If both `asset_ixs` and `n_assets_range` are None then `asset_ixs` automatically assumed to be all possible
indices.
scaler : None or {'standard', 'percent'}
If None then no scaling applied. If string then a specific scaling theme. Only applied to X_batch.
"""

def __init__(self, dataset, indices=None, n_assets_range=(5, 10), lookback_range=(3, 20), horizon_range=(3, 15),
def __init__(self, dataset, indices=None, n_assets_range=None, lookback_range=None, horizon_range=None,
asset_ixs=None, scaler=None, **kwargs):

if n_assets_range is not None and asset_ixs is not None:
raise ValueError('One cannot specify both n_assets_range and asset_ixs')

# checks
if not (2 <= n_assets_range[0] <= n_assets_range[1] <= dataset.n_assets + 1):
if n_assets_range is not None and not (2 <= n_assets_range[0] <= n_assets_range[1] <= dataset.n_assets + 1):
raise ValueError('Invalid n_assets_range.')

if not (2 <= lookback_range[0] <= lookback_range[1] <= dataset.lookback + 1):
if lookback_range is not None and not (2 <= lookback_range[0] <= lookback_range[1] <= dataset.lookback + 1):
raise ValueError('Invalid lookback_range.')

if not (1 <= horizon_range[0] <= horizon_range[1] <= dataset.horizon + 1):
if horizon_range is not None and not (2 <= horizon_range[0] <= horizon_range[1] <= dataset.horizon + 1):
raise ValueError('Invalid horizon_range.')

if indices is not None and not (0 <= min(indices) <= max(indices) <= len(dataset) - 1):
Expand All @@ -232,19 +440,24 @@ def __init__(self, dataset, indices=None, n_assets_range=(5, 10), lookback_range
self.dataset = dataset
self.indices = indices if indices is not None else list(range(len(dataset)))
self.n_assets_range = n_assets_range
self.lookback_range = lookback_range
self.horizon_range = horizon_range
self.asset_ixs = asset_ixs
self.lookback_range = lookback_range if lookback_range is not None else (2, dataset.lookback + 1)
self.horizon_range = horizon_range if horizon_range is not None else (2, dataset.horizon + 1)

if n_assets_range is None and asset_ixs is None:
self.asset_ixs = list(range(len(dataset.asset_names)))
else:
self.asset_ixs = asset_ixs

self.scaler = scaler

super().__init__(dataset,
collate_fn=partial(collate_uniform,
n_assets_range=n_assets_range,
lookback_range=lookback_range,
horizon_range=horizon_range,
asset_ixs=asset_ixs,
scaler=scaler),
sampler=torch.utils.data.SubsetRandomSampler(indices),
n_assets_range=self.n_assets_range,
lookback_range=self.lookback_range,
horizon_range=self.horizon_range,
asset_ixs=self.asset_ixs,
scaler=self.scaler),
sampler=torch.utils.data.SubsetRandomSampler(self.indices),
batch_sampler=None,
shuffle=False,
drop_last=False,
Expand All @@ -254,16 +467,16 @@ def __init__(self, dataset, indices=None, n_assets_range=(5, 10), lookback_range
def hparams(self):
"""Generate dictionary of relevant parameters."""
return {
'lookback_range': self.lookback_range,
'horizon_range': self.horizon_range,
'lookback_range': str(self.lookback_range),
'horizon_range': str(self.horizon_range),
'batch_size': self.batch_size}


class RigidDataLoader(torch.utils.data.DataLoader):
"""Rigid data loader.
Rigid data loader is well suited for validation purposes since all horizon, lookback and assets are frozen.
However, it is not good for training since it enforces the user to choose a single setup.
However, it might not be that good for training since it enforces the user to choose a single setup.
Parameters
----------
Expand Down
Loading

0 comments on commit 422d4f3

Please sign in to comment.