Skip to content

Commit

Permalink
Write documentation (jankrepl#26)
Browse files Browse the repository at this point in the history
  • Loading branch information
jankrepl authored May 21, 2020
1 parent 255e533 commit 422d4f3
Show file tree
Hide file tree
Showing 39 changed files with 2,667 additions and 137 deletions.
4 changes: 3 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,15 @@ python:
- "3.7"

install:
- pip install .[dev]
- pip install .[dev,docs]

script:
- pip freeze
- pytest
- flake8 deepdow
- flake8 tests
- pydocstyle deepdow
- sphinx-build -b doctest -D "autodoc_mock_imports=[]" docs/ temp_doctest

after_success:
- codecov
275 changes: 244 additions & 31 deletions deepdow/data.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""Collection of functions related to data."""
from functools import partial

import numpy as np
import torch


Expand Down Expand Up @@ -41,6 +40,196 @@ def scale_features(X, approach='standard'):
return X_scaled


class Compose:
    """Meta transform chaining several transforms together, inspired by torchvision.

    Parameters
    ----------
    transforms : list
        List of callables that represent transforms to be composed.
    """

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, X_sample, y_sample, timestamps_sample, asset_names):
        """Apply all transforms sequentially, feeding each one the output of the previous.

        Parameters
        ----------
        X_sample : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)`.
        y_sample : torch.Tensor
            Target vector of shape `(n_channels, horizon, n_assets)`.
        timestamps_sample : datetime
            Time stamp of the sample.
        asset_names
            Asset names corresponding to the last channel of `X_sample` and `y_sample`.

        Returns
        -------
        X_sample_new : torch.Tensor
            Transformed version of `X_sample`.
        y_sample_new : torch.Tensor
            Transformed version of `y_sample`.
        timestamps_sample_new : datetime
            Transformed version of `timestamps_sample`.
        asset_names_new
            Transformed version of `asset_names`.
        """
        # Thread the full 4-tuple through every transform in order.
        sample = (X_sample, y_sample, timestamps_sample, asset_names)
        for transform in self.transforms:
            sample = transform(*sample)

        return sample


class Dropout:
    """Set random elements of the input to zero with probability p.

    Parameters
    ----------
    p : float
        Probability of setting an element to zero.
    training : bool
        If False, then dropout is disabled no matter what `p` is.
    """

    def __init__(self, p=0.2, training=True):
        self.p = p
        self.training = training

    def __call__(self, X_sample, y_sample, timestamps_sample, asset_names):
        """Zero out random entries of the features; targets and metadata pass through.

        Parameters
        ----------
        X_sample : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)`.
        y_sample : torch.Tensor
            Target vector of shape `(n_channels, horizon, n_assets)`.
        timestamps_sample : datetime
            Time stamp of the sample.
        asset_names
            Asset names corresponding to the last channel of `X_sample` and `y_sample`.

        Returns
        -------
        X_sample_new : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)` with some elements set to zero.
        y_sample : torch.Tensor
            Same as input.
        timestamps_sample : datetime
            Same as input.
        asset_names
            Same as input.
        """
        # Delegate to torch's functional dropout (handles scaling by 1/(1-p) in training mode).
        dropped = torch.nn.functional.dropout(X_sample,
                                              p=self.p,
                                              training=self.training)

        return dropped, y_sample, timestamps_sample, asset_names


class Multiply:
    """Transform multiplying the feature tensor X with a constant.

    Parameters
    ----------
    c : float
        Constant that the features are multiplied by.
    """

    def __init__(self, c=100):
        self.c = c

    def __call__(self, X_sample, y_sample, timestamps_sample, asset_names):
        """Scale the features by `self.c`; targets and metadata pass through unchanged.

        Parameters
        ----------
        X_sample : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)`.
        y_sample : torch.Tensor
            Target vector of shape `(n_channels, horizon, n_assets)`.
        timestamps_sample : datetime
            Time stamp of the sample.
        asset_names
            Asset names corresponding to the last channel of `X_sample` and `y_sample`.

        Returns
        -------
        X_sample_new : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)` multiplied by the constant `self.c`.
        y_sample : torch.Tensor
            Same as input.
        timestamps_sample : datetime
            Same as input.
        asset_names
            Same as input.
        """
        X_sample_new = self.c * X_sample

        return X_sample_new, y_sample, timestamps_sample, asset_names


class Noise:
    """Add Gaussian noise to each of the channels.

    Random (Gaussian) noise is added to the original features X. One can control the standard
    deviation of the noise via the `frac` parameter. Mathematically,
    `std(X_noise) = std(X) * frac` for each channel.

    Parameters
    ----------
    frac : float
        Ratio between the standard deviation of the added noise and that of each channel of X.
    """

    def __init__(self, frac=0.2):
        self.frac = frac

    def __call__(self, X_sample, y_sample, timestamps_sample, asset_names):
        """Add per-channel scaled Gaussian noise to the features; the rest passes through.

        Parameters
        ----------
        X_sample : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)`.
        y_sample : torch.Tensor
            Target vector of shape `(n_channels, horizon, n_assets)`.
        timestamps_sample : datetime
            Time stamp of the sample.
        asset_names
            Asset names corresponding to the last channel of `X_sample` and `y_sample`.

        Returns
        -------
        X_sample_new : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)` with added noise.
        y_sample : torch.Tensor
            Same as input.
        timestamps_sample : datetime
            Same as input.
        asset_names
            Same as input.
        """
        # Per-channel std over the (lookback, asset) dimensions scales the unit Gaussian noise.
        noise = self.frac * X_sample.std([1, 2], keepdim=True) * torch.randn_like(X_sample)

        return X_sample + noise, y_sample, timestamps_sample, asset_names


class InRAMDataset(torch.utils.data.Dataset):
"""Dataset that lives entirely in RAM.
Expand All @@ -57,9 +246,12 @@ class InRAMDataset(torch.utils.data.Dataset):
asset_names : None or array-like
If not None then of shape `(n_assets, )` representing the names of assets.
transform : None or callable
If provided, then a callable that transforms a single sample.
"""

def __init__(self, X, y, timestamps=None, asset_names=None):
def __init__(self, X, y, timestamps=None, asset_names=None, transform=None):
"""Construct."""
# checks
if len(X) != len(y):
Expand All @@ -75,6 +267,7 @@ def __init__(self, X, y, timestamps=None, asset_names=None):
self.y = y
self.timestamps = list(range(len(X))) if timestamps is None else timestamps
self.asset_names = ['a_{}'.format(i) for i in range(X.shape[-1])] if asset_names is None else asset_names
self.transform = transform

# utility
self.n_channels, self.lookback, self.n_assets = X.shape[1:]
Expand All @@ -89,11 +282,18 @@ def __getitem__(self, ix):
X_sample = torch.from_numpy(self.X[ix])
y_sample = torch.from_numpy(self.y[ix])
timestamps_sample = self.timestamps[ix]
asset_names = self.asset_names

if self.transform:
X_sample, y_sample, timestamps_sample, asset_names = self.transform(X_sample,
y_sample,
timestamps_sample,
asset_names)

return X_sample, y_sample, timestamps_sample, self.asset_names
return X_sample, y_sample, timestamps_sample, asset_names


def collate_uniform(batch, n_assets_range=(5, 10), lookback_range=(1, 20), horizon_range=(3, 15), asset_ixs=None,
def collate_uniform(batch, n_assets_range=(5, 10), lookback_range=(2, 20), horizon_range=(3, 15), asset_ixs=None,
random_state=None, scaler=None):
"""Create batch of samples.
Expand Down Expand Up @@ -144,10 +344,10 @@ def collate_uniform(batch, n_assets_range=(5, 10), lookback_range=(1, 20), horiz
if asset_ixs is None and not n_assets_range[1] > n_assets_range[0] >= 1:
raise ValueError('Incorrect number of assets range.')

if not lookback_range[1] > lookback_range[0] >= 1:
if not lookback_range[1] > lookback_range[0] >= 2:
raise ValueError('Incorrect lookback range.')

if not horizon_range[1] > horizon_range[0] >= 1:
if not horizon_range[1] > horizon_range[0] >= 2:
raise ValueError('Incorrect horizon range.')

if random_state is not None:
Expand All @@ -159,7 +359,7 @@ def collate_uniform(batch, n_assets_range=(5, 10), lookback_range=(1, 20), horiz
# sample assets
if asset_ixs is None:
n_assets = torch.randint(low=n_assets_range[0], high=min(n_assets_max + 1, n_assets_range[1]), size=(1,))[0]
asset_ixs = torch.multinomial(torch.from_numpy(np.array(range(n_assets_max), dtype='float')), n_assets.item())
asset_ixs = torch.multinomial(torch.ones(n_assets_max), n_assets.item(), replacement=False)
else:
pass

Expand Down Expand Up @@ -195,35 +395,43 @@ class FlexibleDataLoader(torch.utils.data.DataLoader):
List of indices to consider from the provided `dataset` which is inherently ordered. If None then considering
all the samples.
n_assets_range : tuple
Minimum and maximum (only left included) number of assets that are randomly subselected. Ignored if `asset_ixs`
specified.
n_assets_range : tuple or None
Only used if `asset_ixs` is None. Minimum and maximum (only left included) number of assets that are randomly
subselected.
lookback_range : tuple
Minimum and maximum (only left included) of the lookback that is randomly selected.
lookback_range : tuple or None
Minimum and maximum (only left included) of the lookback that is uniformly sampled. If not specified then using
`(2, dataset.lookback + 1)` which is the biggest range.
horizon_range : tuple
Minimum and maximum (only left included) of the horizon that is randomly selected.
Minimum and maximum (only left included) of the horizon that is uniformly sampled. If not specified then using
`(2, dataset.horizon + 1)` which is the biggest range.
asset_ixs : None or list
If None, then `n_assets` sampled randomly. If ``list`` then it represents the indices of desired assets - no
randomness and `n_assets_range` is not used.
If None, and `n_assets_range` specified then `n_assets` sampled randomly based on `n_assets_range`.
If ``list`` then it represents the indices of desired assets - no randomness.
If both `asset_ixs` and `n_assets_range` are None then `asset_ixs` automatically assumed to be all possible
indices.
scaler : None or {'standard', 'percent'}
If None then no scaling applied. If string then a specific scaling theme. Only applied to X_batch.
"""

def __init__(self, dataset, indices=None, n_assets_range=(5, 10), lookback_range=(3, 20), horizon_range=(3, 15),
def __init__(self, dataset, indices=None, n_assets_range=None, lookback_range=None, horizon_range=None,
asset_ixs=None, scaler=None, **kwargs):

if n_assets_range is not None and asset_ixs is not None:
raise ValueError('One cannot specify both n_assets_range and asset_ixs')

# checks
if not (2 <= n_assets_range[0] <= n_assets_range[1] <= dataset.n_assets + 1):
if n_assets_range is not None and not (2 <= n_assets_range[0] <= n_assets_range[1] <= dataset.n_assets + 1):
raise ValueError('Invalid n_assets_range.')

if not (2 <= lookback_range[0] <= lookback_range[1] <= dataset.lookback + 1):
if lookback_range is not None and not (2 <= lookback_range[0] <= lookback_range[1] <= dataset.lookback + 1):
raise ValueError('Invalid lookback_range.')

if not (1 <= horizon_range[0] <= horizon_range[1] <= dataset.horizon + 1):
if horizon_range is not None and not (2 <= horizon_range[0] <= horizon_range[1] <= dataset.horizon + 1):
raise ValueError('Invalid horizon_range.')

if indices is not None and not (0 <= min(indices) <= max(indices) <= len(dataset) - 1):
Expand All @@ -232,19 +440,24 @@ def __init__(self, dataset, indices=None, n_assets_range=(5, 10), lookback_range
self.dataset = dataset
self.indices = indices if indices is not None else list(range(len(dataset)))
self.n_assets_range = n_assets_range
self.lookback_range = lookback_range
self.horizon_range = horizon_range
self.asset_ixs = asset_ixs
self.lookback_range = lookback_range if lookback_range is not None else (2, dataset.lookback + 1)
self.horizon_range = horizon_range if horizon_range is not None else (2, dataset.horizon + 1)

if n_assets_range is None and asset_ixs is None:
self.asset_ixs = list(range(len(dataset.asset_names)))
else:
self.asset_ixs = asset_ixs

self.scaler = scaler

super().__init__(dataset,
collate_fn=partial(collate_uniform,
n_assets_range=n_assets_range,
lookback_range=lookback_range,
horizon_range=horizon_range,
asset_ixs=asset_ixs,
scaler=scaler),
sampler=torch.utils.data.SubsetRandomSampler(indices),
n_assets_range=self.n_assets_range,
lookback_range=self.lookback_range,
horizon_range=self.horizon_range,
asset_ixs=self.asset_ixs,
scaler=self.scaler),
sampler=torch.utils.data.SubsetRandomSampler(self.indices),
batch_sampler=None,
shuffle=False,
drop_last=False,
Expand All @@ -254,16 +467,16 @@ def __init__(self, dataset, indices=None, n_assets_range=(5, 10), lookback_range
def hparams(self):
"""Generate dictionary of relevant parameters."""
return {
'lookback_range': self.lookback_range,
'horizon_range': self.horizon_range,
'lookback_range': str(self.lookback_range),
'horizon_range': str(self.horizon_range),
'batch_size': self.batch_size}


class RigidDataLoader(torch.utils.data.DataLoader):
"""Rigid data loader.
Rigid data loader is well suited for validation purposes since all horizon, lookback and assets are frozen.
However, it is not good for training since it enforces the user to choose a single setup.
However, it might not be that good for training since it enforces the user to choose a single setup.
Parameters
----------
Expand Down
Loading

0 comments on commit 422d4f3

Please sign in to comment.