Fix BagOfWords and SAX-VSM with constant subsequences (johannfaouzi#108)
johannfaouzi authored Aug 26, 2021
1 parent 37300b1 commit 77d9635
Showing 8 changed files with 195 additions and 69 deletions.
4 changes: 0 additions & 4 deletions azure-pipelines.yml
@@ -96,10 +96,6 @@ jobs:
       bash <(curl -s https://codecov.io/bash)
     displayName: 'Run test suite and publish results'
-  - script: |
-      bash <(curl -s https://codecov.io/bash)
-    displayName: 'Publish code coverage results'
   - job: MacOS_JIT
     pool:
       vmImage: 'macOS-latest'
6 changes: 3 additions & 3 deletions examples/image/plot_single_gaf.py
@@ -4,9 +4,9 @@
 ============================
 A Gramian angular field is an image obtained from a time series, representing
-some kind of temporal correlation between the values at each time point. Two
-methods are available: Gramian angular summation field and Gramian angular
-difference field.
+some kind of temporal correlation between each pair of values from the time
+series. Two methods are available: Gramian angular summation field and Gramian
+angular difference field.
 It is implemented as :class:`pyts.image.GramianAngularField`.
 In this example, the considered time series is the sequence of the sine
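For context, a minimal sketch of how the class this example documents is used. The plotting details of the full example are omitted, and the sine series here is a stand-in; the constructor arguments shown are the documented ones:

    import numpy as np
    from pyts.image import GramianAngularField

    # A sine wave as a toy univariate time series, shape (n_samples, n_timestamps).
    X = np.sin(np.linspace(0, 4 * np.pi, 100)).reshape(1, -1)

    # method='summation' yields the GASF image, method='difference' the GADF image.
    gasf = GramianAngularField(method='summation')
    gadf = GramianAngularField(method='difference')
    X_gasf = gasf.fit_transform(X)  # shape (1, 100, 100)
    X_gadf = gadf.fit_transform(X)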
13 changes: 11 additions & 2 deletions pyts/approximation/sax.py
@@ -27,6 +27,11 @@ class SymbolicAggregateApproximation(BaseEstimator,
     - 'quantile': All bins in each sample have the same number of points
     - 'normal': Bin edges are quantiles from a standard normal distribution

+    raise_warning : bool (default = True)
+        If True, a warning is raised when the number of bins is smaller for
+        at least one sample. In this case, you should consider decreasing the
+        number of bins or removing these samples.
+
     alphabet : None, 'ordinal' or array-like, shape = (n_bins,)
         Alphabet to use. If None, the first `n_bins` letters of the Latin
         alphabet are used. If 'ordinal', integers are used.
@@ -49,9 +54,11 @@ class SymbolicAggregateApproximation(BaseEstimator,
     """

-    def __init__(self, n_bins=4, strategy='quantile', alphabet=None):
+    def __init__(self, n_bins=4, strategy='quantile', raise_warning=True,
+                 alphabet=None):
         self.n_bins = n_bins
         self.strategy = strategy
+        self.raise_warning = raise_warning
         self.alphabet = alphabet

     def fit(self, X=None, y=None):
@@ -85,7 +92,9 @@ def transform(self, X):
         n_timestamps = X.shape[1]
         alphabet = self._check_params(n_timestamps)
         discretizer = KBinsDiscretizer(
-            n_bins=self.n_bins, strategy=self.strategy)
+            n_bins=self.n_bins, strategy=self.strategy,
+            raise_warning=self.raise_warning
+        )
         indices = discretizer.fit_transform(X)
         if isinstance(alphabet, str):
             return indices
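The new flag only controls the warning emitted by the underlying KBinsDiscretizer; the transformation itself is unchanged. A hedged usage sketch, with made-up input values (whether the warning actually fires depends on the computed bin edges):

    import numpy as np
    from pyts.approximation import SymbolicAggregateApproximation

    # Repeated values can make quantile-based bin edges collapse.
    X = np.array([[0., 0., 0., 0., 1., 2., 3., 4.]])

    # raise_warning=False silences the "fewer bins than requested" warning
    # that the discretizer may emit for such samples.
    sax = SymbolicAggregateApproximation(n_bins=4, strategy='quantile',
                                         raise_warning=False)
    print(sax.fit_transform(X))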
86 changes: 72 additions & 14 deletions pyts/bag_of_words/bow.py
@@ -8,11 +8,11 @@
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.pipeline import make_pipeline
 from sklearn.utils.validation import check_array
-import warnings
 from ..approximation import (
     PiecewiseAggregateApproximation, SymbolicAggregateApproximation)
 from ..base import UnivariateTransformerMixin
-from ..preprocessing import StandardScaler
+from ..preprocessing import KBinsDiscretizer, StandardScaler
+from ..preprocessing.discretizer import _digitize
 from ..utils.utils import _windowed_view


@@ -189,6 +189,11 @@ class BagOfWords(BaseEstimator, TransformerMixin):
         the size of each time series and must be between 0 and 1. The window
         size will be computed as ``ceil(window_step * n_timestamps)``.

+    threshold_std : float (default = 0.01)
+        Threshold used to determine whether a subsequence is standardized.
+        Subsequences whose standard deviations are lower than this threshold
+        are not standardized.
+
     norm_mean : bool (default = True)
         If True, center each subseries before scaling.

@@ -201,6 +206,12 @@ class BagOfWords(BaseEstimator, TransformerMixin):
         algorithm. If False, each time point belong to one single bin, but
         the size of the bins may vary.

+    raise_warning : bool (default = False)
+        If True, a warning is raised when the number of bins is smaller for
+        at least one subsequence. In this case, you should consider decreasing
+        the number of bins, using another strategy to compute the bins or
+        removing the corresponding time series.
+
     alphabet : None or array-like, shape = (n_bins,)
         Alphabet to use. If None, the first `n_bins` letters of the Latin
         alphabet are used.
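The threshold_std check is what makes constant subsequences safe: a subsequence with (near-)zero standard deviation cannot be standardized without dividing by zero. A small numpy sketch of the test applied later in transform, with made-up window values:

    import numpy as np

    # Three subsequences of length 4; the first one is constant.
    X_window = np.array([[20., 20., 20., 20.],
                         [20., 20., 20., 27.],
                         [27., 28., 29., 30.]])

    threshold_std = 0.01
    idx = np.std(X_window, axis=1) < threshold_std
    print(idx)  # [ True False False] -> only the first window skips standardization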
Expand Down Expand Up @@ -228,24 +239,21 @@ class BagOfWords(BaseEstimator, TransformerMixin):

def __init__(self, window_size=0.5, word_size=0.5, n_bins=4,
strategy='normal', numerosity_reduction=True, window_step=1,
norm_mean=True, norm_std=True, overlapping=True,
alphabet=None):
threshold_std=0.01, norm_mean=True, norm_std=True,
overlapping=True, raise_warning=False, alphabet=None):
self.window_size = window_size
self.word_size = word_size
self.n_bins = n_bins
self.strategy = strategy
self.numerosity_reduction = numerosity_reduction
self.window_step = window_step
self.threshold_std = threshold_std
self.norm_mean = norm_mean
self.norm_std = norm_std
self.overlapping = overlapping
self.raise_warning = raise_warning
self.alphabet = alphabet

warnings.warn("BagOfWords has been reworked in 0.11 in order to match "
"its definition in the literature. To get the old "
"BagOfWords, use pyts.bag_of_words.WordExtractor "
"instead.", FutureWarning)

def fit(self, X, y=None):
"""Pass.
@@ -284,12 +292,48 @@ def transform(self, X):
             n_timestamps)
         n_windows = (n_timestamps - window_size + window_step) // window_step

+        # Standardize time series if quantile from standard normal distribution
+        if self.strategy == 'normal':
+            X_scaled = StandardScaler().transform(X)
+        else:
+            X_scaled = X
+
+        # Extract subsequences using a sliding window
         X_window = _windowed_view(
-            X, n_samples, n_timestamps, window_size, window_step
+            X_scaled, n_samples, n_timestamps, window_size, window_step
         ).reshape(n_samples * n_windows, window_size)

-        # Create a pipeline with three steps: standardization, PAA, SAX
+        # Identify subsequences whose standard deviation is below the threshold
+        idx = np.std(X_window, axis=1) < self.threshold_std
+
+        if np.any(idx):
+            # Subsequences with standard deviations below threshold
+            X_paa = PiecewiseAggregateApproximation(
+                window_size=None, output_size=word_size,
+                overlapping=self.overlapping
+            ).transform(X_window[idx])
+
+            # Compute the bin edges
+            discretizer = KBinsDiscretizer(
+                n_bins=self.n_bins, strategy=self.strategy,
+                raise_warning=self.raise_warning
+            )
+            bin_edges = discretizer._compute_bins(X_scaled, n_samples,
+                                                  self.n_bins, self.strategy)
+
+            # Tile the bin edges for each subsequence from the same time series
+            if self.strategy != 'normal':
+                count = np.bincount(
+                    np.floor_divide(np.nonzero(idx)[0], n_windows)
+                )
+                bin_edges = np.vstack([
+                    np.tile(bin_edges[i], (count[i], 1))
+                    for i in range(count.size) if count[i] != 0
+                ])
+
+            X_sax_below_thresh = alphabet[_digitize(X_paa, bin_edges)]
+
+        # Subsequences with standard deviations above threshold
         pipeline = make_pipeline(
             StandardScaler(
                 with_mean=self.norm_mean, with_std=self.norm_std
@@ -300,11 +344,19 @@ def transform(self, X):
             ),
             SymbolicAggregateApproximation(
                 n_bins=self.n_bins, strategy=self.strategy,
-                alphabet=self.alphabet
+                alphabet=self.alphabet, raise_warning=self.raise_warning
             )
         )
-        X_sax = pipeline.fit_transform(X_window).reshape(
-            n_samples, n_windows, word_size)
+        X_sax_above_thresh = pipeline.fit_transform(X_window[~idx])
+
+        # Concatenate SAX words
+        if np.any(idx):
+            X_sax = np.empty((n_samples * n_windows, word_size), dtype='<U1')
+            X_sax[idx] = X_sax_below_thresh
+            X_sax[~idx] = X_sax_above_thresh
+        else:
+            X_sax = X_sax_above_thresh
+        X_sax = X_sax.reshape(n_samples, n_windows, word_size)

         # Join letters to make words
         X_word = np.asarray([[''.join(X_sax[i, j])
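The tiling step above maps each low-variance subsequence back to the time series it was cut from, so that it is digitized with that series' own bin edges. A sketch of the index arithmetic with made-up numbers (n_bins = 4, hence three edges per series):

    import numpy as np

    n_windows = 3
    # Flags for 9 subsequences drawn from 3 series (3 windows each).
    idx = np.array([False, True, True, False, False, False, True, False, False])

    # Flat subsequence index // n_windows recovers the source series index.
    source = np.floor_divide(np.nonzero(idx)[0], n_windows)  # [0, 0, 2]
    count = np.bincount(source)                              # [2, 0, 1]

    bin_edges = np.array([[0.25, 0.50, 0.75],
                          [1.25, 1.50, 1.75],
                          [2.25, 2.50, 2.75]])

    # One copy of a series' edges per flagged subsequence from that series.
    tiled = np.vstack([np.tile(bin_edges[i], (count[i], 1))
                       for i in range(count.size) if count[i] != 0])
    print(tiled)  # rows for series 0, series 0, series 2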
@@ -396,6 +448,12 @@ def _check_params(self, n_timestamps):
             )
         window_step = ceil(self.window_step * n_timestamps)

+        if not isinstance(self.threshold_std, (float, np.floating)):
+            raise TypeError("'threshold_std' must be a float.")
+        if not self.threshold_std >= 0.:
+            raise ValueError("'threshold_std' must be non-negative "
+                             "(got {0}).".format(self.threshold_std))
+
         if not ((self.alphabet is None)
                 or (isinstance(self.alphabet, (list, tuple, np.ndarray)))):
             raise TypeError("'alphabet' must be None or array-like "
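Taken together, the reworked transform handles the case that motivated this fix. A sketch mirroring the data added to the test suite below; the expected words are the ones asserted by the new tests, not independently verified here:

    import numpy as np
    from pyts.bag_of_words import BagOfWords

    # The third series is constant over its first eight values, which
    # previously made standardization of those windows fail.
    X = np.arange(30).reshape(3, 10)
    X[2, :8] = 20

    bow = BagOfWords(window_size=8, word_size=4)
    print(bow.fit_transform(X))
    # Per the updated tests: ['abcd', 'abcd', 'bbbb bbbd']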
46 changes: 31 additions & 15 deletions pyts/bag_of_words/tests/test_bow.py
@@ -67,7 +67,8 @@ def test_actual_results_word_extractor(params, arr_desired):

 # ######################### Tests for BagOfWords #########################

-X_bow = np.arange(20).reshape(2, 10)
+X_bow = np.arange(30).reshape(3, 10)
+X_bow[2, :8] = 20


 @pytest.mark.parametrize(
@@ -123,6 +124,14 @@ def test_actual_results_word_extractor(params, arr_desired):
      "If 'window_step' is a float, it must be greater than 0 and lower "
      "than or equal to 1 (got {0}).".format(2.)),

+    ({'window_size': 6, 'word_size': 4, 'n_bins': 2, 'threshold_std': '-1'},
+     TypeError,
+     "'threshold_std' must be a float."),
+
+    ({'window_size': 6, 'word_size': 4, 'n_bins': 2, 'threshold_std': -1.},
+     ValueError,
+     "'threshold_std' must be non-negative (got -1.0)."),
+
     ({'window_size': 6, 'word_size': 4, 'n_bins': 2, 'alphabet': 'whoops'},
      TypeError,
      "'alphabet' must be None or array-like with shape (n_bins,) "
@@ -140,33 +149,40 @@ def test_parameter_check_bag_of_words(params, error, err_msg):


 @pytest.mark.parametrize(
-    'params, arr_desired',
-    [({'window_size': 6, 'word_size': 4}, ['abcd', 'abcd']),
+    'params, X, arr_desired',
+    [({'window_size': 8, 'word_size': 4}, X_bow,
+      ['abcd', 'abcd', 'bbbb bbbd']),
+     ({'window_size': 8, 'word_size': 4, 'numerosity_reduction': False},
+      X_bow, ['abcd abcd abcd', 'abcd abcd abcd', 'bbbb bbbd bbbd']),
+     ({'window_size': 8, 'word_size': 4, 'strategy': 'uniform'}, X_bow,
+      ['abcd', 'abcd', 'aaaa aaad']),
-     ({'window_size': 6, 'word_size': 4, 'numerosity_reduction': False},
-      ['abcd abcd abcd abcd abcd', 'abcd abcd abcd abcd abcd']),
+     ({'window_size': 8, 'word_size': 4, 'strategy': 'quantile'}, X_bow,
+      ['abcd', 'abcd', 'aaaa aaac']),
-     ({'window_size': 6, 'word_size': 4, 'alphabet': ['y', 'o', 'l', 'o']},
-      ['yolo', 'yolo']),
+     ({'window_size': 8, 'word_size': 4, 'alphabet': ['y', 'o', 'l', 'o']},
+      X_bow, ['yolo', 'yolo', 'oooo']),
-     ({'window_size': 0.5, 'word_size': 4}, ['abcd', 'abcd']),
+     ({'window_size': 0.5, 'word_size': 4}, X_bow[:2], ['abcd', 'abcd']),
      ({'window_size': 4, 'word_size': 1., 'numerosity_reduction': False},
-      ['abcd abcd abcd abcd abcd abcd abcd',
-       'abcd abcd abcd abcd abcd abcd abcd']),
+      X_bow[:2], ['abcd abcd abcd abcd abcd abcd abcd',
+                  'abcd abcd abcd abcd abcd abcd abcd']),
      ({'window_size': 4, 'word_size': 4, 'window_step': 2,
       'numerosity_reduction': False},
-      ['abcd abcd abcd abcd', 'abcd abcd abcd abcd']),
+      X_bow[:2], ['abcd abcd abcd abcd', 'abcd abcd abcd abcd']),
      ({'window_size': 4, 'word_size': 4, 'window_step': 0.2,
       'numerosity_reduction': False},
-      ['abcd abcd abcd abcd', 'abcd abcd abcd abcd']),
+      X_bow[:2], ['abcd abcd abcd abcd', 'abcd abcd abcd abcd']),
      ({'window_size': 4, 'word_size': 4, 'window_step': 0.5},
-      ['abcd', 'abcd'])]
+      X_bow[:2], ['abcd', 'abcd'])]
 )
-def test_actual_results_bag_of_words(params, arr_desired):
+def test_actual_results_bag_of_words(params, X, arr_desired):
     """Test that the actual results are the expected ones."""
-    arr_actual = BagOfWords(**params).fit_transform(X_bow)
+    arr_actual = BagOfWords(**params).fit_transform(X)
     np.testing.assert_array_equal(arr_actual, arr_desired)
19 changes: 13 additions & 6 deletions pyts/classification/saxvsm.py
@@ -55,6 +55,11 @@ class SAXVSM(BaseEstimator, UnivariateClassifierMixin):
         sliding window will be computed as
         ``ceil(window_step * n_timestamps)``.

+    threshold_std : float (default = 0.01)
+        Threshold used to determine whether a subsequence is standardized.
+        Subsequences whose standard deviations are lower than this threshold
+        are not standardized.
+
     norm_mean : bool (default = True)
         If True, center each subseries before scaling.
@@ -112,20 +117,22 @@ class SAXVSM(BaseEstimator, UnivariateClassifierMixin):
     >>> clf.fit(X_train, y_train)
     SAXVSM(...)
     >>> clf.score(X_test, y_test)
-    0.9933...
+    1.0
     """

     def __init__(self, window_size=0.5, word_size=0.5, n_bins=4,
                  strategy='normal', numerosity_reduction=True, window_step=1,
-                 norm_mean=True, norm_std=True, use_idf=True, smooth_idf=False,
-                 sublinear_tf=True, overlapping=True, alphabet=None):
+                 threshold_std=0.01, norm_mean=True, norm_std=True,
+                 use_idf=True, smooth_idf=False, sublinear_tf=True,
+                 overlapping=True, alphabet=None):
         self.window_size = window_size
         self.word_size = word_size
         self.n_bins = n_bins
         self.strategy = strategy
         self.numerosity_reduction = numerosity_reduction
         self.window_step = window_step
+        self.threshold_std = threshold_std
         self.norm_mean = norm_mean
         self.norm_std = norm_std
         self.use_idf = use_idf
@@ -162,9 +169,9 @@ def fit(self, X, y):
             window_size=self.window_size, word_size=self.word_size,
             n_bins=self.n_bins, strategy=self.strategy,
             numerosity_reduction=self.numerosity_reduction,
-            window_step=self.window_step, norm_mean=self.norm_mean,
-            norm_std=self.norm_std, overlapping=self.overlapping,
-            alphabet=self.alphabet
+            window_step=self.window_step, threshold_std=self.threshold_std,
+            norm_mean=self.norm_mean, norm_std=self.norm_std,
+            overlapping=self.overlapping, alphabet=self.alphabet
         )
         X_bow = bow.fit_transform(X)
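Since SAXVSM simply forwards threshold_std to BagOfWords in fit, the fix carries over to classification. A hypothetical sketch with synthetic data (the series values and labels are made up, not taken from the commit):

    import numpy as np
    from pyts.classification import SAXVSM

    rng = np.random.RandomState(42)
    X_train = rng.randn(12, 50)
    X_train[:3, :40] = 5.            # series with long constant stretches
    y_train = np.repeat([0, 1], 6)

    # threshold_std is passed through to the internal BagOfWords (see fit above).
    clf = SAXVSM(window_size=0.5, word_size=0.5, threshold_std=0.01)
    clf.fit(X_train, y_train)
    print(clf.predict(X_train[:2]))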
(Diff not shown for the remaining two changed files.)