Fix BagOfWords and SAX-VSM with constant subsequences (johannfaouzi#108)
johannfaouzi authored Aug 26, 2021
1 parent 37300b1 commit 77d9635
Showing 8 changed files with 195 additions and 69 deletions.
4 changes: 0 additions & 4 deletions azure-pipelines.yml
@@ -96,10 +96,6 @@ jobs:
       bash <(curl -s https://codecov.io/bash)
     displayName: 'Run test suite and publish results'
-  - script: |
-      bash <(curl -s https://codecov.io/bash)
-    displayName: 'Publish code coverage results'
   - job: MacOS_JIT
     pool:
       vmImage: 'macOS-latest'
6 changes: 3 additions & 3 deletions examples/image/plot_single_gaf.py
@@ -4,9 +4,9 @@
 ============================
 A Gramian angular field is an image obtained from a time series, representing
-some kind of temporal correlation between the values at each time point. Two
-methods are available: Gramian angular summation field and Gramian angular
-difference field.
+some kind of temporal correlation between each pair of values from the time
+series. Two methods are available: Gramian angular summation field and Gramian
+angular difference field.
 It is implemented as :class:`pyts.image.GramianAngularField`.
 In this example, the considered time series is the sequence of the sine
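For context, a minimal sketch of how the class this example documents is used. The plotting details of the full example are omitted, and the sine series here is a stand-in; the constructor arguments shown are the documented ones:

    import numpy as np
    from pyts.image import GramianAngularField

    # A sine wave as a toy univariate time series, shape (n_samples, n_timestamps).
    X = np.sin(np.linspace(0, 4 * np.pi, 100)).reshape(1, -1)

    # method='summation' yields the GASF image, method='difference' the GADF image.
    gasf = GramianAngularField(method='summation')
    gadf = GramianAngularField(method='difference')
    X_gasf = gasf.fit_transform(X)  # shape (1, 100, 100)
    X_gadf = gadf.fit_transform(X)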
13 changes: 11 additions & 2 deletions pyts/approximation/sax.py
@@ -27,6 +27,11 @@ class SymbolicAggregateApproximation(BaseEstimator,
     - 'quantile': All bins in each sample have the same number of points
     - 'normal': Bin edges are quantiles from a standard normal distribution

+    raise_warning : bool (default = True)
+        If True, a warning is raised when the number of bins is smaller for
+        at least one sample. In this case, you should consider decreasing the
+        number of bins or removing these samples.
+
     alphabet : None, 'ordinal' or array-like, shape = (n_bins,)
         Alphabet to use. If None, the first `n_bins` letters of the Latin
         alphabet are used. If 'ordinal', integers are used.
@@ -49,9 +54,11 @@ class SymbolicAggregateApproximation(BaseEstimator,
     """

-    def __init__(self, n_bins=4, strategy='quantile', alphabet=None):
+    def __init__(self, n_bins=4, strategy='quantile', raise_warning=True,
+                 alphabet=None):
         self.n_bins = n_bins
         self.strategy = strategy
+        self.raise_warning = raise_warning
         self.alphabet = alphabet

     def fit(self, X=None, y=None):
@@ -85,7 +92,9 @@ def transform(self, X):
         n_timestamps = X.shape[1]
         alphabet = self._check_params(n_timestamps)
         discretizer = KBinsDiscretizer(
-            n_bins=self.n_bins, strategy=self.strategy)
+            n_bins=self.n_bins, strategy=self.strategy,
+            raise_warning=self.raise_warning
+        )
         indices = discretizer.fit_transform(X)
         if isinstance(alphabet, str):
             return indices
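The new flag only controls the warning emitted by the underlying KBinsDiscretizer; the transformation itself is unchanged. A hedged usage sketch, with made-up input values (whether the warning actually fires depends on the computed bin edges):

    import numpy as np
    from pyts.approximation import SymbolicAggregateApproximation

    # Repeated values can make quantile-based bin edges collapse.
    X = np.array([[0., 0., 0., 0., 1., 2., 3., 4.]])

    # raise_warning=False silences the "fewer bins than requested" warning
    # that the discretizer may emit for such samples.
    sax = SymbolicAggregateApproximation(n_bins=4, strategy='quantile',
                                         raise_warning=False)
    print(sax.fit_transform(X))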
86 changes: 72 additions & 14 deletions pyts/bag_of_words/bow.py
@@ -8,11 +8,11 @@
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.pipeline import make_pipeline
 from sklearn.utils.validation import check_array
-import warnings
 from ..approximation import (
     PiecewiseAggregateApproximation, SymbolicAggregateApproximation)
 from ..base import UnivariateTransformerMixin
-from ..preprocessing import StandardScaler
+from ..preprocessing import KBinsDiscretizer, StandardScaler
+from ..preprocessing.discretizer import _digitize
 from ..utils.utils import _windowed_view


@@ -189,6 +189,11 @@ class BagOfWords(BaseEstimator, TransformerMixin):
         the size of each time series and must be between 0 and 1. The window
         size will be computed as ``ceil(window_step * n_timestamps)``.

+    threshold_std : float (default = 0.01)
+        Threshold used to determine whether a subsequence is standardized.
+        Subsequences whose standard deviations are lower than this threshold
+        are not standardized.
+
     norm_mean : bool (default = True)
         If True, center each subseries before scaling.

@@ -201,6 +206,12 @@ class BagOfWords(BaseEstimator, TransformerMixin):
         algorithm. If False, each time point belong to one single bin, but
         the size of the bins may vary.

+    raise_warning : bool (default = False)
+        If True, a warning is raised when the number of bins is smaller for
+        at least one subsequence. In this case, you should consider decreasing
+        the number of bins, using another strategy to compute the bins or
+        removing the corresponding time series.
+
     alphabet : None or array-like, shape = (n_bins,)
         Alphabet to use. If None, the first `n_bins` letters of the Latin
         alphabet are used.
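The threshold_std check is what makes constant subsequences safe: a subsequence with (near-)zero standard deviation cannot be standardized without dividing by zero. A small numpy sketch of the test applied later in transform, with made-up window values:

    import numpy as np

    # Three subsequences of length 4; the first one is constant.
    X_window = np.array([[20., 20., 20., 20.],
                         [20., 20., 20., 27.],
                         [27., 28., 29., 30.]])

    threshold_std = 0.01
    idx = np.std(X_window, axis=1) < threshold_std
    print(idx)  # [ True False False] -> only the first window skips standardization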
Expand Down Expand Up @@ -228,24 +239,21 @@ class BagOfWords(BaseEstimator, TransformerMixin):

def __init__(self, window_size=0.5, word_size=0.5, n_bins=4,
strategy='normal', numerosity_reduction=True, window_step=1,
norm_mean=True, norm_std=True, overlapping=True,
alphabet=None):
threshold_std=0.01, norm_mean=True, norm_std=True,
overlapping=True, raise_warning=False, alphabet=None):
self.window_size = window_size
self.word_size = word_size
self.n_bins = n_bins
self.strategy = strategy
self.numerosity_reduction = numerosity_reduction
self.window_step = window_step
self.threshold_std = threshold_std
self.norm_mean = norm_mean
self.norm_std = norm_std
self.overlapping = overlapping
self.raise_warning = raise_warning
self.alphabet = alphabet

warnings.warn("BagOfWords has been reworked in 0.11 in order to match "
"its definition in the literature. To get the old "
"BagOfWords, use pyts.bag_of_words.WordExtractor "
"instead.", FutureWarning)

def fit(self, X, y=None):
"""Pass.
@@ -284,12 +292,48 @@ def transform(self, X):
             n_timestamps)
         n_windows = (n_timestamps - window_size + window_step) // window_step

+        # Standardize time series if quantile from standard normal distribution
+        if self.strategy == 'normal':
+            X_scaled = StandardScaler().transform(X)
+        else:
+            X_scaled = X
+
+        # Extract subsequences using a sliding window
         X_window = _windowed_view(
-            X, n_samples, n_timestamps, window_size, window_step
+            X_scaled, n_samples, n_timestamps, window_size, window_step
         ).reshape(n_samples * n_windows, window_size)

-        # Create a pipeline with three steps: standardization, PAA, SAX
+        # Identify subsequences whose standard deviation is below the threshold
+        idx = np.std(X_window, axis=1) < self.threshold_std
+
+        if np.any(idx):
+            # Subsequences with standard deviations below threshold
+            X_paa = PiecewiseAggregateApproximation(
+                window_size=None, output_size=word_size,
+                overlapping=self.overlapping
+            ).transform(X_window[idx])
+
+            # Compute the bin edges
+            discretizer = KBinsDiscretizer(
+                n_bins=self.n_bins, strategy=self.strategy,
+                raise_warning=self.raise_warning
+            )
+            bin_edges = discretizer._compute_bins(X_scaled, n_samples,
+                                                  self.n_bins, self.strategy)
+
+            # Tile the bin edges for each subsequence from the same time series
+            if self.strategy != 'normal':
+                count = np.bincount(
+                    np.floor_divide(np.nonzero(idx)[0], n_windows)
+                )
+                bin_edges = np.vstack([
+                    np.tile(bin_edges[i], (count[i], 1))
+                    for i in range(count.size) if count[i] != 0
+                ])
+
+            X_sax_below_thresh = alphabet[_digitize(X_paa, bin_edges)]
+
+        # Subsequences with standard deviations above threshold
         pipeline = make_pipeline(
             StandardScaler(
                 with_mean=self.norm_mean, with_std=self.norm_std
@@ -300,11 +344,19 @@ def transform(self, X):
             ),
             SymbolicAggregateApproximation(
                 n_bins=self.n_bins, strategy=self.strategy,
-                alphabet=self.alphabet
+                alphabet=self.alphabet, raise_warning=self.raise_warning
             )
         )
-        X_sax = pipeline.fit_transform(X_window).reshape(
-            n_samples, n_windows, word_size)
+        X_sax_above_thresh = pipeline.fit_transform(X_window[~idx])
+
+        # Concatenate SAX words
+        if np.any(idx):
+            X_sax = np.empty((n_samples * n_windows, word_size), dtype='<U1')
+            X_sax[idx] = X_sax_below_thresh
+            X_sax[~idx] = X_sax_above_thresh
+        else:
+            X_sax = X_sax_above_thresh
+        X_sax = X_sax.reshape(n_samples, n_windows, word_size)

         # Join letters to make words
         X_word = np.asarray([[''.join(X_sax[i, j])
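The tiling step above maps each low-variance subsequence back to the time series it was cut from, so that it is digitized with that series' own bin edges. A sketch of the index arithmetic with made-up numbers (n_bins = 4, hence three edges per series):

    import numpy as np

    n_windows = 3
    # Flags for 9 subsequences drawn from 3 series (3 windows each).
    idx = np.array([False, True, True, False, False, False, True, False, False])

    # Flat subsequence index // n_windows recovers the source series index.
    source = np.floor_divide(np.nonzero(idx)[0], n_windows)  # [0, 0, 2]
    count = np.bincount(source)                              # [2, 0, 1]

    bin_edges = np.array([[0.25, 0.50, 0.75],
                          [1.25, 1.50, 1.75],
                          [2.25, 2.50, 2.75]])

    # One copy of a series' edges per flagged subsequence from that series.
    tiled = np.vstack([np.tile(bin_edges[i], (count[i], 1))
                       for i in range(count.size) if count[i] != 0])
    print(tiled)  # rows for series 0, series 0, series 2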
@@ -396,6 +448,12 @@ def _check_params(self, n_timestamps):
             )
         window_step = ceil(self.window_step * n_timestamps)

+        if not isinstance(self.threshold_std, (float, np.floating)):
+            raise TypeError("'threshold_std' must be a float.")
+        if not self.threshold_std >= 0.:
+            raise ValueError("'threshold_std' must be non-negative "
+                             "(got {0}).".format(self.threshold_std))
+
         if not ((self.alphabet is None)
                 or (isinstance(self.alphabet, (list, tuple, np.ndarray)))):
             raise TypeError("'alphabet' must be None or array-like "
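Taken together, the reworked transform handles the case that motivated this fix. A sketch mirroring the data added to the test suite below; the expected words are the ones asserted by the new tests, not independently verified here:

    import numpy as np
    from pyts.bag_of_words import BagOfWords

    # The third series is constant over its first eight values, which
    # previously made standardization of those windows fail.
    X = np.arange(30).reshape(3, 10)
    X[2, :8] = 20

    bow = BagOfWords(window_size=8, word_size=4)
    print(bow.fit_transform(X))
    # Per the updated tests: ['abcd', 'abcd', 'bbbb bbbd']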
46 changes: 31 additions & 15 deletions pyts/bag_of_words/tests/test_bow.py
@@ -67,7 +67,8 @@ def test_actual_results_word_extractor(params, arr_desired):

 # ######################### Tests for BagOfWords #########################

-X_bow = np.arange(20).reshape(2, 10)
+X_bow = np.arange(30).reshape(3, 10)
+X_bow[2, :8] = 20


 @pytest.mark.parametrize(
@@ -123,6 +124,14 @@ def test_actual_results_word_extractor(params, arr_desired):
      "If 'window_step' is a float, it must be greater than 0 and lower "
      "than or equal to 1 (got {0}).".format(2.)),

+    ({'window_size': 6, 'word_size': 4, 'n_bins': 2, 'threshold_std': '-1'},
+     TypeError,
+     "'threshold_std' must be a float."),
+
+    ({'window_size': 6, 'word_size': 4, 'n_bins': 2, 'threshold_std': -1.},
+     ValueError,
+     "'threshold_std' must be non-negative (got -1.0)."),
+
     ({'window_size': 6, 'word_size': 4, 'n_bins': 2, 'alphabet': 'whoops'},
      TypeError,
      "'alphabet' must be None or array-like with shape (n_bins,) "
@@ -140,33 +149,40 @@ def test_parameter_check_bag_of_words(params, error, err_msg):


 @pytest.mark.parametrize(
-    'params, arr_desired',
-    [({'window_size': 6, 'word_size': 4}, ['abcd', 'abcd']),
+    'params, X, arr_desired',
+    [({'window_size': 8, 'word_size': 4}, X_bow,
+      ['abcd', 'abcd', 'bbbb bbbd']),
+     ({'window_size': 8, 'word_size': 4, 'numerosity_reduction': False},
+      X_bow, ['abcd abcd abcd', 'abcd abcd abcd', 'bbbb bbbd bbbd']),
+     ({'window_size': 8, 'word_size': 4, 'strategy': 'uniform'}, X_bow,
+      ['abcd', 'abcd', 'aaaa aaad']),
-     ({'window_size': 6, 'word_size': 4, 'numerosity_reduction': False},
-      ['abcd abcd abcd abcd abcd', 'abcd abcd abcd abcd abcd']),
+     ({'window_size': 8, 'word_size': 4, 'strategy': 'quantile'}, X_bow,
+      ['abcd', 'abcd', 'aaaa aaac']),
-     ({'window_size': 6, 'word_size': 4, 'alphabet': ['y', 'o', 'l', 'o']},
-      ['yolo', 'yolo']),
+     ({'window_size': 8, 'word_size': 4, 'alphabet': ['y', 'o', 'l', 'o']},
+      X_bow, ['yolo', 'yolo', 'oooo']),
-     ({'window_size': 0.5, 'word_size': 4}, ['abcd', 'abcd']),
+     ({'window_size': 0.5, 'word_size': 4}, X_bow[:2], ['abcd', 'abcd']),
      ({'window_size': 4, 'word_size': 1., 'numerosity_reduction': False},
-      ['abcd abcd abcd abcd abcd abcd abcd',
-       'abcd abcd abcd abcd abcd abcd abcd']),
+      X_bow[:2], ['abcd abcd abcd abcd abcd abcd abcd',
+                  'abcd abcd abcd abcd abcd abcd abcd']),
      ({'window_size': 4, 'word_size': 4, 'window_step': 2,
       'numerosity_reduction': False},
-      ['abcd abcd abcd abcd', 'abcd abcd abcd abcd']),
+      X_bow[:2], ['abcd abcd abcd abcd', 'abcd abcd abcd abcd']),
      ({'window_size': 4, 'word_size': 4, 'window_step': 0.2,
       'numerosity_reduction': False},
-      ['abcd abcd abcd abcd', 'abcd abcd abcd abcd']),
+      X_bow[:2], ['abcd abcd abcd abcd', 'abcd abcd abcd abcd']),
      ({'window_size': 4, 'word_size': 4, 'window_step': 0.5},
-      ['abcd', 'abcd'])]
+      X_bow[:2], ['abcd', 'abcd'])]
 )
-def test_actual_results_bag_of_words(params, arr_desired):
+def test_actual_results_bag_of_words(params, X, arr_desired):
     """Test that the actual results are the expected ones."""
-    arr_actual = BagOfWords(**params).fit_transform(X_bow)
+    arr_actual = BagOfWords(**params).fit_transform(X)
     np.testing.assert_array_equal(arr_actual, arr_desired)
19 changes: 13 additions & 6 deletions pyts/classification/saxvsm.py
@@ -55,6 +55,11 @@ class SAXVSM(BaseEstimator, UnivariateClassifierMixin):
         sliding window will be computed as
         ``ceil(window_step * n_timestamps)``.

+    threshold_std : float (default = 0.01)
+        Threshold used to determine whether a subsequence is standardized.
+        Subsequences whose standard deviations are lower than this threshold
+        are not standardized.
+
     norm_mean : bool (default = True)
         If True, center each subseries before scaling.
@@ -112,20 +117,22 @@ class SAXVSM(BaseEstimator, UnivariateClassifierMixin):
     >>> clf.fit(X_train, y_train)
     SAXVSM(...)
     >>> clf.score(X_test, y_test)
-    0.9933...
+    1.0
     """

     def __init__(self, window_size=0.5, word_size=0.5, n_bins=4,
                  strategy='normal', numerosity_reduction=True, window_step=1,
-                 norm_mean=True, norm_std=True, use_idf=True, smooth_idf=False,
-                 sublinear_tf=True, overlapping=True, alphabet=None):
+                 threshold_std=0.01, norm_mean=True, norm_std=True,
+                 use_idf=True, smooth_idf=False, sublinear_tf=True,
+                 overlapping=True, alphabet=None):
         self.window_size = window_size
         self.word_size = word_size
         self.n_bins = n_bins
         self.strategy = strategy
         self.numerosity_reduction = numerosity_reduction
         self.window_step = window_step
+        self.threshold_std = threshold_std
         self.norm_mean = norm_mean
         self.norm_std = norm_std
         self.use_idf = use_idf
@@ -162,9 +169,9 @@ def fit(self, X, y):
             window_size=self.window_size, word_size=self.word_size,
             n_bins=self.n_bins, strategy=self.strategy,
             numerosity_reduction=self.numerosity_reduction,
-            window_step=self.window_step, norm_mean=self.norm_mean,
-            norm_std=self.norm_std, overlapping=self.overlapping,
-            alphabet=self.alphabet
+            window_step=self.window_step, threshold_std=self.threshold_std,
+            norm_mean=self.norm_mean, norm_std=self.norm_std,
+            overlapping=self.overlapping, alphabet=self.alphabet
         )
         X_bow = bow.fit_transform(X)
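Since SAXVSM simply forwards threshold_std to BagOfWords in fit, the fix carries over to classification. A hypothetical sketch with synthetic data (the series values and labels are made up, not taken from the commit):

    import numpy as np
    from pyts.classification import SAXVSM

    rng = np.random.RandomState(42)
    X_train = rng.randn(12, 50)
    X_train[:3, :40] = 5.            # series with long constant stretches
    y_train = np.repeat([0, 1], 6)

    # threshold_std is passed through to the internal BagOfWords (see fit above).
    clf = SAXVSM(window_size=0.5, word_size=0.5, threshold_std=0.01)
    clf.fit(X_train, y_train)
    print(clf.predict(X_train[:2]))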
(Diff not shown for the remaining two changed files.)