Skip to content

Commit

Permalink
MRG: fix exponential time explosion in sig check (sourmash-bio#2762)
Browse files Browse the repository at this point in the history
Fixes sourmash-bio#2646.

Embarrassing "let's make things run really slow" typo of the month...
fixed!

In brief, the following code in `manifest.py` was causing an exponential
explosion in time cost:
```python
    def _add_rows(self, rows):
        self.rows.extend(rows)

        # maintain a fast check for md5sums for __contains__ check.
        md5set = self._md5_set
        for row in self.rows:
            md5set.add(row['md5'])
```
when called many times on a single row, because each addition of a row
triggered an iteration over ALL rows.

This PR changes things so that we do `for row in rows:`.

Additional complexity => new tests to deal explicitly with the case
where `rows` is a generator (which it turns out it often is, breaking
dozens of tests when not accounted for)
  • Loading branch information
ctb authored Sep 15, 2023
1 parent 6df0360 commit 94d82b1
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 8 deletions.
10 changes: 6 additions & 4 deletions src/sourmash/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,14 +232,16 @@ def add_row(self, row):
self._add_rows([row])

def _add_rows(self, rows):
self.rows.extend(rows)

# maintain a fast check for md5sums for __contains__ check.
md5set = self._md5_set
for row in self.rows:

# only iterate once, in case it's a generator
for row in rows:
self.rows.append(row)
md5set.add(row['md5'])

def __iadd__(self, other):
if self is other:
raise Exception("cannot directly add manifest to itself")
self._add_rows(other.rows)
return self

Expand Down
6 changes: 4 additions & 2 deletions src/sourmash/sig/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from sourmash.sourmash_args import FileOutput

from sourmash.logging import (set_quiet, error, notify, print_results, debug,
debug_literal)
debug_literal, _debug)
from sourmash import sourmash_args
from sourmash.minhash import _get_max_hash_for_scaled
from sourmash.manifest import CollectionManifest
Expand Down Expand Up @@ -1359,7 +1359,9 @@ def check(args):
row['internal_location'] = filename
total_manifest_rows.add_row(row)

debug_literal(f"examined {len(new_manifest)} new rows, found {len(sub_manifest)} matching rows")
# the len(sub_manifest) here should only be run when needed :)
if _debug:
debug_literal(f"examined {len(new_manifest)} new rows, found {len(sub_manifest)} matching rows")

notify(f"loaded {total_rows_examined} signatures.")

Expand Down
86 changes: 84 additions & 2 deletions tests/test_manifest.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
"""
Tests for manifest code in databases, etc.
"""
import pytest
from io import StringIO

import sourmash
from sourmash import index
from sourmash import index, sourmash_args

import sourmash_tst_utils as utils

Expand Down Expand Up @@ -49,7 +50,8 @@ def test_manifest_operations():
siglist.append(sig)

manifest = index.CollectionManifest(rows)
manifest += manifest
manifest2 = index.CollectionManifest(rows)
manifest += manifest2

assert len(manifest) == 2*len(rows)
assert len(manifest) == 4
Expand All @@ -59,6 +61,25 @@ def test_manifest_operations():
assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list


def test_manifest_operations_fail():
# should not be able to add a manifest to itself - not only makes
# no sense, but it means you're modifying a generator in place, sometimes.
protzip = utils.get_test_data('prot/protein.zip')

loader = sourmash.load_file_as_index(protzip)

rows = []
siglist = []
for (sig, loc) in loader._signatures_with_internal():
row = index.CollectionManifest.make_manifest_row(sig, loc)
rows.append(row)
siglist.append(sig)

manifest = index.CollectionManifest(rows)
with pytest.raises(Exception):
manifest += manifest


def test_manifest_to_picklist():
# test manifest/picklist interaction basics
protzip = utils.get_test_data('prot/protein.zip')
Expand Down Expand Up @@ -162,3 +183,64 @@ def test_save_load_manifest():

short_mf = index.CollectionManifest(rows)
assert short_mf != manifest


def test_manifest_to_picklist_bug(runtmp):
# this tests a fun combination of things that led to a bug.
# tl;dr we only want to iterate once across a generator...
# ref #2762
c = runtmp
all_zip = utils.get_test_data('prot/all.zip')

idx = sourmash_args.load_file_as_index(all_zip)
assert len(idx) == 8

manifest = sourmash_args.get_manifest(idx)
assert len(manifest) == 8

def filter_fn(row):
# match?
keep = False
if "09a0869" in row['md5']:
keep = True

return keep

sub_manifest = manifest.filter_rows(filter_fn)
sub_picklist = sub_manifest.to_picklist()
idx = idx.select(picklist=sub_picklist)

assert len(idx) == 1
print(idx)

x = list(idx.signatures())
assert len(x)


def test_generate_manifest_iterate_once():
# we should only iterate across manifest rows once
protzip = utils.get_test_data('prot/protein.zip')

loader = sourmash.load_file_as_index(protzip)

siglist = []
for (sig, loc) in loader._signatures_with_internal():
siglist.append(sig)

# build generator function => will not allow iteration twice
def genfn():
for (sig, loc) in loader._signatures_with_internal():
row = index.CollectionManifest.make_manifest_row(sig, loc)
yield row

manifest = index.CollectionManifest(genfn())

assert len(manifest) == 2
assert len(manifest._md5_set) == 2

md5_list = [ row['md5'] for row in manifest.rows ]
assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list
assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list

for sig in siglist:
assert sig in manifest

0 comments on commit 94d82b1

Please sign in to comment.