Improvement to update of exists column in _check_filesystem
k1o0 committed Oct 10, 2024
1 parent a47ecf8 commit a55b006
Showing 2 changed files with 41 additions and 15 deletions.
32 changes: 18 additions & 14 deletions one/api.py
@@ -626,20 +626,6 @@ def _check_filesystem(self, datasets, offline=None, update_exists=True, check_ha
files.append(None)
# Add this index to list of datasets that need downloading
indices_to_download.append(i)
- if rec['exists'] != file.exists():
-     with warnings.catch_warnings():
-         # Suppress future warning: exist column should always be present
-         msg = '.*indexing on a MultiIndex with a nested sequence of labels.*'
-         warnings.filterwarnings('ignore', message=msg)
-         datasets.at[i, 'exists'] = not rec['exists']
-     if update_exists:
-         _logger.debug('Updating exists field')
-         if isinstance(i, tuple):
-             self._cache['datasets'].loc[i, 'exists'] = not rec['exists']
-         else:  # eid index level missing in datasets input
-             i = pd.IndexSlice[:, i]
-             self._cache['datasets'].loc[i, 'exists'] = not rec['exists']
-         self._cache['_meta']['modified_time'] = datetime.now()

# If online and we have datasets to download, call download_datasets with these datasets
if not (offline or self.offline) and indices_to_download:
@@ -650,6 +636,24 @@ def _check_filesystem(self, datasets, offline=None, update_exists=True, check_ha
for i, file in zip(indices_to_download, new_files):
files[datasets.index.get_loc(i)] = file

+ # NB: Currently if not offline and a remote file is missing, an exception will be raised
+ # before we reach this point. This could change in the future.
+ exists = list(map(bool, files))
+ if not all(datasets['exists'] == exists):
+     with warnings.catch_warnings():
+         # Suppress future warning: exist column should always be present
+         msg = '.*indexing on a MultiIndex with a nested sequence of labels.*'
+         warnings.filterwarnings('ignore', message=msg)
+         datasets['exists'] = exists
+     if update_exists:
+         _logger.debug('Updating exists field')
+         i = datasets.index
+         if i.nlevels == 1:
+             # eid index level missing in datasets input
+             i = pd.IndexSlice[:, i]
+         self._cache['datasets'].loc[i, 'exists'] = exists
+         self._cache['_meta']['modified_time'] = datetime.now()

if self.record_loaded:
loaded = np.fromiter(map(bool, files), bool)
loaded_ids = datasets.index.get_level_values('id')[loaded].to_numpy()
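The rewrite above replaces the old per-row datasets.at[i, 'exists'] updates with a single vectorized assignment, using pd.IndexSlice to align an id-only index with the cache's (eid, id) MultiIndex when the input frame lacks the eid level. A minimal standalone sketch of that indexing pattern follows; the frame, ids and values are hypothetical stand-ins, not ONE's actual cache contents:

import pandas as pd

# Hypothetical datasets cache indexed on (eid, id), as in ONE's cache tables
cache = pd.DataFrame(
    {'exists': [True, True, True]},
    index=pd.MultiIndex.from_tuples(
        [('eid0', 'dsetA'), ('eid0', 'dsetB'), ('eid1', 'dsetC')],
        names=('eid', 'id')))

# Input frame indexed on dataset id only, i.e. the eid index level is missing
datasets = cache.droplevel('eid').loc[['dsetA', 'dsetC']].copy()
exists = [False, True]  # outcome of checking the filesystem for each row

i = datasets.index
if i.nlevels == 1:
    # eid index level missing: slice on the second level of the cache index
    i = pd.IndexSlice[:, i]
cache.loc[i, 'exists'] = exists
print(cache['exists'].tolist())  # [False, True, True]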
24 changes: 23 additions & 1 deletion one/tests/test_one.py
@@ -534,15 +534,37 @@ def test_check_filesystem(self):
datasets = self.one._cache['datasets'].loc[eids]
files = self.one._check_filesystem(datasets)
self.assertEqual(53, len(files))

# Expect same number of unique session paths as eids
session_paths = set(map(lambda p: p.parents[1], files))
self.assertEqual(len(eids), len(session_paths))
expected = map(lambda x: '/'.join(x.parts[-3:]), session_paths)
session_parts = self.one._cache['sessions'].loc[eids, ['subject', 'date', 'number']].values
self.assertCountEqual(expected, map(lambda x: f'{x[0]}/{x[1]}/{x[2]:03}', session_parts))
- # Attempt the same with the eid index missing

+ # Test a very rare occurrence of a missing dataset with eid index missing
+ # but session_path column present
+ idx = self.one._cache.datasets.index[(i := 5)]  # pick a random dataset to make vanish
+ _eid2path = {
+     e: self.one.eid2path(e).relative_to(self.one.cache_dir).as_posix() for e in eids
+ }
+ session_paths = list(map(_eid2path.get, datasets.index.get_level_values(0)))
+ datasets['session_path'] = session_paths
+ datasets = datasets.droplevel(0)
+ files[(i := 5)].unlink()
+ # For this check the current state should be exists==True in the main cache
+ assert self.one._cache.datasets.loc[idx, 'exists'].all()
+ _files = self.one._check_filesystem(datasets, update_exists=True)
+ self.assertIsNone(_files[i])
+ self.assertFalse(
+     self.one._cache.datasets.loc[idx, 'exists'].all(), 'failed to update cache exists')
+ files[i].touch()  # restore file for next check
+
+ # Attempt to load datasets with both eid index
+ # and session_path column missing (most common)
+ datasets = datasets.drop('session_path', axis=1)
+ self.assertEqual(files, self.one._check_filesystem(datasets))

# Test with uuid_filenames as True
self.one.uuid_filenames = True
try:
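For reference, the rare input shape the new test exercises (a datasets frame indexed on dataset id alone, with the session path carried in a session_path column) can be built from a standard (eid, id)-indexed frame roughly as in this sketch. The ids, relative paths and eid-to-path mapping are hypothetical stand-ins, not values from the test fixture:

import pandas as pd

# Hypothetical (eid, id)-indexed datasets frame
datasets = pd.DataFrame(
    {'rel_path': ['alf/trials.table.pqt', 'alf/wheel.position.npy'],
     'exists': [True, True]},
    index=pd.MultiIndex.from_tuples(
        [('eid0', 'dsetA'), ('eid1', 'dsetB')], names=('eid', 'id')))
eid2path = {'eid0': 'lab/Subjects/S1/2024-01-01/001',
            'eid1': 'lab/Subjects/S2/2024-01-02/001'}

# Keep the session path as a column, then drop the eid index level
datasets['session_path'] = list(map(eid2path.get, datasets.index.get_level_values(0)))
datasets = datasets.droplevel(0)
print(datasets.index.nlevels)  # 1: the cache must now be sliced with pd.IndexSlice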