Improvement to update of exists column in _check_filesystem
k1o0 committed Oct 10, 2024
1 parent a47ecf8 commit a55b006
Showing 2 changed files with 41 additions and 15 deletions.
32 changes: 18 additions & 14 deletions one/api.py
@@ -626,20 +626,6 @@ def _check_filesystem(self, datasets, offline=None, update_exists=True, check_ha
files.append(None)
# Add this index to list of datasets that need downloading
indices_to_download.append(i)
- if rec['exists'] != file.exists():
-     with warnings.catch_warnings():
-         # Suppress future warning: exist column should always be present
-         msg = '.*indexing on a MultiIndex with a nested sequence of labels.*'
-         warnings.filterwarnings('ignore', message=msg)
-         datasets.at[i, 'exists'] = not rec['exists']
-     if update_exists:
-         _logger.debug('Updating exists field')
-         if isinstance(i, tuple):
-             self._cache['datasets'].loc[i, 'exists'] = not rec['exists']
-         else:  # eid index level missing in datasets input
-             i = pd.IndexSlice[:, i]
-             self._cache['datasets'].loc[i, 'exists'] = not rec['exists']
-         self._cache['_meta']['modified_time'] = datetime.now()

# If online and we have datasets to download, call download_datasets with these datasets
if not (offline or self.offline) and indices_to_download:
@@ -650,6 +636,24 @@ def _check_filesystem(self, datasets, offline=None, update_exists=True, check_ha
for i, file in zip(indices_to_download, new_files):
files[datasets.index.get_loc(i)] = file

+ # NB: Currently if not offline and a remote file is missing, an exception will be raised
+ # before we reach this point. This could change in the future.
+ exists = list(map(bool, files))
+ if not all(datasets['exists'] == exists):
+     with warnings.catch_warnings():
+         # Suppress future warning: exist column should always be present
+         msg = '.*indexing on a MultiIndex with a nested sequence of labels.*'
+         warnings.filterwarnings('ignore', message=msg)
+         datasets['exists'] = exists
+     if update_exists:
+         _logger.debug('Updating exists field')
+         i = datasets.index
+         if i.nlevels == 1:
+             # eid index level missing in datasets input
+             i = pd.IndexSlice[:, i]
+         self._cache['datasets'].loc[i, 'exists'] = exists
+         self._cache['_meta']['modified_time'] = datetime.now()

if self.record_loaded:
loaded = np.fromiter(map(bool, files), bool)
loaded_ids = datasets.index.get_level_values('id')[loaded].to_numpy()
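The rewrite above replaces the old per-row datasets.at[i, 'exists'] updates with a single vectorized assignment, using pd.IndexSlice to align an id-only index with the cache's (eid, id) MultiIndex when the input frame lacks the eid level. A minimal standalone sketch of that indexing pattern follows; the frame, ids and values are hypothetical stand-ins, not ONE's actual cache contents:

import pandas as pd

# Hypothetical datasets cache indexed on (eid, id), as in ONE's cache tables
cache = pd.DataFrame(
    {'exists': [True, True, True]},
    index=pd.MultiIndex.from_tuples(
        [('eid0', 'dsetA'), ('eid0', 'dsetB'), ('eid1', 'dsetC')],
        names=('eid', 'id')))

# Input frame indexed on dataset id only, i.e. the eid index level is missing
datasets = cache.droplevel('eid').loc[['dsetA', 'dsetC']].copy()
exists = [False, True]  # outcome of checking the filesystem for each row

i = datasets.index
if i.nlevels == 1:
    # eid index level missing: slice on the second level of the cache index
    i = pd.IndexSlice[:, i]
cache.loc[i, 'exists'] = exists
print(cache['exists'].tolist())  # [False, True, True]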
24 changes: 23 additions & 1 deletion one/tests/test_one.py
@@ -534,15 +534,37 @@ def test_check_filesystem(self):
datasets = self.one._cache['datasets'].loc[eids]
files = self.one._check_filesystem(datasets)
self.assertEqual(53, len(files))

# Expect same number of unique session paths as eids
session_paths = set(map(lambda p: p.parents[1], files))
self.assertEqual(len(eids), len(session_paths))
expected = map(lambda x: '/'.join(x.parts[-3:]), session_paths)
session_parts = self.one._cache['sessions'].loc[eids, ['subject', 'date', 'number']].values
self.assertCountEqual(expected, map(lambda x: f'{x[0]}/{x[1]}/{x[2]:03}', session_parts))
- # Attempt the same with the eid index missing

+ # Test a very rare occurrence of a missing dataset with eid index missing
+ # but session_path column present
+ idx = self.one._cache.datasets.index[(i := 5)]  # pick a random dataset to make vanish
+ _eid2path = {
+     e: self.one.eid2path(e).relative_to(self.one.cache_dir).as_posix() for e in eids
+ }
+ session_paths = list(map(_eid2path.get, datasets.index.get_level_values(0)))
+ datasets['session_path'] = session_paths
+ datasets = datasets.droplevel(0)
+ files[(i := 5)].unlink()
+ # For this check the current state should be exists==True in the main cache
+ assert self.one._cache.datasets.loc[idx, 'exists'].all()
+ _files = self.one._check_filesystem(datasets, update_exists=True)
+ self.assertIsNone(_files[i])
+ self.assertFalse(
+     self.one._cache.datasets.loc[idx, 'exists'].all(), 'failed to update cache exists')
+ files[i].touch()  # restore file for next check
+
+ # Attempt to load datasets with both eid index
+ # and session_path column missing (most common)
+ datasets = datasets.drop('session_path', axis=1)
+ self.assertEqual(files, self.one._check_filesystem(datasets))

# Test with uuid_filenames as True
self.one.uuid_filenames = True
try:
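For reference, the rare input shape the new test exercises (a datasets frame indexed on dataset id alone, with the session path carried in a session_path column) can be built from a standard (eid, id)-indexed frame roughly as in this sketch. The ids, relative paths and eid-to-path mapping are hypothetical stand-ins, not values from the test fixture:

import pandas as pd

# Hypothetical (eid, id)-indexed datasets frame
datasets = pd.DataFrame(
    {'rel_path': ['alf/trials.table.pqt', 'alf/wheel.position.npy'],
     'exists': [True, True]},
    index=pd.MultiIndex.from_tuples(
        [('eid0', 'dsetA'), ('eid1', 'dsetB')], names=('eid', 'id')))
eid2path = {'eid0': 'lab/Subjects/S1/2024-01-01/001',
            'eid1': 'lab/Subjects/S2/2024-01-02/001'}

# Keep the session path as a column, then drop the eid index level
datasets['session_path'] = list(map(eid2path.get, datasets.index.get_level_values(0)))
datasets = datasets.droplevel(0)
print(datasets.index.nlevels)  # 1: the cache must now be sliced with pd.IndexSlice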