v1.13.0 (#54)
Added

- one.remote.aws.get_s3_virtual_host constructs an HTTPS web address from a bucket name and region (see the usage sketch below)
- one.remote.aws.is_folder determines whether an S3 object is a folder

Modified

- iter_datasets is now a public function in one.alf.io, moved from one.alf.cache
- register_session kwargs are now exposed in the RegistrationClient.create_session method
- one.remote.aws.get_aws_access_keys requires an AlyxClient instance instead of OneAlyx, in line with one.remote.globus
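A minimal usage sketch of the two new AWS helpers. The call patterns are inferred from the descriptions above and hedged; exact parameter names may differ, and 'my-bucket' is a hypothetical bucket name.

```python
import boto3  # now a requirement (see CHANGELOG)
import one.remote.aws as aws

# Build a virtual-host-style HTTPS address from a bucket name and region
# (assumed call pattern: bucket name and region as positional arguments).
url = aws.get_s3_virtual_host('my-bucket', 'eu-west-2')
# -> something like 'https://my-bucket.s3.eu-west-2.amazonaws.com'

# Identify "folder" objects when listing a bucket; assumes is_folder accepts
# a boto3 ObjectSummary.
s3 = boto3.resource('s3')
folders = [o.key for o in s3.Bucket('my-bucket').objects.all() if aws.is_folder(o)]
```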
k1o0 authored Jul 25, 2022
1 parent 5ff517f commit d7a0d18
Showing 17 changed files with 230 additions and 78 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/main.yaml
@@ -29,7 +29,7 @@ jobs:
python -m pip install --upgrade pip
pip install coverage coveralls
pip install -r requirements.txt
- pip install boto3 globus_sdk
+ pip install globus_sdk
pip install -e .
- name: run tests
run: |
17 changes: 16 additions & 1 deletion CHANGELOG.md
@@ -1,5 +1,20 @@
# Changelog
- ## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [1.12.2]
+ ## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [1.13.0]
+
+ ### Added
+
+ - one.remote.aws.get_s3_virtual_host constructs an HTTPS web address from a bucket name and region
+ - one.remote.aws.is_folder determines whether an S3 object is a folder
+ - boto3 now a requirement
+
+ ### Modified
+
+ - cache 'project' field renamed to 'projects'
+ - iter_datasets is now a public function in one.alf.io, moved from one.alf.cache
+ - register_session kwargs are now exposed in the RegistrationClient.create_session method
+ - one.remote.aws.get_aws_access_keys requires an AlyxClient instance instead of OneAlyx, in line with one.remote.globus
+
+ ## [1.12.2]

### Modified

2 changes: 2 additions & 0 deletions README.md
@@ -4,6 +4,8 @@

The Open Neurophysiology Environment is a scheme for sharing neurophysiology data in a standardized manner. For information on how to share data with ONE please [click here](https://github.com/int-brain-lab/ONE/blob/main/docs/Open_Neurophysiology_Environment_Filename_Convention.pdf). This GitHub page contains an API for searching and loading ONE-standardized data, stored either on a user’s local machine or on a remote server. Please [click here](https://int-brain-lab.github.io/ONE/) for the main documentation page.

+ **NB**: The API and backend database are still under active development; for the best experience, please regularly update the package by running `pip install -U ONE-api`.

## Requirements
ONE runs on Python 3.7 or later, and is tested on the latest Ubuntu and Windows (3.7 and 3.8 only).

2 changes: 1 addition & 1 deletion one/__init__.py
@@ -1,2 +1,2 @@
"""The Open Neurophysiology Environment (ONE) API"""
- __version__ = '1.12.2'
+ __version__ = '1.13.0'
16 changes: 4 additions & 12 deletions one/alf/cache.py
@@ -26,9 +26,8 @@
from iblutil.io import parquet
from iblutil.io.hashfile import md5

- from one.alf.io import iter_sessions
+ from one.alf.io import iter_sessions, iter_datasets
from one.alf.files import session_path_parts, get_alf_path
- from one.alf.spec import is_valid

__all__ = ['make_parquet_db']

Expand All @@ -43,7 +42,7 @@
'date', # datetime.date
'number', # int
'task_protocol',
- 'project',
+ 'projects',
)

DATASETS_COLUMNS = (
@@ -73,7 +72,7 @@ def _get_session_info(rel_ses_path):
out['date'] = pd.to_datetime(out['date']).date()
out['number'] = int(out['number'])
out['task_protocol'] = ''
- out['project'] = ''
+ out['projects'] = ''
return out


@@ -192,7 +191,7 @@ def _make_datasets_df(root_dir, hash_files=False) -> pd.DataFrame:
# Go through sessions and append datasets
for session_path in iter_sessions(root_dir):
rows = []
- for rel_dset_path in _iter_datasets(session_path):
+ for rel_dset_path in iter_datasets(session_path):
file_info = _get_dataset_info(session_path, rel_dset_path, compute_hash=hash_files)
assert set(file_info.keys()) <= set(DATASETS_COLUMNS)
rows.append(file_info)
@@ -201,13 +200,6 @@ def _make_datasets_df(root_dir, hash_files=False) -> pd.DataFrame:
return df


- def _iter_datasets(session_path):
-     """Iterate over all files in a session, and yield relative dataset paths."""
-     for p in sorted(Path(session_path).rglob('*.*')):
-         if not p.is_dir() and is_valid(p.name):
-             yield p.relative_to(session_path)


def make_parquet_db(root_dir, out_dir=None, hash_ids=True, hash_files=False, lab=None):
"""
Given a data directory, index the ALF datasets and save the generated cache tables.
25 changes: 22 additions & 3 deletions one/alf/io.py
@@ -364,17 +364,17 @@ def _ls(alfpath, object=None, **kwargs) -> (list, tuple):

def iter_sessions(root_dir):
"""
- Recursively iterate over session paths in a given directory
+ Recursively iterate over session paths in a given directory.

Parameters
----------
root_dir : str, pathlib.Path
- The folder to look for sessions
+ The folder to look for sessions.

Yields
-------
pathlib.Path
- The next session path in lexicographical order
+ The next session path in lexicographical order.
"""
if spec.is_session_path(root_dir):
yield root_dir
@@ -383,6 +383,25 @@ def iter_sessions(root_dir):
yield path


+ def iter_datasets(session_path):
+     """
+     Iterate over all files in a session, and yield relative dataset paths.
+
+     Parameters
+     ----------
+     session_path : str, pathlib.Path
+         The folder to look for datasets.
+
+     Yields
+     -------
+     pathlib.Path
+         The next dataset path (relative to the session path) in lexicographical order.
+     """
+     for p in sorted(Path(session_path).rglob('*.*')):
+         if not p.is_dir() and spec.is_valid(p.name):
+             yield p.relative_to(session_path)


def exists(alfpath, object, attributes=None, **kwargs) -> bool:
"""
Test if ALF object and optionally specific attributes exist in the given path
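A short usage sketch of the newly public iter_datasets; the session path below is hypothetical.

```python
from pathlib import Path
import one.alf.io as alfio

# Hypothetical local session path ending with subject/date/number
session_path = Path.home() / 'data' / 'lab' / 'Subjects' / 'KS023' / '2019-12-10' / '001'
for rel_path in alfio.iter_datasets(session_path):
    print(rel_path)  # dataset paths relative to the session, e.g. alf/_ibl_trials.intervals.npy
```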
32 changes: 19 additions & 13 deletions one/api.py
@@ -38,7 +38,7 @@
class One(ConversionMixin):
"""An API for searching and loading data on a local filesystem"""
_search_terms = (
- 'dataset', 'date_range', 'laboratory', 'number', 'project', 'subject', 'task_protocol'
+ 'dataset', 'date_range', 'laboratory', 'number', 'projects', 'subject', 'task_protocol'
)

def __init__(self, cache_dir=None, mode='auto', wildcards=True):
@@ -99,8 +99,8 @@ def _load_cache(self, cache_dir=None, **kwargs):
continue
meta['loaded_time'] = datetime.now()

- # Convert to str ids
- cache = util.cache_int2str(cache)
+ # Patch older tables
+ cache = util.patch_cache(cache, meta['raw'][table].get('min_api_version'))

# Set the appropriate index if none already set
if isinstance(cache.index, pd.RangeIndex):
@@ -221,7 +221,7 @@ def _update_cache_from_records(self, strict=False, **kwargs):
strict : bool
If not True, the columns don't need to match. Extra columns in input tables are
dropped and missing columns are added and filled with np.nan.
- kwargs
+ **kwargs
pandas.DataFrame or pandas.Series to insert/update for each table
Returns
@@ -385,8 +385,8 @@ def search(self, details=False, query_type=None, **kwargs):
task_protocol : str
The task protocol name (can be partial, i.e. any task protocol containing that str
will be found)
- project : str
- The project name (can be partial, i.e. any task protocol containing that str
+ projects : str, list
+ The project name(s) (can be partial, i.e. any project containing that str
will be found)
details : bool
If true also returns a dict of dataset details
@@ -424,7 +424,7 @@ def sort_fcn(itm):
if sessions.size == 0:
return ([], None) if details else []
# String fields
- elif key in ('subject', 'task_protocol', 'laboratory', 'project'):
+ elif key in ('subject', 'task_protocol', 'laboratory', 'projects'):
query = '|'.join(util.ensure_list(value))
key = 'lab' if key == 'laboratory' else key
mask = sessions[key].str.contains(query, regex=self.wildcards)
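With the rename from 'project' to 'projects', local searches use the plural term; a brief illustrative example (the cache directory and project name are hypothetical):

```python
from one.api import One

one = One(cache_dir='/data/ONE')  # hypothetical local cache location
# 'projects' replaces 'project' as a search term; partial names match, and a
# list of values is combined with a logical OR (joined with '|', as above).
eids = one.search(projects='brainwide', details=False)
```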
@@ -869,7 +869,7 @@ def load_object(self,
download_only : bool
When true the data are downloaded and the file path is returned. NB: The order of the
file path list is undefined.
- kwargs : dict
+ **kwargs
Additional filters for datasets, including namespace and timescale. For full list
see the one.alf.spec.describe function.
@@ -1201,7 +1201,7 @@ def load_collection(self,
Query cache ('local') or Alyx database ('remote')
download_only : bool
When true the data are downloaded and the file path is returned.
- kwargs : dict
+ **kwargs
Additional filters for datasets, including namespace and timescale. For full list
see the one.alf.spec.describe function.
@@ -1268,6 +1268,9 @@ def setup(cache_dir=None, silent=False, **kwargs):
silent : (False) bool
when True will prompt for cache_dir if cache_dir is None, and overwrite cache if any
when False will use cwd for cache_dir if cache_dir is None and use existing cache
+ **kwargs
+ Optional arguments to pass to one.alf.cache.make_parquet_db.
Returns
-------
One
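A sketch of the setup call with a forwarded keyword argument; the data directory is hypothetical, and hash_files is a make_parquet_db parameter per one/alf/cache.py.

```python
from one.api import One

# Index an ALF data directory and return a One instance; extra keyword
# arguments such as hash_files are forwarded to one.alf.cache.make_parquet_db.
one = One.setup('/data/alf', hash_files=False)
```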
@@ -1583,7 +1586,7 @@ def search(self, details=False, query_type=None, **kwargs):
task_protocol : str, list
The task protocol name (can be partial, i.e. any task protocol containing that str
will be found)
- project : str, list
+ project(s) : str, list
The project name (can be partial, i.e. any task protocol containing that str
will be found)
performance_lte / performance_gte : float
@@ -1665,10 +1668,10 @@ def _download_datasets(self, dsets, **kwargs) -> List[Path]:
_logger.debug(ex)
return self._download_dataset(dsets, **kwargs)

- def _download_aws(self, dsets, update_exists=True, **kwargs) -> List[Path]:
+ def _download_aws(self, dsets, update_exists=True, **_) -> List[Path]:
# Download datasets from AWS
import one.remote.aws as aws
- s3, bucket_name = aws.get_s3_from_alyx(self)
+ s3, bucket_name = aws.get_s3_from_alyx(self.alyx)
if self._index_type() is int:
raise NotImplementedError('AWS download only supported for str index cache')
assert self.mode != 'local'
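The AWS helpers now take an AlyxClient rather than a OneAlyx instance, in line with one.remote.globus; a minimal sketch of the new calling convention:

```python
from one.api import ONE
import one.remote.aws as aws

one = ONE()  # a OneAlyx instance connected to the default Alyx database
# Pass the AlyxClient (one.alyx) rather than the OneAlyx instance itself
s3, bucket_name = aws.get_s3_from_alyx(one.alyx)
```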
@@ -1908,6 +1911,8 @@ def setup(base_url=None, **kwargs):
----------
base_url : str
An Alyx database URL. If None, the current default database is used.
+ **kwargs
+ Optional arguments to pass to one.params.setup.
Returns
-------
@@ -2171,9 +2176,10 @@ def get_details(self, eid: str, full: bool = False, query_type=None):
if full:
return dets
# If it's not full return the normal output like from a one.search
- det_fields = ['subject', 'start_time', 'number', 'lab', 'project',
+ det_fields = ['subject', 'start_time', 'number', 'lab', 'projects',
'url', 'task_protocol', 'local_path']
out = {k: v for k, v in dets.items() if k in det_fields}
+ out['projects'] = ','.join(out['projects'])
out.update({'local_path': self.eid2path(eid),
'date': datetime.fromisoformat(out['start_time']).date()})
return out
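Because the session record's 'projects' field is now a list, get_details joins it into a comma-separated string for the non-full output; illustrative only (the eid lookup is hypothetical):

```python
from one.api import ONE

one = ONE()
eid = one.search(projects='brainwide')[0]  # hypothetical query
details = one.get_details(eid)
print(details['projects'])  # e.g. 'projectA,projectB'
```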
22 changes: 14 additions & 8 deletions one/registration.py
@@ -81,22 +81,24 @@ def create_sessions(self, root_data_folder, glob_pattern='**/create_me.flag', dr
flag_file.unlink()
return [ff.parent for ff in flag_files], records

- def create_session(self, session_path) -> dict:
+ def create_session(self, session_path, **kwargs) -> dict:
"""Create a remote session on Alyx from a local session path, without registering files
Parameters
----------
session_path : str, pathlib.Path
- The path ending with subject/date/number
+ The path ending with subject/date/number.
+ **kwargs
+ Optional arguments for RegistrationClient.register_session.
Returns
-------
dict
- Newly created session record
+ Newly created session record.
"""
- return self.register_session(session_path, file_list=False)[0]
+ return self.register_session(session_path, file_list=False, **kwargs)[0]

- def create_new_session(self, subject, session_root=None, date=None, register=True):
+ def create_new_session(self, subject, session_root=None, date=None, register=True, **kwargs):
"""Create a new local session folder and optionally create session record on Alyx
Parameters
@@ -110,12 +112,14 @@ def create_new_session(self, subject, session_root=None, date=None, register=Tru
An optional date for the session. If None the current time is used.
register : bool
If true, create session record on Alyx database
+ **kwargs
+ Optional arguments for RegistrationClient.register_session.
Returns
-------
pathlib.Path
New local session path
- str
+ uuid.UUID
The experiment UUID if register is True
Examples
@@ -139,7 +143,7 @@ def create_new_session(self, subject, session_root=None, date=None, register=Tru
session_root = Path(session_root or self.one.alyx.cache_dir) / subject / date[:10]
session_path = session_root / alfio.next_num_folder(session_root)
session_path.mkdir(exist_ok=True, parents=True) # Ensure folder exists on disk
- eid = UUID(self.create_session(session_path)['url'][-36:]) if register else None
+ eid = UUID(self.create_session(session_path, **kwargs)['url'][-36:]) if register else None
return session_path, eid

def find_files(self, session_path):
@@ -248,7 +252,7 @@ def register_session(self, ses_path, users=None, file_list=True, **kwargs):
The total number of completed trials (optional)
json : dict, str
Optional JSON data
- project: str, list
+ projects: str, list
The project(s) to which the experiment belongs (optional)
type : str
The experiment type, e.g. 'Experiment', 'Base'
@@ -311,6 +315,8 @@ def register_session(self, ses_path, users=None, file_list=True, **kwargs):
assert start_time[:10] == details['date'], 'start_time doesn\'t match session path'
if kwargs.get('procedures', False):
ses_['procedures'] = ensure_list(kwargs.pop('procedures'))
+ if kwargs.get('projects', False):
+     ses_['projects'] = ensure_list(kwargs.pop('projects'))
assert ('subject', 'number') not in kwargs
if 'lab' not in kwargs and details['lab']:
kwargs.update({'lab': details['lab']})
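Since create_session and create_new_session now forward keyword arguments to register_session, metadata such as projects and procedures can be set at creation time; a hedged sketch (the subject nickname and project name are hypothetical):

```python
from one.api import ONE
from one.registration import RegistrationClient

client = RegistrationClient(one=ONE())
session_path, eid = client.create_new_session(
    'KS023',                                   # subject nickname (hypothetical)
    register=True,
    projects=['ibl_neuropixel_brainwide_01'],  # forwarded to register_session
    procedures=['Behavior training/tasks'],
)
```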