v1.13.0 (#54)
Added

- one.remote.aws.get_s3_virtual_host constructs an HTTPS web address from a bucket name and region (see the usage sketch below)
- one.remote.aws.is_folder determines whether an S3 object is a folder

Modified

- iter_datasets is now a public function in one.alf.io, moved from one.alf.cache
- register_session kwargs are now exposed in the RegistrationClient.create_session method
- one.remote.aws.get_aws_access_keys requires an AlyxClient instance instead of OneAlyx, in line with one.remote.globus
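A minimal usage sketch of the two new AWS helpers. The call patterns are inferred from the descriptions above and hedged; exact parameter names may differ, and 'my-bucket' is a hypothetical bucket name.

```python
import boto3  # now a requirement (see CHANGELOG)
import one.remote.aws as aws

# Build a virtual-host-style HTTPS address from a bucket name and region
# (assumed call pattern: bucket name and region as positional arguments).
url = aws.get_s3_virtual_host('my-bucket', 'eu-west-2')
# -> something like 'https://my-bucket.s3.eu-west-2.amazonaws.com'

# Identify "folder" objects when listing a bucket; assumes is_folder accepts
# a boto3 ObjectSummary.
s3 = boto3.resource('s3')
folders = [o.key for o in s3.Bucket('my-bucket').objects.all() if aws.is_folder(o)]
```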
k1o0 authored Jul 25, 2022
1 parent 5ff517f commit d7a0d18
Showing 17 changed files with 230 additions and 78 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/main.yaml
@@ -29,7 +29,7 @@ jobs:
python -m pip install --upgrade pip
pip install coverage coveralls
pip install -r requirements.txt
- pip install boto3 globus_sdk
+ pip install globus_sdk
pip install -e .
- name: run tests
run: |
17 changes: 16 additions & 1 deletion CHANGELOG.md
@@ -1,5 +1,20 @@
# Changelog
- ## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [1.12.2]
+ ## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [1.13.0]
+
+ ### Added
+
+ - one.remote.aws.get_s3_virtual_host constructs an HTTPS web address from a bucket name and region
+ - one.remote.aws.is_folder determines whether an S3 object is a folder
+ - boto3 now a requirement
+
+ ### Modified
+
+ - cache 'project' field renamed to 'projects'
+ - iter_datasets is now a public function in one.alf.io, moved from one.alf.cache
+ - register_session kwargs are now exposed in the RegistrationClient.create_session method
+ - one.remote.aws.get_aws_access_keys requires an AlyxClient instance instead of OneAlyx, in line with one.remote.globus
+
+ ## [1.12.2]

### Modified

2 changes: 2 additions & 0 deletions README.md
@@ -4,6 +4,8 @@

The Open Neurophysiology Environment is a scheme for sharing neurophysiology data in a standardized manner. For information on how to share data with ONE please [click here](https://github.com/int-brain-lab/ONE/blob/main/docs/Open_Neurophysiology_Environment_Filename_Convention.pdf). This GitHub page contains an API for searching and loading ONE-standardized data, stored either on a user’s local machine or on a remote server. Please [click here](https://int-brain-lab.github.io/ONE/) for the main documentation page.

+ **NB**: The API and backend database are still under active development; for the best experience, please regularly update the package by running `pip install -U ONE-api`.

## Requirements
ONE runs on Python 3.7 or later, and is tested on the latest Ubuntu and Windows (3.7 and 3.8 only).

2 changes: 1 addition & 1 deletion one/__init__.py
@@ -1,2 +1,2 @@
"""The Open Neurophysiology Environment (ONE) API"""
- __version__ = '1.12.2'
+ __version__ = '1.13.0'
16 changes: 4 additions & 12 deletions one/alf/cache.py
@@ -26,9 +26,8 @@
from iblutil.io import parquet
from iblutil.io.hashfile import md5

- from one.alf.io import iter_sessions
+ from one.alf.io import iter_sessions, iter_datasets
from one.alf.files import session_path_parts, get_alf_path
- from one.alf.spec import is_valid

__all__ = ['make_parquet_db']

Expand All @@ -43,7 +42,7 @@
'date', # datetime.date
'number', # int
'task_protocol',
- 'project',
+ 'projects',
)

DATASETS_COLUMNS = (
@@ -73,7 +72,7 @@ def _get_session_info(rel_ses_path):
out['date'] = pd.to_datetime(out['date']).date()
out['number'] = int(out['number'])
out['task_protocol'] = ''
- out['project'] = ''
+ out['projects'] = ''
return out


@@ -192,7 +191,7 @@ def _make_datasets_df(root_dir, hash_files=False) -> pd.DataFrame:
# Go through sessions and append datasets
for session_path in iter_sessions(root_dir):
rows = []
- for rel_dset_path in _iter_datasets(session_path):
+ for rel_dset_path in iter_datasets(session_path):
file_info = _get_dataset_info(session_path, rel_dset_path, compute_hash=hash_files)
assert set(file_info.keys()) <= set(DATASETS_COLUMNS)
rows.append(file_info)
@@ -201,13 +200,6 @@ def _make_datasets_df(root_dir, hash_files=False) -> pd.DataFrame:
return df


- def _iter_datasets(session_path):
-     """Iterate over all files in a session, and yield relative dataset paths."""
-     for p in sorted(Path(session_path).rglob('*.*')):
-         if not p.is_dir() and is_valid(p.name):
-             yield p.relative_to(session_path)


def make_parquet_db(root_dir, out_dir=None, hash_ids=True, hash_files=False, lab=None):
"""
Given a data directory, index the ALF datasets and save the generated cache tables.
25 changes: 22 additions & 3 deletions one/alf/io.py
@@ -364,17 +364,17 @@ def _ls(alfpath, object=None, **kwargs) -> (list, tuple):

def iter_sessions(root_dir):
"""
- Recursively iterate over session paths in a given directory
+ Recursively iterate over session paths in a given directory.

Parameters
----------
root_dir : str, pathlib.Path
- The folder to look for sessions
+ The folder to look for sessions.

Yields
-------
pathlib.Path
- The next session path in lexicographical order
+ The next session path in lexicographical order.
"""
if spec.is_session_path(root_dir):
yield root_dir
@@ -383,6 +383,25 @@ def iter_sessions(root_dir):
yield path


+ def iter_datasets(session_path):
+     """
+     Iterate over all files in a session, and yield relative dataset paths.
+
+     Parameters
+     ----------
+     session_path : str, pathlib.Path
+         The folder to look for datasets.
+
+     Yields
+     -------
+     pathlib.Path
+         The next dataset path (relative to the session path) in lexicographical order.
+     """
+     for p in sorted(Path(session_path).rglob('*.*')):
+         if not p.is_dir() and spec.is_valid(p.name):
+             yield p.relative_to(session_path)


def exists(alfpath, object, attributes=None, **kwargs) -> bool:
"""
Test if ALF object and optionally specific attributes exist in the given path
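A short usage sketch of the newly public iter_datasets; the session path below is hypothetical.

```python
from pathlib import Path
import one.alf.io as alfio

# Hypothetical local session path ending with subject/date/number
session_path = Path.home() / 'data' / 'lab' / 'Subjects' / 'KS023' / '2019-12-10' / '001'
for rel_path in alfio.iter_datasets(session_path):
    print(rel_path)  # dataset paths relative to the session, e.g. alf/_ibl_trials.intervals.npy
```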
32 changes: 19 additions & 13 deletions one/api.py
@@ -38,7 +38,7 @@
class One(ConversionMixin):
"""An API for searching and loading data on a local filesystem"""
_search_terms = (
- 'dataset', 'date_range', 'laboratory', 'number', 'project', 'subject', 'task_protocol'
+ 'dataset', 'date_range', 'laboratory', 'number', 'projects', 'subject', 'task_protocol'
)

def __init__(self, cache_dir=None, mode='auto', wildcards=True):
@@ -99,8 +99,8 @@ def _load_cache(self, cache_dir=None, **kwargs):
continue
meta['loaded_time'] = datetime.now()

- # Convert to str ids
- cache = util.cache_int2str(cache)
+ # Patch older tables
+ cache = util.patch_cache(cache, meta['raw'][table].get('min_api_version'))

# Set the appropriate index if none already set
if isinstance(cache.index, pd.RangeIndex):
@@ -221,7 +221,7 @@ def _update_cache_from_records(self, strict=False, **kwargs):
strict : bool
If not True, the columns don't need to match. Extra columns in input tables are
dropped and missing columns are added and filled with np.nan.
- kwargs
+ **kwargs
pandas.DataFrame or pandas.Series to insert/update for each table
Returns
@@ -385,8 +385,8 @@ def search(self, details=False, query_type=None, **kwargs):
task_protocol : str
The task protocol name (can be partial, i.e. any task protocol containing that str
will be found)
- project : str
- The project name (can be partial, i.e. any task protocol containing that str
+ projects : str, list
+ The project name(s) (can be partial, i.e. any project containing that str
will be found)
details : bool
If true also returns a dict of dataset details
@@ -424,7 +424,7 @@ def sort_fcn(itm):
if sessions.size == 0:
return ([], None) if details else []
# String fields
- elif key in ('subject', 'task_protocol', 'laboratory', 'project'):
+ elif key in ('subject', 'task_protocol', 'laboratory', 'projects'):
query = '|'.join(util.ensure_list(value))
key = 'lab' if key == 'laboratory' else key
mask = sessions[key].str.contains(query, regex=self.wildcards)
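With the rename from 'project' to 'projects', local searches use the plural term; a brief illustrative example (the cache directory and project name are hypothetical):

```python
from one.api import One

one = One(cache_dir='/data/ONE')  # hypothetical local cache location
# 'projects' replaces 'project' as a search term; partial names match, and a
# list of values is combined with a logical OR (joined with '|', as above).
eids = one.search(projects='brainwide', details=False)
```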
@@ -869,7 +869,7 @@ def load_object(self,
download_only : bool
When true the data are downloaded and the file path is returned. NB: The order of the
file path list is undefined.
- kwargs : dict
+ **kwargs
Additional filters for datasets, including namespace and timescale. For full list
see the one.alf.spec.describe function.
@@ -1201,7 +1201,7 @@ def load_collection(self,
Query cache ('local') or Alyx database ('remote')
download_only : bool
When true the data are downloaded and the file path is returned.
- kwargs : dict
+ **kwargs
Additional filters for datasets, including namespace and timescale. For full list
see the one.alf.spec.describe function.
@@ -1268,6 +1268,9 @@ def setup(cache_dir=None, silent=False, **kwargs):
silent : (False) bool
when True will prompt for cache_dir if cache_dir is None, and overwrite cache if any
when False will use cwd for cache_dir if cache_dir is None and use existing cache
+ **kwargs
+ Optional arguments to pass to one.alf.cache.make_parquet_db.
Returns
-------
One
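A sketch of the setup call with a forwarded keyword argument; the data directory is hypothetical, and hash_files is a make_parquet_db parameter per one/alf/cache.py.

```python
from one.api import One

# Index an ALF data directory and return a One instance; extra keyword
# arguments such as hash_files are forwarded to one.alf.cache.make_parquet_db.
one = One.setup('/data/alf', hash_files=False)
```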
@@ -1583,7 +1586,7 @@ def search(self, details=False, query_type=None, **kwargs):
task_protocol : str, list
The task protocol name (can be partial, i.e. any task protocol containing that str
will be found)
- project : str, list
+ project(s) : str, list
The project name (can be partial, i.e. any task protocol containing that str
will be found)
performance_lte / performance_gte : float
@@ -1665,10 +1668,10 @@ def _download_datasets(self, dsets, **kwargs) -> List[Path]:
_logger.debug(ex)
return self._download_dataset(dsets, **kwargs)

- def _download_aws(self, dsets, update_exists=True, **kwargs) -> List[Path]:
+ def _download_aws(self, dsets, update_exists=True, **_) -> List[Path]:
# Download datasets from AWS
import one.remote.aws as aws
- s3, bucket_name = aws.get_s3_from_alyx(self)
+ s3, bucket_name = aws.get_s3_from_alyx(self.alyx)
if self._index_type() is int:
raise NotImplementedError('AWS download only supported for str index cache')
assert self.mode != 'local'
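The AWS helpers now take an AlyxClient rather than a OneAlyx instance, in line with one.remote.globus; a minimal sketch of the new calling convention:

```python
from one.api import ONE
import one.remote.aws as aws

one = ONE()  # a OneAlyx instance connected to the default Alyx database
# Pass the AlyxClient (one.alyx) rather than the OneAlyx instance itself
s3, bucket_name = aws.get_s3_from_alyx(one.alyx)
```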
@@ -1908,6 +1911,8 @@ def setup(base_url=None, **kwargs):
----------
base_url : str
An Alyx database URL. If None, the current default database is used.
+ **kwargs
+ Optional arguments to pass to one.params.setup.
Returns
-------
@@ -2171,9 +2176,10 @@ def get_details(self, eid: str, full: bool = False, query_type=None):
if full:
return dets
# If it's not full return the normal output like from a one.search
- det_fields = ['subject', 'start_time', 'number', 'lab', 'project',
+ det_fields = ['subject', 'start_time', 'number', 'lab', 'projects',
'url', 'task_protocol', 'local_path']
out = {k: v for k, v in dets.items() if k in det_fields}
+ out['projects'] = ','.join(out['projects'])
out.update({'local_path': self.eid2path(eid),
'date': datetime.fromisoformat(out['start_time']).date()})
return out
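Because the session record's 'projects' field is now a list, get_details joins it into a comma-separated string for the non-full output; illustrative only (the eid lookup is hypothetical):

```python
from one.api import ONE

one = ONE()
eid = one.search(projects='brainwide')[0]  # hypothetical query
details = one.get_details(eid)
print(details['projects'])  # e.g. 'projectA,projectB'
```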
22 changes: 14 additions & 8 deletions one/registration.py
@@ -81,22 +81,24 @@ def create_sessions(self, root_data_folder, glob_pattern='**/create_me.flag', dr
flag_file.unlink()
return [ff.parent for ff in flag_files], records

- def create_session(self, session_path) -> dict:
+ def create_session(self, session_path, **kwargs) -> dict:
"""Create a remote session on Alyx from a local session path, without registering files
Parameters
----------
session_path : str, pathlib.Path
- The path ending with subject/date/number
+ The path ending with subject/date/number.
+ **kwargs
+ Optional arguments for RegistrationClient.register_session.
Returns
-------
dict
- Newly created session record
+ Newly created session record.
"""
- return self.register_session(session_path, file_list=False)[0]
+ return self.register_session(session_path, file_list=False, **kwargs)[0]

- def create_new_session(self, subject, session_root=None, date=None, register=True):
+ def create_new_session(self, subject, session_root=None, date=None, register=True, **kwargs):
"""Create a new local session folder and optionally create session record on Alyx
Parameters
@@ -110,12 +112,14 @@ def create_new_session(self, subject, session_root=None, date=None, register=Tru
An optional date for the session. If None the current time is used.
register : bool
If true, create session record on Alyx database
+ **kwargs
+ Optional arguments for RegistrationClient.register_session.
Returns
-------
pathlib.Path
New local session path
- str
+ uuid.UUID
The experiment UUID if register is True
Examples
@@ -139,7 +143,7 @@ def create_new_session(self, subject, session_root=None, date=None, register=Tru
session_root = Path(session_root or self.one.alyx.cache_dir) / subject / date[:10]
session_path = session_root / alfio.next_num_folder(session_root)
session_path.mkdir(exist_ok=True, parents=True) # Ensure folder exists on disk
- eid = UUID(self.create_session(session_path)['url'][-36:]) if register else None
+ eid = UUID(self.create_session(session_path, **kwargs)['url'][-36:]) if register else None
return session_path, eid

def find_files(self, session_path):
@@ -248,7 +252,7 @@ def register_session(self, ses_path, users=None, file_list=True, **kwargs):
The total number of completed trials (optional)
json : dict, str
Optional JSON data
- project: str, list
+ projects: str, list
The project(s) to which the experiment belongs (optional)
type : str
The experiment type, e.g. 'Experiment', 'Base'
@@ -311,6 +315,8 @@ def register_session(self, ses_path, users=None, file_list=True, **kwargs):
assert start_time[:10] == details['date'], 'start_time doesn\'t match session path'
if kwargs.get('procedures', False):
ses_['procedures'] = ensure_list(kwargs.pop('procedures'))
+ if kwargs.get('projects', False):
+     ses_['projects'] = ensure_list(kwargs.pop('projects'))
assert ('subject', 'number') not in kwargs
if 'lab' not in kwargs and details['lab']:
kwargs.update({'lab': details['lab']})
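Since create_session and create_new_session now forward keyword arguments to register_session, metadata such as projects and procedures can be set at creation time; a hedged sketch (the subject nickname and project name are hypothetical):

```python
from one.api import ONE
from one.registration import RegistrationClient

client = RegistrationClient(one=ONE())
session_path, eid = client.create_new_session(
    'KS023',                                   # subject nickname (hypothetical)
    register=True,
    projects=['ibl_neuropixel_brainwide_01'],  # forwarded to register_session
    procedures=['Behavior training/tasks'],
)
```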