Skip to content

Commit

Permalink
clean up for natcap#1419
Browse files Browse the repository at this point in the history
  • Loading branch information
emlys committed Nov 8, 2023
1 parent 5d5c4ef commit 1effbb6
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 25 deletions.
2 changes: 2 additions & 0 deletions src/natcap/invest/coastal_vulnerability.py
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,8 @@
"Shore points with associated habitat data"),
"index_col": "shore_id",
"columns": {
# shore_id and R_hab come first so that they get
# matched before [HABITAT], which matches everything
"shore_id": {
"type": "integer",
"about": "Shore point ID"
Expand Down
32 changes: 7 additions & 25 deletions src/natcap/invest/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -600,48 +600,30 @@ def expand_path(path, base_path):
def read_csv_to_dataframe(path, **kwargs):
"""Return a dataframe representation of the CSV.
Wrapper around ``pandas.read_csv`` that performs some common data cleaning
based on information in the arg spec.
Columns are filtered to just those that match a pattern in the spec.
Column names are lowercased and whitespace is stripped off. Empty rows are
dropped. Values in each column are processed and cast to an appropriate
dtype according to the type in the spec:
- Values in raster, vector, csv, file, and directory columns are cast to
str, whitespace stripped, and expanded as paths relative to the input path
- Values in freestyle_string and option_string columns are cast to str,
whitespace stripped, and converted to lowercase
- Values in number, ratio, and percent columns are cast to float
- Values in integer columns are cast to int
- Values in boolean columns are cast to bool
Empty or NA cells are returned as ``numpy.nan`` (for floats) or
``pandas.NA`` (for all other types).
Also sets custom defaults for some kwargs passed to ``pandas.read_csv``,
which you can override with kwargs:
Wrapper around ``pandas.read_csv`` that performs some common data cleaning.
Column names are lowercased and whitespace is stripped off. Empty rows and
columns are dropped. Sets custom defaults for some kwargs passed to
``pandas.read_csv``, which you can override with kwargs:
- sep=None: lets the Python engine infer the separator
- engine='python': The 'python' engine supports the sep=None option.
- encoding='utf-8-sig': 'utf-8-sig' handles UTF-8 with or without BOM.
- index_col=False: force pandas not to index by any column, useful in
case of trailing separators
Args:
path (str): path to a CSV file
spec (dict): dictionary specifying the structure of the CSV table
**kwargs: additional kwargs will be passed to ``pandas.read_csv``
Returns:
pandas.DataFrame with the contents of the given CSV
"""
try:
# set index_col=False to force pandas not to index by any column
# this is useful in case of trailing separators
# we'll explicitly set the index column later on
df = pandas.read_csv(
path,
index_col=False,
**{
'index_col': False,
'sep': None,
'engine': 'python',
'encoding': 'utf-8-sig',
Expand Down
2 changes: 2 additions & 0 deletions src/natcap/invest/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -641,6 +641,8 @@ def get_validated_dataframe(csv_path, columns=None, rows=None, index_col=None,
df[col] = df[col].astype(pandas.Int64Dtype())
elif col_spec['type'] == 'boolean':
df[col] = df[col].astype('boolean')
else:
raise ValueError(f'Unknown type: {col_spec["type"]}')
except Exception as err:
raise ValueError(
f'Value(s) in the "{col}" column could not be interpreted '
Expand Down
4 changes: 4 additions & 0 deletions src/natcap/invest/wave_energy.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,8 @@
},
"machine_param_path": {
"type": "csv",
# use columns because of the non-standard format of this table;
# we cannot validate it with the rows as headers.
"columns": {
"name": {
"type": "freestyle_string",
Expand Down Expand Up @@ -329,6 +331,8 @@
},
"machine_econ_path": {
"type": "csv",
# use columns because of the non-standard format of this table;
# we cannot validate it with the rows as headers.
"columns": {
"name": {
"type": "freestyle_string",
Expand Down

0 comments on commit 1effbb6

Please sign in to comment.