Merge pull request #72 from darintay/subset-order

Optimize gctx subsetting order for memory usage.
cmap · Jun 17, 2021 · c07e44f · c07e44f
2 parents efe1945 + b900035
commit c07e44f
Showing 1 changed file with 17 additions and 7 deletions.
diff --git a/cmapPy/pandasGEXpress/parse_gctx.py b/cmapPy/pandasGEXpress/parse_gctx.py
@@ -369,16 +369,26 @@ def parse_data_df(data_dset, ridx, cidx, row_meta, col_meta):
  -row_meta (pandas DataFrame): the parsed-in row metadata
  -col_meta (pandas DataFrame): the parsed-in col metadata
  """
- if len(ridx) == len(row_meta.index) and len(cidx) == len(col_meta.index): # no subset
+ total_rows = len(row_meta.index)
+ total_cols = len(col_meta.index)
+ if len(ridx) == total_rows and len(cidx) == total_cols: # no subset
  data_array = np.empty(data_dset.shape, dtype=np.float32)
  data_dset.read_direct(data_array)
  data_array = data_array.transpose()
- elif len(ridx) <= len(cidx):
- first_subset = data_dset[:, ridx].astype(np.float32)
- data_array = first_subset[cidx, :].transpose()
- elif len(cidx) < len(ridx):
- first_subset = data_dset[cidx, :].astype(np.float32)
- data_array = first_subset[:, ridx].transpose()
+ else:
+ # h5py only lets us subset on a single dimension per read, so the
+ # subset is taken in two steps. For the first step, pick the dimension
+ # that minimizes the size of the intermediate array (lower memory use).
+ row_first_count = total_cols * len(ridx)
+ col_first_count = total_rows * len(cidx)
+
+ if row_first_count < col_first_count:
+ first_subset = data_dset[:, ridx].astype(np.float32)
+ data_array = first_subset[cidx, :].transpose()
+ else:
+ first_subset = data_dset[cidx, :].astype(np.float32)
+ data_array = first_subset[:, ridx].transpose()
+
  # make DataFrame instance
  data_df = pd.DataFrame(data_array, index=row_meta.index[ridx], columns=col_meta.index[cidx])
  return data_df