Improve manipulation of elites by modifying as_pandas (icaros-usc#158)
- Remove old data attributes
- Remove references to old data attributes
- Implement ArchiveDataFrame and new as_pandas
- Test ArchiveDataFrame (as_pandas tests do not need to change)
- Fix tutorials
- Update docstring for as_pandas
btjanaka authored Jul 16, 2021
1 parent 51cc071 commit 121bcc9
Showing 10 changed files with 344 additions and 204 deletions.
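For orientation, the sketch below shows the workflow this commit enables: `as_pandas()` now returns an `ArchiveDataFrame` whose batch accessors and `iterelites()` replace the removed `archive.solutions`, `archive.objective_values`, and related attributes. The archive construction, dimensions, and random solutions are illustrative only (direct `initialize()`/`add()` calls stand in for the usual optimizer loop) and are not part of the diff.

```python
import numpy as np

from ribs.archives import GridArchive

# Illustrative archive setup (not part of this commit).
archive = GridArchive([10, 10], [(-1, 1), (-1, 1)])
archive.initialize(3)  # solution_dim = 3
for _ in range(100):
    sol = np.random.uniform(-1, 1, 3)
    archive.add(sol, sol.sum(), sol[:2])  # solution, objective, behavior values

# as_pandas() now returns an ArchiveDataFrame (a pandas.DataFrame subclass).
df = archive.as_pandas()

# Batch accessors replace the removed data attributes on the archive itself.
objectives = df.batch_objectives()  # one entry per elite
solutions = df.batch_solutions()    # one row per elite

# iterelites() yields one Elite per row, e.g. for post-hoc evaluation.
for elite in df.iterelites():
    print(elite.sol)
```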
49 changes: 33 additions & 16 deletions docs/_templates/autosummary/class.rst
@@ -10,25 +10,38 @@
.. currentmodule:: {{ module }}

.. autoclass:: {{ objname }}
{%- if name == "ArchiveDataFrame" %}
:no-inherited-members:
:members:
{% endif %}

{% block methods %}

{% if methods %}
.. rubric:: {{ _('Methods') }}

.. autosummary::
{% for item in all_methods %}
{%- if not item.startswith('_') or item in ['__len__',
'__call__',
'__next__',
'__iter__',
'__getitem__',
'__setitem__',
'__delitem__',
] %}
~{{ name }}.{{ item }}
{%- endif -%}
{%- endfor %}
{% if name == "ArchiveDataFrame" %}
~{{ name }}.batch_behaviors
~{{ name }}.batch_indices
~{{ name }}.batch_metadata
~{{ name }}.batch_objectives
~{{ name }}.batch_solutions
~{{ name }}.iterelites
{% else %}
{% for item in all_methods %}
{%- if not item.startswith('_') or item in ['__len__',
'__call__',
'__next__',
'__iter__',
'__getitem__',
'__setitem__',
'__delitem__',
] %}
~{{ name }}.{{ item }}
{%- endif -%}
{%- endfor %}
{% endif %}

{% endif %}
{% endblock %}

@@ -37,8 +50,12 @@
.. rubric:: {{ _('Attributes') }}

.. autosummary::
{% for item in attributes %}
~{{ name }}.{{ item }}
{%- endfor %}
{% if name == "ArchiveDataFrame" %}
{% else %}
{% for item in attributes %}
~{{ name }}.{{ item }}
{%- endfor %}
{% endif %}

{% endif %}
{% endblock %}
4 changes: 3 additions & 1 deletion examples/tutorials/lsi_mnist.ipynb
@@ -480,9 +480,11 @@
"\n",
" imgs = []\n",
" img_size = (28, 28)\n",
" df = archive.as_pandas()\n",
" solutions, indices = df.batch_solutions(), df.batch_indices()\n",
" for index in grid_indices:\n",
" try:\n",
" sol = archive.solutions[archive.indices.index(index)]\n",
" sol = solutions[indices.index(index)]\n",
" except ValueError:\n",
" print(f\"There is no solution at index {index}.\")\n",
" return\n",
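For reference, the lookup pattern introduced in this cell amounts to the following; `archive` is assumed to be the populated GridArchive from the tutorial, and the specific grid index is hypothetical.

```python
df = archive.as_pandas()
solutions, indices = df.batch_solutions(), df.batch_indices()

# batch_indices() returns a sequence of archive indices (tuples for a
# GridArchive), so .index() locates the row for a particular grid cell and
# raises ValueError when that cell is empty.
grid_index = (5, 7)  # hypothetical cell
try:
    sol = solutions[indices.index(grid_index)]
except ValueError:
    print(f"There is no solution at index {grid_index}.")
```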
9 changes: 5 additions & 4 deletions examples/tutorials/lunar_lander.ipynb
@@ -603,7 +603,8 @@
"metadata": {},
"outputs": [],
"source": [
"high_perf_sols = archive.solutions[archive.objective_values > 200]"
"df = archive.as_pandas()\n",
"high_perf_sols = df.query(\"objective > 200\").sort_values(\"objective\", ascending=False)"
]
},
{
@@ -733,8 +734,8 @@
],
"source": [
"if len(high_perf_sols) > 0:\n",
" for sol in high_perf_sols[[0, len(high_perf_sols) // 2, -1]]:\n",
" display_video(sol)"
" for elite in high_perf_sols.loc[[0, len(high_perf_sols) // 2, -1]].iterelites():\n",
" display_video(elite.sol)"
]
},
{
@@ -801,7 +802,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.8"
"version": "3.7.3"
}
},
"nbformat": 4,
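Putting the lunar lander changes above together, the selection-and-replay pattern now looks roughly like this; `archive` and `display_video` are assumed to come from earlier tutorial cells, and positional indexing (`iloc`) picks the best, median, and worst of the elites above the threshold.

```python
df = archive.as_pandas()

# Ordinary pandas operations filter and rank the elites...
high_perf_sols = df.query("objective > 200").sort_values("objective", ascending=False)

# ...and iterelites() converts selected rows back into Elite objects, whose
# solution vector is available as elite.sol.
if len(high_perf_sols) > 0:
    rows = [0, len(high_perf_sols) // 2, -1]  # best, median, worst above 200
    for elite in high_perf_sols.iloc[rows].iterelites():
        display_video(elite.sol)
```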
3 changes: 3 additions & 0 deletions ribs/archives/__init__.py
@@ -13,10 +13,12 @@
ribs.archives.ArchiveBase
ribs.archives.AddStatus
ribs.archives.Elite
ribs.archives.ArchiveDataFrame
ribs.archives.ArchiveStats
"""
from ribs.archives._add_status import AddStatus
from ribs.archives._archive_base import ArchiveBase
from ribs.archives._archive_data_frame import ArchiveDataFrame
from ribs.archives._archive_stats import ArchiveStats
from ribs.archives._cvt_archive import CVTArchive
from ribs.archives._elite import Elite
@@ -30,5 +32,6 @@
"ArchiveBase",
"AddStatus",
"Elite",
"ArchiveDataFrame",
"ArchiveStats",
]
154 changes: 28 additions & 126 deletions ribs/archives/_archive_base.py
@@ -4,10 +4,10 @@

import numba as nb
import numpy as np
import pandas as pd
from decorator import decorator

from ribs.archives._add_status import AddStatus
from ribs.archives._archive_data_frame import ArchiveDataFrame
from ribs.archives._archive_stats import ArchiveStats
from ribs.archives._elite import Elite

@@ -75,31 +75,6 @@ def get(self, max_val):
return val


class CachedView:
"""Maintains a readonly view of the given numpy array.
Whenever the state changes in update(), the view is updated.
This class is useful when returning the archive data, e.g.
archive.solutions. If the archive has many indices, indexing into the array
can be expensive (e.g. ~0.5 seconds for 250k indices), and it adds up if the
user does this many times, so we only want to do the indexing once.
"""

def __init__(self, array):
self.array = array
self.view = None
self.state = None

def update(self, indices, state):
"""Sets view to array[indices], but only if state has changed."""
if state != self.state:
self.state = state.copy()
self.view = self.array[indices]
self.view.flags.writeable = False
return self.view


class ArchiveIterator:
"""An iterator for an archive's elites."""

@@ -163,13 +138,6 @@ class ArchiveBase(ABC): # pylint: disable = too-many-instance-attributes
| ``_metadata`` | ``(*storage_dims)`` |
+------------------------+------------------------------------+
.. note::
These arrays are different from the elite data attributes
:attr:`solutions`, :attr:`objective_values`, :attr:`behavior_values`,
and :attr:`metadata`. The attributes provide access to data about elites
in the archive via a view into these arrays.
All of these arrays are accessed via a common index. If we have index ``i``,
we access its solution at ``_solutions[i]``, its behavior values at
``_behavior_values[i]``, etc.
@@ -254,12 +222,6 @@ def __init__(self, storage_dims, behavior_dim, seed=None, dtype=np.float64):
self._bins = np.product(self._storage_dims)
self._stats = None

# Array views for providing access to data.
self._solutions_view = None
self._objective_values_view = None
self._behavior_values_view = None
self._metadata_view = None

# Tracks archive modifications by counting calls to clear() and add().
self._state = None

@@ -327,73 +289,6 @@ def dtype(self):
values."""
return self._dtype

## Data attributes ##

@property
@require_init
def solutions(self):
"""((len(archive), solution_dim) numpy.ndarray): Solutions of all elites
currently in the archive."""
return self._solutions_view.update(self._occupied_indices_cols,
self._state)

@property
@require_init
def objective_values(self):
"""(len(archive),) numpy.ndarray): Objective values of all elites
currently in the archive.
These correspond to :attr:`solutions`, e.g. ``objective_values[0]``
corresponds to ``solutions[0]``.
"""
return self._objective_values_view.update(self._occupied_indices_cols,
self._state)

@property
@require_init
def behavior_values(self):
"""(len(archive), behavior_dim) numpy.ndarray): Behavior values of all
elites currently in the archive.
These correspond to :attr:`solutions`, e.g. ``behavior_values[0]``
corresponds to ``solutions[0]``.
"""
return self._behavior_values_view.update(self._occupied_indices_cols,
self._state)

@property
@require_init
def indices(self):
"""(len(archive),) tuple: Tuple with indices of all elites in the
archive.
Each entry in the tuple is an index, which can be either an int or tuple
of int (see :meth:`get_index` for the specific archive for more info).
These correspond to :attr:`solutions`, e.g. ``indices[0]`` corresponds
to ``solutions[0]``.
This is a tuple instead of a numpy array because numpy arrays are unable
to (easily) store tuples directly.
"""
return tuple(self._occupied_indices) # List to tuple is cheap.

@property
@require_init
def metadata(self):
"""(len(archive),) numpy.ndarray): Metadata of all elites currently in
the archive.
This array is an object array.
These correspond to :attr:`solutions`, e.g. ``metadata[0]`` corresponds
to ``solutions[0]``.
"""
return self._metadata_view.update(self._occupied_indices_cols,
self._state)

## Methods ##

def __len__(self):
"""Number of elites in the archive."""
require_init_inline(self)
@@ -463,13 +358,8 @@ def initialize(self, solution_dim):
self._occupied_indices_cols = tuple(
[] for _ in range(len(self._storage_dims)))

self._solutions_view = CachedView(self._solutions)
self._objective_values_view = CachedView(self._objective_values)
self._behavior_values_view = CachedView(self._behavior_values)
self._metadata_view = CachedView(self._metadata)
self._state = {"clear": 0, "add": 0}

self._stats_reset()
self._state = {"clear": 0, "add": 0}

@require_init
def clear(self):
@@ -691,19 +581,21 @@ def get_random_elite(self):
)

def as_pandas(self, include_solutions=True, include_metadata=False):
"""Converts the archive into a Pandas dataframe.
"""Converts the archive into an :class:`ArchiveDataFrame` (a child class
of :class:`pandas.DataFrame`).
This base class implementation creates a dataframe consisting of:
The implementation of this method in :class:`ArchiveBase` creates a
dataframe consisting of:
- ``len(self._storage_dims)`` columns for the index, named
``index_0, index_1, ...`` In :class:`~ribs.archives.GridArchive` and
:class:`~ribs.archives.SlidingBoundariesArchive`, there are
:attr:`behavior_dim` columns. In :class:`~ribs.archives.CVTArchive`,
there is just one column. See :meth:`get_index` for more info.
- ``self._behavior_dim`` columns for the behavior characteristics, named
- :attr:`behavior_dim` columns for the behavior characteristics, named
``behavior_0, behavior_1, ...``
- 1 column for the objective values, named ``objective``
- ``solution_dim`` columns for the solution vectors, named
- :attr:`solution_dim` columns for the solution vectors, named
``solution_0, solution_1, ...``
- 1 column for the metadata objects, named ``metadata``
@@ -715,31 +607,41 @@ def as_pandas(self, include_solutions=True, include_metadata=False):
| | ... | | ... | | | ... | |
+---------+------+-------------+------+------------+-------------+-----+----------+
Compared to :class:`pandas.DataFrame`, the :class:`ArchiveDataFrame`
adds methods and attributes which make it easier to manipulate archive
data. For more information, refer to the :class:`ArchiveDataFrame`
documentation.
Args:
include_solutions (bool): Whether to include solution columns.
include_metadata (bool): Whether to include the metadata column.
Note that methods like :meth:`~pandas.DataFrame.to_csv` may not
properly save the dataframe since the metadata objects may not
be representable in a CSV.
Returns:
pandas.DataFrame: See above.
ArchiveDataFrame: See above.
""" # pylint: disable = line-too-long
data = OrderedDict()
indices = self._occupied_indices_cols

index_dim = len(self._storage_dims)
for i in range(index_dim):
data[f"index_{i}"] = np.asarray(self._occupied_indices_cols[i],
dtype=int)
for i, col in enumerate(indices):
data[f"index_{i}"] = np.asarray(col, dtype=int)

behavior_values = self._behavior_values[indices]
for i in range(self._behavior_dim):
data[f"behavior_{i}"] = self.behavior_values[:, i]
data[f"behavior_{i}"] = behavior_values[:, i]

data["objective"] = self.objective_values
data["objective"] = self._objective_values[indices]

if include_solutions:
solutions = self._solutions[indices]
for i in range(self._solution_dim):
data[f"solution_{i}"] = self.solutions[:, i]
data[f"solution_{i}"] = solutions[:, i]

if include_metadata:
data["metadata"] = self.metadata
return pd.DataFrame(data)
data["metadata"] = self._metadata[indices]

return ArchiveDataFrame(
data,
copy=False, # Fancy indexing above already results in copying.
)
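To make the column layout documented in the new `as_pandas` docstring concrete, here is a small sketch; the archive setup and values are illustrative, and the expected column names follow from the docstring (index, behavior, objective, solution, and optionally metadata columns).

```python
from ribs.archives import GridArchive

# Illustrative 2D grid archive with 3-dimensional solutions.
archive = GridArchive([20, 20], [(-1, 1), (-1, 1)])
archive.initialize(3)
archive.add([0.1, 0.2, 0.3], 1.5, [0.0, 0.0])

df = archive.as_pandas()
print(df.columns.tolist())
# Expected, per the docstring:
# ['index_0', 'index_1', 'behavior_0', 'behavior_1', 'objective',
#  'solution_0', 'solution_1', 'solution_2']

# include_metadata=True appends a 'metadata' object column; as the docstring
# notes, such a frame may not survive a round trip through to_csv().
df_meta = archive.as_pandas(include_metadata=True)
```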
(The remaining 5 changed files in this commit are not shown here.)
