Dataframe API v2 rerun-io#1: API definitions (rerun-io#7559)

The new public API definition and nothing else. Speak now. * Part of rerun-io#7495 * Requires rerun-io#7558 --------- Co-authored-by: Jeremy Leibs <[email protected]>
TeoNikolov · Oct 2, 2024 · 3581ca4 · 3581ca4
1 parent e5ae198
commit 3581ca4
Show file tree

Hide file tree

Showing 3 changed files with 223 additions and 3 deletions.
diff --git a/crates/store/re_chunk_store/src/dataframe.rs b/crates/store/re_chunk_store/src/dataframe.rs
@@ -1,6 +1,6 @@
 //! All the APIs used specifically for `re_dataframe`.
 
-use std::collections::BTreeSet;
+use std::collections::{BTreeMap, BTreeSet};
 
 use ahash::HashSet;
 use arrow2::{
@@ -242,6 +242,8 @@ pub struct ComponentColumnDescriptor {
     pub store_datatype: ArrowDatatype,
 
     /// How the data will be joined into the resulting `RecordBatch`.
+    //
+    // TODO(cmc): remove with the old re_dataframe.
     pub join_encoding: JoinEncoding,
 
     /// Whether this column represents static data.
@@ -445,9 +447,13 @@ impl From<ComponentColumnSelector> for ColumnSelector {
 /// Select a control column.
 ///
 /// The only control column currently supported is `rerun.components.RowId`.
+//
+// TODO(cmc): `RowId` shouldn't be a control column at this point, it should be yet another index.
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub struct ControlColumnSelector {
     /// Name of the control column.
+    //
+    // TODO(cmc): this should be `component_name`.
     pub component: ComponentName,
 }
 
@@ -470,6 +476,9 @@ impl From<ControlColumnDescriptor> for ControlColumnSelector {
 }
 
 /// Select a time column.
+//
+// TODO(cmc): This shouldn't be specific to time, this should be an `IndexColumnSelector` or smth.
+// Particularly unfortunate that this one already leaks into the public API…
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub struct TimeColumnSelector {
     /// The name of the timeline.
@@ -497,9 +506,13 @@ pub struct ComponentColumnSelector {
     pub entity_path: EntityPath,
 
     /// Semantic name associated with this data.
+    //
+    // TODO(cmc): this should be `component_name`.
     pub component: ComponentName,
 
     /// How to join the data into the `RecordBatch`.
+    //
+    // TODO(cmc): remove once old `re_dataframe` is gone.
     pub join_encoding: JoinEncoding,
 }
 
@@ -701,6 +714,191 @@ impl std::fmt::Display for RangeQueryExpression {
     }
 }
 
+// --- Queries v2 ---
+
+/// Specifies how null values should be filled in the returned dataframe.
+#[derive(Default, Debug, Clone, PartialEq, Eq, Hash)]
+pub enum SparseFillStrategy {
+    /// No sparse filling. Nulls stay nulls.
+    #[default]
+    None,
+
+    /// Fill null values using global-scope latest-at semantics.
+    ///
+    /// The latest-at semantics are applied on the entire dataset as opposed to just the current
+    /// view contents: it is possible to end up with values from outside the view!
+    LatestAtGlobal,
+    //
+    // TODO(cmc): `LatestAtView`?
+}
+
+impl std::fmt::Display for SparseFillStrategy {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::None => f.write_str("none"),
+            Self::LatestAtGlobal => f.write_str("latest-at (global)"),
+        }
+    }
+}
+
+/// The view contents specify which subset of the database (i.e., which columns) the query runs on,
+/// expressed as a set of [`EntityPath`]s and their associated [`ComponentName`]s.
+///
+/// Setting an entity's components to `None` means: everything.
+///
+// TODO(cmc): we need to be able to build that easily in a command-line context, otherwise it's just
+// very annoying. E.g. `--with /world/points:[rr.Position3D, rr.Radius] --with /cam:[rr.Pinhole]`.
+pub type ViewContents = BTreeMap<EntityPath, Option<BTreeSet<ComponentName>>>;
+
+// TODO(cmc): Ultimately, this shouldn't be hardcoded to `Timeline`, but to a generic `I: Index`.
+//            `Index` in this case should also be implemented on tuples (`(I1, I2, ...)`).
+pub type Index = Timeline;
+
+// TODO(cmc): Ultimately, this shouldn't be hardcoded to `TimeInt`, but to a generic `I: Index`.
+//            `Index` in this case should also be implemented on tuples (`(I1, I2, ...)`).
+pub type IndexValue = TimeInt;
+
+// TODO(cmc): Ultimately, this shouldn't be hardcoded to `ResolvedTimeRange`, but to a generic `I: Index`.
+//            `Index` in this case should also be implemented on tuples (`(I1, I2, ...)`).
+pub type IndexRange = ResolvedTimeRange;
+
+/// Describes a complete query for Rerun's dataframe API.
+///
+/// ## Terminology: view vs. selection vs. filtering vs. sampling
+///
+/// * The view contents specify which subset of the database (i.e., which columns) the query runs on,
+///   expressed as a set of [`EntityPath`]s and their associated [`ComponentName`]s.
+///
+/// * The filters filter out _rows_ of data from the view contents.
+///   A filter cannot possibly introduce new rows, it can only remove existing ones from the view contents.
+///
+/// * The samplers sample _rows_ of data from the view contents at user-specified values.
+///   Samplers don't necessarily return existing rows: they might introduce new ones if the sampled value
+///   isn't present in the view contents in the first place.
+///
+/// * The selection applies last and samples _columns_ of data from the filtered/sampled view contents.
+///   Selecting a column that isn't present in the view contents results in an empty column in the
+///   final dataframe (null array).
+///
+/// A very rough mental model, in SQL terms:
+/// ```text
+/// SELECT <Self::selection> FROM <Self::view_contents> WHERE <Self::filtered_*>
+/// ```
+//
+// TODO(cmc): ideally we'd like this to be the same type as the one used in the blueprint, possibly?
+// TODO(cmc): Get rid of all re_dataframe (as opposed to re_dataframe2) stuff and rename this.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct QueryExpression2 {
+    /// The subset of the database that the query will run on: a set of [`EntityPath`]s and their
+    /// associated [`ComponentName`]s.
+    ///
+    /// Defaults to `None`, which means: everything.
+    ///
+    /// Example (pseudo-code):
+    /// ```text
+    /// view_contents = {
+    ///   "world/points": [rr.Position3D, rr.Radius],
+    ///   "metrics": [rr.Scalar]
+    /// }
+    /// ```
+    pub view_contents: Option<ViewContents>,
+
+    /// The index used to filter out _rows_ from the view contents.
+    ///
+    /// Only rows where at least 1 column contains non-null data at that index will be kept in the
+    /// final dataset.
+    ///
+    /// Example: `Timeline("frame")`.
+    //
+    // TODO(cmc): this has to be a selector otherwise this is a horrible UX.
+    pub filtered_index: Timeline,
+
+    /// The range of index values used to filter out _rows_ from the view contents.
+    ///
+    /// Only rows where at least 1 column of the view contents contains non-null data within that range
+    /// will be kept in the final dataset.
+    ///
+    /// This is ignored if [`QueryExpression2::sampled_index_values`] is set.
+    ///
+    /// Example: `ResolvedTimeRange(10, 20)`.
+    pub filtered_index_range: Option<IndexRange>,
+
+    /// TODO(cmc): NOT IMPLEMENTED.
+    ///
+    /// The specific index values used to filter out _rows_ from the view contents.
+    ///
+    /// Only rows where at least 1 column contains non-null data at these specific values will be kept
+    /// in the final dataset.
+    ///
+    /// This is ignored if [`QueryExpression2::sampled_index_values`] is set.
+    ///
+    /// Example: `[TimeInt(12), TimeInt(14)]`.
+    pub filtered_index_values: Option<BTreeSet<IndexValue>>,
+
+    /// TODO(cmc): NOT IMPLEMENTED.
+    ///
+    /// The specific index values used to sample _rows_ from the view contents.
+    ///
+    /// The final dataset will contain one row per sampled index value, regardless of whether data
+    /// existed for that index value in the view contents.
+    ///
+    /// The order of the samples will be respected in the final result.
+    ///
+    /// If [`QueryExpression2::sampled_index_values`] is set, it overrides both [`QueryExpression2::filtered_index_range`]
+    /// and [`QueryExpression2::filtered_index_values`].
+    ///
+    /// Example: `[TimeInt(12), TimeInt(14)]`.
+    // TODO(jleibs): We need an alternative name for sampled.
+    pub sampled_index_values: Option<Vec<IndexValue>>,
+
+    /// TODO(cmc): NOT IMPLEMENTED.
+    ///
+    /// The component column used to filter out _rows_ from the view contents.
+    ///
+    /// Only rows where this column contains non-null data will be kept in the final dataset.
+    ///
+    /// Example: `ComponentColumnSelector("rerun.components.Position3D")`.
+    //
+    // TODO(cmc): multi-pov support
+    pub filtered_point_of_view: Option<ComponentColumnSelector>,
+
+    /// TODO(cmc): NOT IMPLEMENTED.
+    ///
+    /// Specifies how null values should be filled in the returned dataframe.
+    ///
+    /// Defaults to [`SparseFillStrategy::None`].
+    pub sparse_fill_strategy: SparseFillStrategy,
+
+    /// The specific _columns_ to sample from the final view contents.
+    ///
+    /// The order of the samples will be respected in the final result.
+    ///
+    /// Defaults to `None`, which means: everything.
+    ///
+    /// Example: `[ColumnSelector(Time("log_time")), ColumnSelector(Component("rerun.components.Position3D"))]`.
+    //
+    // TODO(cmc): the selection has to be on the QueryHandle, otherwise it's hell to use.
+    pub selection: Option<Vec<ColumnSelector>>,
+}
+
+impl QueryExpression2 {
+    #[inline]
+    pub fn new(index: impl Into<Timeline>) -> Self {
+        let index = index.into();
+
+        Self {
+            view_contents: None,
+            filtered_index: index,
+            filtered_index_range: None,
+            filtered_index_values: None,
+            sampled_index_values: None,
+            filtered_point_of_view: None,
+            sparse_fill_strategy: SparseFillStrategy::None,
+            selection: None,
+        }
+    }
+}
+
 // ---
 
 impl ChunkStore {

diff --git a/crates/store/re_chunk_store/src/lib.rs b/crates/store/re_chunk_store/src/lib.rs
@@ -25,8 +25,9 @@ mod writes;
 
 pub use self::dataframe::{
     ColumnDescriptor, ColumnSelector, ComponentColumnDescriptor, ComponentColumnSelector,
-    ControlColumnDescriptor, ControlColumnSelector, JoinEncoding, LatestAtQueryExpression,
-    QueryExpression, RangeQueryExpression, TimeColumnDescriptor, TimeColumnSelector,
+    ControlColumnDescriptor, ControlColumnSelector, Index, IndexRange, IndexValue, JoinEncoding,
+    LatestAtQueryExpression, QueryExpression, QueryExpression2, RangeQueryExpression,
+    SparseFillStrategy, TimeColumnDescriptor, TimeColumnSelector,
 };
 pub use self::events::{ChunkStoreDiff, ChunkStoreDiffKind, ChunkStoreEvent};
 pub use self::gc::{GarbageCollectionOptions, GarbageCollectionTarget};

diff --git a/crates/store/re_log_types/src/path/entity_path_filter.rs b/crates/store/re_log_types/src/path/entity_path_filter.rs
@@ -88,6 +88,27 @@ pub struct EntityPathRule {
     pub include_subtree: bool,
 }
 
+impl From<EntityPath> for EntityPathRule {
+    #[inline]
+    fn from(entity_path: EntityPath) -> Self {
+        Self::exact(entity_path)
+    }
+}
+
+impl std::hash::Hash for EntityPathRule {
+    #[inline]
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        let Self {
+            raw_expression: _,
+            path,
+            include_subtree,
+        } = self;
+
+        std::hash::Hash::hash(path, state);
+        std::hash::Hash::hash(include_subtree, state);
+    }
+}
+
 impl std::fmt::Display for EntityPathRule {
     #[inline]
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {