Skip to content

Commit

Permalink
Dataframe API v2 rerun-io#1: API definitions (rerun-io#7559)
Browse files Browse the repository at this point in the history
The new public API definition and nothing else. Speak now.

* Part of rerun-io#7495 
* Requires rerun-io#7558

---------

Co-authored-by: Jeremy Leibs <[email protected]>
  • Loading branch information
teh-cmc and jleibs authored Oct 2, 2024
1 parent e5ae198 commit 3581ca4
Show file tree
Hide file tree
Showing 3 changed files with 223 additions and 3 deletions.
200 changes: 199 additions & 1 deletion crates/store/re_chunk_store/src/dataframe.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
//! All the APIs used specifically for `re_dataframe`.
use std::collections::BTreeSet;
use std::collections::{BTreeMap, BTreeSet};

use ahash::HashSet;
use arrow2::{
Expand Down Expand Up @@ -242,6 +242,8 @@ pub struct ComponentColumnDescriptor {
pub store_datatype: ArrowDatatype,

/// How the data will be joined into the resulting `RecordBatch`.
//
// TODO(cmc): remove with the old re_dataframe.
pub join_encoding: JoinEncoding,

/// Whether this column represents static data.
Expand Down Expand Up @@ -445,9 +447,13 @@ impl From<ComponentColumnSelector> for ColumnSelector {
/// Select a control column.
///
/// The column is identified solely by the name of its component.
/// The only control column currently supported is `rerun.components.RowId`.
//
// TODO(cmc): `RowId` shouldn't be a control column at this point, it should be yet another index.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct ControlColumnSelector {
/// Name of the control column.
//
// TODO(cmc): this should be `component_name`.
pub component: ComponentName,
}

Expand All @@ -470,6 +476,9 @@ impl From<ControlColumnDescriptor> for ControlColumnSelector {
}

/// Select a time column.
//
// TODO(cmc): This shouldn't be specific to time, this should be an `IndexColumnSelector` or smth.
// Particularly unfortunate that this one already leaks into the public API…
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct TimeColumnSelector {
/// The name of the timeline.
Expand Down Expand Up @@ -497,9 +506,13 @@ pub struct ComponentColumnSelector {
pub entity_path: EntityPath,

/// Semantic name associated with this data.
//
// TODO(cmc): this should be `component_name`.
pub component: ComponentName,

/// How to join the data into the `RecordBatch`.
//
// TODO(cmc): remove once old `re_dataframe` is gone.
pub join_encoding: JoinEncoding,
}

Expand Down Expand Up @@ -701,6 +714,191 @@ impl std::fmt::Display for RangeQueryExpression {
}
}

// --- Queries v2 ---

/// How null values in the returned dataframe get filled in, if at all.
#[derive(Default, Debug, Clone, PartialEq, Eq, Hash)]
pub enum SparseFillStrategy {
    /// No sparse filling: nulls stay nulls. This is the default strategy.
    #[default]
    None,

    /// Fill null values using latest-at semantics at global scope.
    ///
    /// The latest-at lookup is applied to the entire dataset rather than just the current view
    /// contents, so a filled-in value may come from outside the view!
    LatestAtGlobal,
    //
    // TODO(cmc): `LatestAtView`?
}

impl std::fmt::Display for SparseFillStrategy {
    /// Writes the short, human-readable label of the strategy.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Pick the label first so that there is a single write site.
        let label = match self {
            Self::None => "none",
            Self::LatestAtGlobal => "latest-at (global)",
        };

        f.write_str(label)
    }
}

/// The view contents specify which subset of the database (i.e., which columns) the query runs on,
/// expressed as a set of [`EntityPath`]s and their associated [`ComponentName`]s.
///
/// Concretely, this maps each [`EntityPath`] to an optional set of [`ComponentName`]s.
/// Setting an entity's components to `None` means: everything.
//
// TODO(cmc): we need to be able to build that easily in a command-line context, otherwise it's just
// very annoying. E.g. `--with /world/points:[rr.Position3D, rr.Radius] --with /cam:[rr.Pinhole]`.
pub type ViewContents = BTreeMap<EntityPath, Option<BTreeSet<ComponentName>>>;

/// A dataframe index; currently just an alias for [`Timeline`].
//
// TODO(cmc): Ultimately, this shouldn't be hardcoded to `Timeline`, but to a generic `I: Index`.
// `Index` in this case should also be implemented on tuples (`(I1, I2, ...)`).
pub type Index = Timeline;

/// A single value along an index; currently just an alias for [`TimeInt`].
//
// TODO(cmc): Ultimately, this shouldn't be hardcoded to `TimeInt`, but to a generic `I: Index`.
// `Index` in this case should also be implemented on tuples (`(I1, I2, ...)`).
pub type IndexValue = TimeInt;

/// A range of index values; currently just an alias for [`ResolvedTimeRange`].
//
// TODO(cmc): Ultimately, this shouldn't be hardcoded to `ResolvedTimeRange`, but to a generic `I: Index`.
// `Index` in this case should also be implemented on tuples (`(I1, I2, ...)`).
pub type IndexRange = ResolvedTimeRange;

/// Describes a complete query for Rerun's dataframe API.
///
/// ## Terminology: view vs. selection vs. filtering vs. sampling
///
/// * The view contents specify which subset of the database (i.e., which columns) the query runs on,
/// expressed as a set of [`EntityPath`]s and their associated [`ComponentName`]s.
///
/// * The filters filter out _rows_ of data from the view contents.
/// A filter cannot possibly introduce new rows, it can only remove existing ones from the view contents.
///
/// * The samplers sample _rows_ of data from the view contents at user-specified values.
/// Samplers don't necessarily return existing rows: they might introduce new ones if the sampled value
/// isn't present in the view contents in the first place.
///
/// * The selection applies last and samples _columns_ of data from the filtered/sampled view contents.
/// Selecting a column that isn't present in the view contents results in an empty column in the
/// final dataframe (null array).
///
/// A very rough mental model, in SQL terms:
/// ```text
/// SELECT <Self::selection> FROM <Self::view_contents> WHERE <Self::filtered_*>
/// ```
//
// TODO(cmc): ideally we'd like this to be the same type as the one used in the blueprint, possibly?
// TODO(cmc): Get rid of all re_dataframe (as opposed to re_dataframe2) stuff and rename this.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct QueryExpression2 {
/// The subset of the database that the query will run on: a set of [`EntityPath`]s and their
/// associated [`ComponentName`]s.
///
/// Defaults to `None`, which means: everything.
///
/// Example (pseudo-code):
/// ```text
/// view_contents = {
/// "world/points": [rr.Position3D, rr.Radius],
/// "metrics": [rr.Scalar]
/// }
/// ```
pub view_contents: Option<ViewContents>,

/// The index used to filter out _rows_ from the view contents.
///
/// Only rows where at least 1 column contains non-null data at that index will be kept in the
/// final dataset.
///
/// Example: `Timeline("frame")`.
//
// TODO(cmc): this has to be a selector otherwise this is a horrible UX.
pub filtered_index: Timeline,

/// The range of index values used to filter out _rows_ from the view contents.
///
/// Only rows where at least 1 of the view-contents contains non-null data within that range will be kept in
/// the final dataset.
///
/// This is ignored if [QueryExpression2::`sampled_index_values`] is set.
///
/// Example: `ResolvedTimeRange(10, 20)`.
pub filtered_index_range: Option<IndexRange>,

/// TODO(cmc): NOT IMPLEMENTED.
///
/// The specific index values used to filter out _rows_ from the view contents.
///
/// Only rows where at least 1 column contains non-null data at these specific values will be kept
/// in the final dataset.
///
/// This is ignored if [QueryExpression2::`sampled_index_values`] is set.
///
/// Example: `[TimeInt(12), TimeInt(14)]`.
pub filtered_index_values: Option<BTreeSet<IndexValue>>,

/// TODO(cmc): NOT IMPLEMENTED.
///
/// The specific index values used to sample _rows_ from the view contents.
///
/// The final dataset will contain one row per sampled index value, regardless of whether data
/// existed for that index value in the view contents.
///
/// The order of the samples will be respected in the final result.
///
/// If [QueryExpression2::`sampled_index_values`] is set, it overrides both [`QueryExpression2::filtered_index_range`]
/// and [`QueryExpression2::filtered_index_values`].
///
/// Example: `[TimeInt(12), TimeInt(14)]`.
TODO(jleibs): We need an alternative name for sampled.
pub sampled_index_values: Option<Vec<IndexValue>>,

/// TODO(cmc): NOT IMPLEMENTED.
///
/// The component column used to filter out _rows_ from the view contents.
///
/// Only rows where this column contains non-null data be kept in the final dataset.
///
/// Example: `ComponentColumnSelector("rerun.components.Position3D")`.
//
// TODO(cmc): multi-pov support
pub filtered_point_of_view: Option<ComponentColumnSelector>,

/// TODO(cmc): NOT IMPLEMENTED.
///
/// Specifies how null values should be filled in the returned dataframe.
///
/// Defaults to [`SparseFillStrategy::None`].
pub sparse_fill_strategy: SparseFillStrategy,

/// The specific _columns_ to sample from the final view contents.
///
/// The order of the samples will be respected in the final result.
///
/// Defaults to `None`, which means: everything.
///
/// Example: `[ColumnSelector(Time("log_time")), ColumnSelector(Component("rerun.components.Position3D"))]`.
//
// TODO(cmc): the selection has to be on the QueryHandle, otherwise it's hell to use.
pub selection: Option<Vec<ColumnSelector>>,
}

impl QueryExpression2 {
    /// Creates a query that filters on `index` and leaves everything else unset.
    ///
    /// Every optional field starts out as `None`, and the sparse fill strategy is
    /// [`SparseFillStrategy::None`].
    #[inline]
    pub fn new(index: impl Into<Timeline>) -> Self {
        Self {
            // The only piece of state that the caller has to provide.
            filtered_index: index.into(),

            // Everything below is opt-in.
            view_contents: None,
            filtered_index_range: None,
            filtered_index_values: None,
            sampled_index_values: None,
            filtered_point_of_view: None,
            sparse_fill_strategy: SparseFillStrategy::None,
            selection: None,
        }
    }
}

// ---

impl ChunkStore {
Expand Down
5 changes: 3 additions & 2 deletions crates/store/re_chunk_store/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,9 @@ mod writes;

pub use self::dataframe::{
ColumnDescriptor, ColumnSelector, ComponentColumnDescriptor, ComponentColumnSelector,
ControlColumnDescriptor, ControlColumnSelector, JoinEncoding, LatestAtQueryExpression,
QueryExpression, RangeQueryExpression, TimeColumnDescriptor, TimeColumnSelector,
ControlColumnDescriptor, ControlColumnSelector, Index, IndexRange, IndexValue, JoinEncoding,
LatestAtQueryExpression, QueryExpression, QueryExpression2, RangeQueryExpression,
SparseFillStrategy, TimeColumnDescriptor, TimeColumnSelector,
};
pub use self::events::{ChunkStoreDiff, ChunkStoreDiffKind, ChunkStoreEvent};
pub use self::gc::{GarbageCollectionOptions, GarbageCollectionTarget};
Expand Down
21 changes: 21 additions & 0 deletions crates/store/re_log_types/src/path/entity_path_filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,27 @@ pub struct EntityPathRule {
pub include_subtree: bool,
}

/// Converts an [`EntityPath`] into a rule by delegating to [`EntityPathRule::exact`].
//
// NOTE(review): presumably the resulting rule matches that path only and not its subtree;
// confirm against `EntityPathRule::exact`.
impl From<EntityPath> for EntityPathRule {
#[inline]
fn from(entity_path: EntityPath) -> Self {
Self::exact(entity_path)
}
}

impl std::hash::Hash for EntityPathRule {
    /// Feeds `path` and then `include_subtree` into `state`; `raw_expression` is left out.
    #[inline]
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        use std::hash::Hash;

        // Destructure exhaustively so that adding a field to the struct forces a decision here.
        let Self {
            raw_expression: _, // intentionally not part of the hash
            path,
            include_subtree,
        } = self;

        // NOTE(review): function-call form is kept on purpose, presumably so that this cannot
        // resolve to an inherent `hash` method on the field type -- confirm before changing.
        Hash::hash(path, state);
        Hash::hash(include_subtree, state);
    }
}

impl std::fmt::Display for EntityPathRule {
#[inline]
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
Expand Down

0 comments on commit 3581ca4

Please sign in to comment.