Skip to content

Commit

Permalink
rerun_py.dataframe: add support for .filter_index_values (rerun-io#7670)
Browse files Browse the repository at this point in the history
### What
- Add a new type helper for IndexValues (in general some variation of
this will be useful for other APIs)
- Added a dependency on `numpy` package for ArrayLike functionality
  - `numpy` has an old dep on ndarray.
- This has already been fixed but not yet released:
PyO3/rust-numpy#439
- ChunkedArrays are sadly more complicated (see the note in:
https://docs.rs/arrow/latest/arrow/pyarrow/index.html)


### Checklist
* [x] I have read and agree to [Contributor
Guide](https://github.com/rerun-io/rerun/blob/main/CONTRIBUTING.md) and
the [Code of
Conduct](https://github.com/rerun-io/rerun/blob/main/CODE_OF_CONDUCT.md)
* [x] I've included a screenshot or gif (if applicable)
* [x] I have tested the web demo (if applicable):
* Using examples from latest `main` build:
[rerun.io/viewer](https://rerun.io/viewer/pr/7670?manifest_url=https://app.rerun.io/version/main/examples_manifest.json)
* Using full set of examples from `nightly` build:
[rerun.io/viewer](https://rerun.io/viewer/pr/7670?manifest_url=https://app.rerun.io/version/nightly/examples_manifest.json)
* [x] The PR title and labels are set such as to maximize their
usefulness for the next release's CHANGELOG
* [x] If applicable, add a new check to the [release
checklist](https://github.com/rerun-io/rerun/blob/main/tests/python/release_checklist)!
* [x] I have noted any breaking changes to the log API in
`CHANGELOG.md` and the migration guide

- [PR Build Summary](https://build.rerun.io/pr/7670)
- [Recent benchmark results](https://build.rerun.io/graphs/crates.html)
- [Wasm size tracking](https://build.rerun.io/graphs/sizes.html)

To run all checks from `main`, comment on the PR with `@rerun-bot
full-check`.
  • Loading branch information
jleibs authored Oct 10, 2024
1 parent 8df29e2 commit 370d992
Show file tree
Hide file tree
Showing 8 changed files with 280 additions and 45 deletions.
47 changes: 38 additions & 9 deletions Cargo.lock
Original file line number Diff line number Diff line change
Expand Up @@ -3588,6 +3588,19 @@ version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "308d96db8debc727c3fd9744aac51751243420e46edf401010908da7f8d5e57c"

[[package]]
name = "ndarray"
version = "0.15.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32"
dependencies = [
"matrixmultiply",
"num-complex",
"num-integer",
"num-traits",
"rawpointer",
]

[[package]]
name = "ndarray"
version = "0.16.1"
Expand All @@ -3609,7 +3622,7 @@ version = "0.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f093b3db6fd194718dcdeea6bd8c829417deae904e3fcc7732dabcd4416d25d8"
dependencies = [
"ndarray",
"ndarray 0.16.1",
"rand",
"rand_distr",
]
Expand Down Expand Up @@ -3846,6 +3859,21 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"

[[package]]
name = "numpy"
version = "0.21.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec170733ca37175f5d75a5bea5911d6ff45d2cd52849ce98b685394e4f2f37f4"
dependencies = [
"libc",
"ndarray 0.15.6",
"num-complex",
"num-integer",
"num-traits",
"pyo3",
"rustc-hash",
]

[[package]]
name = "objc"
version = "0.2.7"
Expand Down Expand Up @@ -5457,7 +5485,7 @@ dependencies = [
"document-features",
"itertools 0.13.0",
"libc",
"ndarray",
"ndarray 0.16.1",
"ndarray-rand",
"once_cell",
"parking_lot",
Expand Down Expand Up @@ -5647,7 +5675,7 @@ dependencies = [
"bytemuck",
"egui",
"half 2.3.1",
"ndarray",
"ndarray 0.16.1",
"re_chunk_store",
"re_data_ui",
"re_log_types",
Expand Down Expand Up @@ -5798,7 +5826,7 @@ dependencies = [
"linked-hash-map",
"mime_guess2",
"mint",
"ndarray",
"ndarray 0.16.1",
"nohash-hasher",
"once_cell",
"ply-rs",
Expand Down Expand Up @@ -6015,7 +6043,7 @@ dependencies = [
"indexmap 2.1.0",
"itertools 0.13.0",
"linked-hash-map",
"ndarray",
"ndarray 0.16.1",
"nohash-hasher",
"once_cell",
"parking_lot",
Expand Down Expand Up @@ -6289,6 +6317,7 @@ dependencies = [
"infer",
"itertools 0.13.0",
"mimalloc",
"numpy",
"once_cell",
"parking_lot",
"pyo3",
Expand Down Expand Up @@ -6461,7 +6490,7 @@ dependencies = [
"clap",
"half 2.3.1",
"image",
"ndarray",
"ndarray 0.16.1",
"re_log",
"rerun",
]
Expand Down Expand Up @@ -6533,7 +6562,7 @@ version = "0.19.0-alpha.1+dev"
dependencies = [
"anyhow",
"clap",
"ndarray",
"ndarray 0.16.1",
"re_log",
"rerun",
]
Expand Down Expand Up @@ -7016,7 +7045,7 @@ name = "snippets"
version = "0.19.0-alpha.1+dev"
dependencies = [
"itertools 0.13.0",
"ndarray",
"ndarray 0.16.1",
"rand",
"rand_distr",
"re_build_tools",
Expand Down Expand Up @@ -7188,7 +7217,7 @@ dependencies = [
"clap",
"glam",
"itertools 0.13.0",
"ndarray",
"ndarray 0.16.1",
"ndarray-rand",
"rand",
"re_log",
Expand Down
5 changes: 4 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ memory-stats = "1.1"
# This version is not pinned to avoid creating version requirement conflicts,
# but other packages pin it to exactly "=0.1.37"
mimalloc = "0.1.37"
mime_guess2 = "2.0" # infer MIME type by file extension, and map mime to file extension
mime_guess2 = "2.0" # infer MIME type by file extension, and map mime to file extension
mint = "0.5.9"
re_mp4 = "0.1.0"
natord = "1.0.9"
Expand All @@ -206,6 +206,9 @@ nohash-hasher = "0.2"
notify = { version = "6.1.1", features = ["macos_kqueue"] }
num-derive = "0.4"
num-traits = "0.2"
# TODO(#7676) This pulls in an older ndarray. Remove it from the skip list in `deny.toml` and
# close the issue when updating to 0.22.
numpy = "0.21"
once_cell = "1.17" # No lazy_static - use `std::sync::OnceLock` or `once_cell` instead
ordered-float = "4.2"
parking_lot = "0.12"
Expand Down
1 change: 1 addition & 0 deletions deny.toml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ skip = [
{ name = "hashbrown" }, # Old version used by polar-rs
{ name = "libloading" }, # Old version used by ash (vulkan binding), newer version used by khronos-egl
{ name = "memoffset" }, # Small crate
{ name = "ndarray" }, # Needed by `numpy<0.22` in `rerun_py`
{ name = "prettyplease" }, # Old version being used by prost
{ name = "pulldown-cmark" }, # Build-dependency via `ply-rs` (!). TODO(emilk): use a better crate for .ply parsing
{ name = "raw-window-handle" }, # Pretty small crate; some crates still on old version
Expand Down
1 change: 1 addition & 0 deletions rerun_py/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ infer.workspace = true
# TODO(#5875): `mimalloc` starts leaking OS pages starting with `0.1.38`.
# When the bug is fixed, change this back to `mimalloc = { workspace = true, …`.
mimalloc = { version = "=0.1.37", features = ["local_dynamic_tls"] }
numpy.workspace = true
once_cell.workspace = true
parking_lot.workspace = true
pyo3 = { workspace = true, features = ["abi3-py38"] }
Expand Down
12 changes: 11 additions & 1 deletion rerun_py/rerun_bindings/rerun_bindings.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ from typing import Optional, Sequence

import pyarrow as pa

from .types import AnyColumn, ComponentLike, ViewContentsLike
from .types import AnyColumn, ComponentLike, IndexValuesLike, ViewContentsLike

class IndexColumnDescriptor:
"""A column containing the index values for when the component data was updated."""
Expand Down Expand Up @@ -57,6 +57,16 @@ class RecordingView:
"""Filter the view to only include data between the given index time values."""
...

def filter_index_values(self, values: IndexValuesLike) -> RecordingView:
"""
Filter the view to only include data at the given index values.
This requires index values to be a precise match. Index values in Rerun are
represented as i64 sequence counts or nanoseconds. This API does not expose an interface
in floating point seconds, as the numerical conversion would risk false mismatches.
"""
...

def select(self, *args: AnyColumn, columns: Optional[Sequence[AnyColumn]] = None) -> pa.RecordBatchReader: ...

class Recording:
Expand Down
14 changes: 10 additions & 4 deletions rerun_py/rerun_bindings/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,27 @@

from typing import TYPE_CHECKING, Sequence, TypeAlias, Union

import numpy as np
import numpy.typing as npt
import pyarrow as pa

if TYPE_CHECKING:
from rerun._baseclasses import ComponentMixin

from .rerun_bindings import (
ComponentColumnDescriptor as ComponentColumnDescriptor,
ComponentColumnSelector as ComponentColumnSelector,
TimeColumnDescriptor as TimeColumnDescriptor,
TimeColumnSelector as TimeColumnSelector,
IndexColumnSelector as IndexColumnDescriptor,
IndexColumnSelector as IndexColumnSelector,
)

ComponentLike: TypeAlias = Union[str, type["ComponentMixin"]]

AnyColumn: TypeAlias = Union[
"TimeColumnDescriptor",
"ComponentColumnDescriptor",
"TimeColumnSelector",
"ComponentColumnSelector",
"IndexColumnDescriptor",
"IndexColumnSelector",
]

AnyComponentColumn: TypeAlias = Union[
Expand All @@ -30,3 +34,5 @@
str,
dict[str, Union[AnyColumn, Sequence[ComponentLike]]],
]

IndexValuesLike: TypeAlias = Union[npt.NDArray[np.int_], pa.Int64Array]
117 changes: 116 additions & 1 deletion rerun_py/src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@
use std::collections::{BTreeMap, BTreeSet};

use arrow::{
array::{RecordBatchIterator, RecordBatchReader},
array::{make_array, Array, ArrayData, Int64Array, RecordBatchIterator, RecordBatchReader},
pyarrow::PyArrowType,
};
use numpy::PyArrayMethods as _;
use pyo3::{
exceptions::{PyRuntimeError, PyTypeError, PyValueError},
prelude::*,
Expand Down Expand Up @@ -195,6 +196,108 @@ impl AnyComponentColumn {
}
}

#[derive(FromPyObject)]
enum IndexValuesLike<'py> {
PyArrow(PyArrowType<ArrayData>),
NumPy(numpy::PyArrayLike1<'py, i64>),

// Catch all to support ChunkedArray and other types
#[pyo3(transparent)]
CatchAll(Bound<'py, PyAny>),
}

impl<'py> IndexValuesLike<'py> {
fn to_index_values(&self) -> PyResult<BTreeSet<re_chunk_store::TimeInt>> {
match self {
Self::PyArrow(array) => {
let array = make_array(array.0.clone());

let int_array = array.as_any().downcast_ref::<Int64Array>().ok_or_else(|| {
PyTypeError::new_err("pyarrow.Array for IndexValuesLike must be of type int64.")
})?;

let values: BTreeSet<re_chunk_store::TimeInt> = int_array
.iter()
.map(|v| {
v.map_or_else(
|| re_chunk_store::TimeInt::STATIC,
// The use of temporal here should be fine even if the data is
// not actually temporal. The important thing is we are converting
// from an i64 input
re_chunk_store::TimeInt::new_temporal,
)
})
.collect();

if values.len() != int_array.len() {
return Err(PyValueError::new_err("Index values must be unique."));
}

Ok(values)
}
Self::NumPy(array) => {
let values: BTreeSet<re_chunk_store::TimeInt> = array
.readonly()
.as_array()
.iter()
// The use of temporal here should be fine even if the data is
// not actually temporal. The important thing is we are converting
// from an i64 input
.map(|v| re_chunk_store::TimeInt::new_temporal(*v))
.collect();

if values.len() != array.len()? {
return Err(PyValueError::new_err("Index values must be unique."));
}

Ok(values)
}
Self::CatchAll(any) => {
// If any has the `.chunks` attribute, we can try to try each chunk as pyarrow array
if let Ok(chunks) = any.getattr("chunks") {
let mut values = BTreeSet::new();
for chunk in chunks.iter()? {
let chunk = chunk?.extract::<PyArrowType<ArrayData>>()?;
let array = make_array(chunk.0.clone());

let int_array =
array.as_any().downcast_ref::<Int64Array>().ok_or_else(|| {
PyTypeError::new_err(
"pyarrow.Array for IndexValuesLike must be of type int64.",
)
})?;

values.extend(
int_array
.iter()
.map(|v| {
v.map_or_else(
|| re_chunk_store::TimeInt::STATIC,
// The use of temporal here should be fine even if the data is
// not actually temporal. The important thing is we are converting
// from an i64 input
re_chunk_store::TimeInt::new_temporal,
)
})
.collect::<BTreeSet<_>>(),
);
}

if values.len() != any.len()? {
return Err(PyValueError::new_err("Index values must be unique."));
}

Ok(values)
} else {
Err(PyTypeError::new_err(
"IndexValuesLike must be a pyarrow.Array, pyarrow.ChunkedArray, or numpy.ndarray",
))
}
}
}
}
}

struct ComponentLike(re_sdk::ComponentName);

impl FromPyObject<'_> for ComponentLike {
Expand Down Expand Up @@ -438,6 +541,18 @@ impl PyRecordingView {
query_expression,
})
}

fn filter_index_values(&self, values: IndexValuesLike<'_>) -> PyResult<Self> {
let values = values.to_index_values()?;

let mut query_expression = self.query_expression.clone();
query_expression.filtered_index_values = Some(values);

Ok(Self {
recording: self.recording.clone(),
query_expression,
})
}
}

impl PyRecording {
Expand Down
Loading

0 comments on commit 370d992

Please sign in to comment.