feat: start recording index details in the manifest, cache index type lookup (#3131)

This addresses a specific problem: when a dataset had a scalar index on
a string column, we would perform I/O during the planning phase on every
query that contained a filter. This added considerable latency
(especially against S3) to query times.

We now cache that lookup.

It also starts to tackle a more central problem. Right now our manifest
stores very little information about indices (pretty much just the
UUID); any further information must be obtained by loading the index.
This PR introduces the concept of "index details": a spot where an index
can put index-specific (e.g. specific to btree or specific to bitmap)
information that can be accessed during planning by just looking at the
manifest. At the moment this concept is still fairly bare bones but I
think, as scalar indices become more sophisticated, this information can
be useful.

If we decide we don't want it, I can pull it out and dial this PR back
to just the caching component.
westonpace authored Nov 16, 2024
1 parent aa49421 commit a212395
Showing 20 changed files with 430 additions and 37 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

22 changes: 22 additions & 0 deletions protos/table.proto
@@ -5,6 +5,7 @@ syntax = "proto3";

package lance.table;

import "google/protobuf/any.proto";
import "google/protobuf/timestamp.proto";
import "file.proto";

@@ -187,6 +188,12 @@ message IndexMetadata {
///
/// The bitmap is stored as a 32-bit Roaring bitmap.
bytes fragment_bitmap = 5;

/// Details, specific to the index type, which are needed to load / interpret the index
///
/// Indices should avoid putting large amounts of information in this field, as it will
/// bloat the manifest.
google.protobuf.Any index_details = 6;
}

// Index Section, containing a list of index metadata for one dataset version.
@@ -340,3 +347,18 @@ message ExternalFile {
// The size of the data in the file.
uint64 size = 3;
}

/// The following messages are used for the index_details field in IndexMetadata.
///
/// This is not an exhaustive set of index types and just lists the index types supported
/// by a base distribution of Lance.

// Currently these are all empty messages because all needed details are either hard-coded (e.g.
// filenames) or stored in the index itself. However, we may want to add more details in the
// future, in particular we can add details that may be useful for planning queries (e.g. don't
// force us to load the index until we know we need it)
message BTreeIndexDetails {}
message BitmapIndexDetails {}
message LabelListIndexDetails {}
message InvertedIndexDetails {}
message VectorIndexDetails {}
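
These detail messages travel through the manifest as `google.protobuf.Any`. A minimal sketch of the round trip, assuming `pb` is the Rust module prost generates from this file (`Any::from_msg`/`to_msg` are `prost-types` APIs that rely on the `prost::Name` impls produced by the `enable_type_names()` call added in build.rs below):

```rust
use prost_types::Any;

// Hypothetical use of the `pb` module generated from table.proto.
fn roundtrip_btree_details() -> Result<(), prost::DecodeError> {
    // Pack the (currently empty) details message when the index is created...
    let details: Any = Any::from_msg(&pb::BTreeIndexDetails {}).expect("encode");
    // ...and unpack it during planning, straight from the manifest. The type
    // URL alone already identifies the index type, with no index I/O.
    let _decoded: pb::BTreeIndexDetails = details.to_msg()?;
    Ok(())
}
```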
1 change: 1 addition & 0 deletions python/Cargo.lock

3 changes: 3 additions & 0 deletions python/src/dataset.rs
@@ -353,6 +353,9 @@ impl Operation {
fields,
dataset_version,
fragment_bitmap: Some(fragment_ids.into_iter().collect()),
// TODO: we should use lance::dataset::Dataset::commit_existing_index once
// we have a way to determine index details from an existing index.
index_details: None,
}];
let op = LanceOperation::CreateIndex {
new_indices,
@@ -111,12 +111,15 @@ fn benchmark(file_path: &str) {
}

// Print tsv headers
-println!("for file: {}", file_path);
-println!("Compression ratio\tCompression speed\tDecompression speed");
-println!(
-    "{:.3}\t\t\t\t{:.2}MB/s\t\t\t{:.2}MB/s",
-    compression_ratio, com_speed, d_speed
-);
+#[allow(clippy::print_stdout)]
+{
+    println!("for file: {}", file_path);
+    println!("Compression ratio\tCompression speed\tDecompression speed");
+    println!(
+        "{:.3}\t\t\t\t{:.2}MB/s\t\t\t{:.2}MB/s",
+        compression_ratio, com_speed, d_speed
+    );
+}
for i in 0..TEST_NUM {
assert_eq!(inputs[i].value_data(), decompression_out_bufs[i]);
assert_eq!(inputs[i].value_offsets(), decompression_out_offsets_bufs[i]);
2 changes: 1 addition & 1 deletion rust/lance-index/src/scalar.rs
@@ -35,7 +35,7 @@ pub mod lance_format;

pub const LANCE_SCALAR_INDEX: &str = "__lance_scalar_index";

-#[derive(Debug)]
+#[derive(Debug, Copy, Clone)]
pub enum ScalarIndexType {
BTree,
Bitmap,
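
The new `Copy`/`Clone` derives are what let a cached index type be handed back by value. A minimal sketch of the caching idea (a hypothetical `IndexTypeCache`, not Lance's actual cache implementation):

```rust
use std::collections::HashMap;
use std::sync::RwLock;

// Hypothetical planner-side cache keyed by column name. Resolving the type
// previously required an object-store read on every filtered query; after
// the first resolution it becomes an in-memory map hit.
#[derive(Default)]
struct IndexTypeCache {
    types: RwLock<HashMap<String, ScalarIndexType>>,
}

impl IndexTypeCache {
    fn get(&self, column: &str) -> Option<ScalarIndexType> {
        // `.copied()` is exactly what the new `Copy` derive enables.
        self.types.read().unwrap().get(column).copied()
    }

    fn insert(&self, column: String, index_type: ScalarIndexType) {
        self.types.write().unwrap().insert(column, index_type);
    }
}
```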
1 change: 1 addition & 0 deletions rust/lance-table/build.rs
@@ -9,6 +9,7 @@ fn main() -> Result<()> {
let mut prost_build = prost_build::Config::new();
prost_build.extern_path(".lance.file", "::lance_file::format::pb");
prost_build.protoc_arg("--experimental_allow_proto3_optional");
prost_build.enable_type_names();
prost_build.compile_protos(
&[
"./protos/table.proto",
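
`enable_type_names()` makes prost emit `prost::Name` impls for each generated message, which is what allows `prost_types::Any` to derive and recognize type URLs for the new details messages. A small illustration, again assuming the generated `pb` module:

```rust
use prost::Name;

// With enable_type_names(), each message knows its fully qualified protobuf
// name, so it can be packed into (and recognized inside) a google.protobuf.Any.
assert_eq!(pb::BTreeIndexDetails::full_name(), "lance.table.BTreeIndexDetails");
let any = prost_types::Any::from_msg(&pb::BTreeIndexDetails {}).unwrap();
assert!(any.type_url.ends_with("lance.table.BTreeIndexDetails"));
```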
8 changes: 8 additions & 0 deletions rust/lance-table/src/format/index.rs
@@ -30,6 +30,12 @@ pub struct Index {
///
/// If this is None, then this is unknown.
pub fragment_bitmap: Option<RoaringBitmap>,

/// Metadata specific to the index type
///
/// This is an Option because older versions of Lance may not have this defined. However, it should always
/// be present in newer versions.
pub index_details: Option<prost_types::Any>,
}

impl DeepSizeOf for Index {
@@ -69,6 +75,7 @@ impl TryFrom<pb::IndexMetadata> for Index {
fields: proto.fields,
dataset_version: proto.dataset_version,
fragment_bitmap,
index_details: proto.index_details,
})
}
}
@@ -91,6 +98,7 @@ impl From<&Index> for pb::IndexMetadata {
fields: idx.fields.clone(),
dataset_version: idx.dataset_version,
fragment_bitmap,
index_details: idx.index_details.clone(),
}
}
}
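
With `index_details` in the manifest, planning can in principle resolve a scalar index's type without opening the index at all. A hedged sketch of that lookup (the helper and the URL matching are illustrative, not Lance's actual code):

```rust
use lance_index::scalar::ScalarIndexType;
use prost_types::Any;

// Hypothetical helper: map the stored Any's type URL to a ScalarIndexType
// with no I/O. Older manifests have index_details == None, so callers still
// need a fallback that loads the index to discover its type.
fn index_type_from_details(details: &Any) -> Option<ScalarIndexType> {
    if details.type_url.ends_with("lance.table.BTreeIndexDetails") {
        Some(ScalarIndexType::BTree)
    } else if details.type_url.ends_with("lance.table.BitmapIndexDetails") {
        Some(ScalarIndexType::Bitmap)
    } else {
        None
    }
}
```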
1 change: 1 addition & 0 deletions rust/lance/Cargo.toml
@@ -48,6 +48,7 @@ object_store = { workspace = true, features = ["aws", "gcp", "azure"] }
aws-credential-types.workspace = true
pin-project.workspace = true
prost.workspace = true
prost-types.workspace = true
roaring.workspace = true
tokio.workspace = true
url.workspace = true
2 changes: 1 addition & 1 deletion rust/lance/examples/full_text_search.rs
@@ -4,7 +4,7 @@
//! Benchmark of HNSW graph.
//!
//!
#![allow(clippy::print_stdout)]
use std::collections::HashSet;
use std::sync::Arc;

2 changes: 1 addition & 1 deletion rust/lance/examples/hnsw.rs
@@ -4,7 +4,7 @@
//! Run recall benchmarks for HNSW.
//!
//! run with `cargo run --release --example hnsw`
#![allow(clippy::print_stdout)]
use std::collections::HashSet;
use std::sync::Arc;

2 changes: 1 addition & 1 deletion rust/lance/examples/ivf_hnsw.rs
@@ -4,7 +4,7 @@
//! Run recall benchmarks for HNSW.
//!
//! run with `cargo run --release --example hnsw`
#![allow(clippy::print_stdout)]
use arrow::array::AsArray;
use arrow_array::types::Float32Type;
use clap::Parser;
2 changes: 1 addition & 1 deletion rust/lance/examples/write_read_ds.rs
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
#![allow(clippy::print_stdout)]

use arrow::array::UInt32Array;
use arrow::datatypes::{DataType, Field, Schema};
@@ -8,7 +9,6 @@ use futures::StreamExt;
use lance::dataset::{WriteMode, WriteParams};
use lance::Dataset;
use std::sync::Arc;

// Writes sample dataset to the given path
async fn write_dataset(data_path: &str) {
// Define new schema