-
Notifications
You must be signed in to change notification settings - Fork 240
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Benchmarks report to https://bencher.dev/console/projects/weston-lancedb/plots. At some point it may be nice to use these for regression detection in PRs; however, we need to establish a stable baseline first. These benchmarks rely on a private runner hosted by LanceDB and on some private datasets, and they run against GCS. It would be good to add NVMe, Azure, and S3 benchmarks at some point.
- Loading branch information
1 parent
196ec06
commit e6c2343
Showing
9 changed files
with
259 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
name: Run Regression Benchmarks

# Benchmarks only run for commits that land on main; results are uploaded to
# the weston-lancedb project on bencher.dev.
on:
  push:
    branches:
      - main

jobs:
  bench_regress:
    timeout-minutes: 30
    # Private benchmark runner (see the commit description); datasets live on GCS.
    runs-on: warp-custom-gcp-storage-benchmark
    env:
      # Need up-to-date compilers for kernels
      CC: clang-18
      CXX: clang-18
    defaults:
      run:
        shell: bash
        working-directory: python
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
          lfs: true
      - name: Authenticate with GCS
        uses: "google-github-actions/auth@v2"
        with:
          credentials_json: "${{ secrets.GCLOUD_BENCH_STORAGE_USER_KEY }}"
      - name: Install bencher
        uses: bencherdev/bencher@main
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Quoted so YAML never parses the version as a float.
          python-version: "3.11" # Ray does not support 3.12 yet.
      - uses: Swatinem/rust-cache@v2
        with:
          workspaces: python
      - name: Install dependencies
        run: |
          sudo apt update
          sudo apt install -y protobuf-compiler libssl-dev
      - name: Build
        run: |
          python -m venv venv
          source venv/bin/activate
          pip install maturin duckdb requests pytest pytest-benchmark
          maturin develop --locked --release
      - name: Generate datasets
        run: |
          # Reuse the virtualenv created in the Build step.
          source venv/bin/activate
          python python/ci_benchmarks/datagen/gen_all.py
      - name: Run benchmarks
        env:
          # Hand the token to bencher through its environment variable rather
          # than as a command-line argument.
          BENCHER_API_TOKEN: ${{ secrets.LANCE_BENCHER_TOKEN }}
        run: |
          source venv/bin/activate
          bencher run --project weston-lancedb --adapter python_pytest \
            --branch main --testbed google-genoa --err --file results.json \
            "python -mpytest --benchmark-json results.json python/ci_benchmarks"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# SPDX-License-Identifier: Apache-2.0 | ||
# SPDX-FileCopyrightText: Copyright The Lance Authors |
24 changes: 24 additions & 0 deletions
24
python/python/ci_benchmarks/benchmarks/test_random_access.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# SPDX-License-Identifier: Apache-2.0 | ||
# SPDX-FileCopyrightText: Copyright The Lance Authors | ||
|
||
import random | ||
|
||
import lance | ||
import pytest | ||
from ci_benchmarks.datasets import get_dataset_uri | ||
|
||
DATASETS = ["tpch"] | ||
|
||
|
||
@pytest.mark.parametrize("dataset", DATASETS)
def test_random_access(benchmark, dataset):
    """Benchmark ``take`` on a small set of randomly chosen row indices.

    The dataset is opened and the indices are drawn once, outside the timed
    callable; only the ``ds.take`` call is measured, over 5 rounds.
    """
    NUM_INDICES = 10
    dataset_uri = get_dataset_uri(dataset)

    ds = lance.dataset(dataset_uri)
    # Count the rows once instead of once per generated index.
    num_rows = ds.count_rows()
    # Valid row indices are 0 .. num_rows - 1.  ``randrange`` excludes its
    # upper bound, whereas ``randint(0, num_rows)`` could return num_rows
    # itself, which is one past the last row.
    random_indices = [random.randrange(num_rows) for _ in range(NUM_INDICES)]

    def bench(random_indices):
        ds.take(random_indices)

    benchmark.pedantic(bench, args=(random_indices,), rounds=5)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
# SPDX-License-Identifier: Apache-2.0 | ||
# SPDX-FileCopyrightText: Copyright The Lance Authors | ||
|
||
import lance | ||
import pytest | ||
from ci_benchmarks.datasets import get_dataset_uri | ||
|
||
DATASETS = ["tpch"] | ||
|
||
|
||
@pytest.mark.parametrize("dataset", DATASETS)
def test_full_scan(benchmark, dataset):
    """Benchmark reading an entire dataset into a single table.

    The dataset is opened inside the timed callable, so the open cost is part
    of the measurement.  Runs exactly once (1 round, 1 iteration).
    """
    uri = get_dataset_uri(dataset)

    def open_and_read_all():
        opened = lance.dataset(uri)
        opened.to_table()

    benchmark.pedantic(open_and_read_all, iterations=1, rounds=1)
|
||
|
||
@pytest.mark.parametrize("dataset", DATASETS)
def test_scan_slice(benchmark, dataset):
    """Benchmark reading a 50-row slice that starts 100 rows before the end.

    The row count is looked up ahead of time, outside the timed callable; the
    callable itself re-opens the dataset and reads the slice.  Runs exactly
    once (1 round, 1 iteration).
    """
    uri = get_dataset_uri(dataset)

    # Opened up front only to learn how many rows there are.
    probe = lance.dataset(uri)
    total_rows = probe.count_rows()

    def open_and_read_slice():
        opened = lance.dataset(uri)
        opened.to_table(limit=50, offset=total_rows - 100)

    benchmark.pedantic(open_and_read_slice, iterations=1, rounds=1)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
# SPDX-License-Identifier: Apache-2.0 | ||
# SPDX-FileCopyrightText: Copyright The Lance Authors | ||
|
||
import lance | ||
import pytest | ||
from ci_benchmarks.datasets import get_dataset_uri | ||
|
||
COLUMN_LABELS = ["bools", "normals"] | ||
COLUMNS = [["bools"], ["normals"]] | ||
FILTERS = [None, "bools IS TRUE"] | ||
|
||
|
||
@pytest.mark.parametrize("columns", COLUMNS, ids=COLUMN_LABELS)
@pytest.mark.parametrize("filt", FILTERS)
def test_eda_search(benchmark, columns, filt):
    """Benchmark a column scan of the ``image_eda`` dataset.

    Parametrized over the projected ``columns`` and an optional SQL filter
    ``filt`` (``None`` means no filter).  Scan options derived from them:

    * ``batch_size`` is 32 when scanning ``image_data``, otherwise unset.
    * ``limit`` is unset for unfiltered scans and 100000 for filtered ones.
    * ``fragment_readahead`` is 4 for unfiltered scans of ``image_data`` or
      ``strings``, otherwise unset.

    The dataset is opened inside the timed callable; runs exactly once.
    """
    dataset_uri = get_dataset_uri("image_eda")

    # Must test the ``filt`` parameter.  The builtin ``filter`` is never None,
    # so checking it would always apply the limit and never enable readahead.
    unfiltered = filt is None

    batch_size = 32 if columns == ["image_data"] else None
    limit = None if unfiltered else 100000
    frag_readahead = (
        4 if columns in (["image_data"], ["strings"]) and unfiltered else None
    )

    def bench():
        ds = lance.dataset(dataset_uri)
        ds.to_table(
            columns=columns,
            filter=filt,
            batch_size=batch_size,
            fragment_readahead=frag_readahead,
            limit=limit,
        )

    benchmark.pedantic(bench, rounds=1, iterations=1)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# SPDX-License-Identifier: Apache-2.0 | ||
# SPDX-FileCopyrightText: Copyright The Lance Authors |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
# SPDX-License-Identifier: Apache-2.0 | ||
# SPDX-FileCopyrightText: Copyright The Lance Authors | ||
|
||
import logging | ||
|
||
from ci_benchmarks.datagen.lineitems import gen_tcph | ||
|
||
if __name__ == "__main__":
    # Script entry point: configure INFO-level logging, then run the TPC-H
    # dataset generator.
    logging.basicConfig(level=logging.INFO)
    gen_tcph()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
# SPDX-License-Identifier: Apache-2.0 | ||
# SPDX-FileCopyrightText: Copyright The Lance Authors | ||
|
||
# Creates a dataset containing the TPC-H lineitem table, generated with DuckDB's tpch extension (dbgen) | ||
import logging | ||
|
||
import duckdb | ||
import lance | ||
|
||
from ci_benchmarks.datasets import get_dataset_uri | ||
|
||
NUM_ROWS = 59986052 | ||
|
||
|
||
def _gen_data():
    """Build the TPC-H ``lineitem`` table in DuckDB and return it as Arrow.

    Uses an in-memory DuckDB database with the ``tpch`` extension and runs
    ``dbgen`` at scale factor 10.
    """
    logging.info("Using DuckDB to generate TPC-H dataset")
    conn = duckdb.connect(database=":memory:")
    # Load the extension first, then generate the tables.
    for statement in ("INSTALL tpch; LOAD tpch", "CALL dbgen(sf=10)"):
        conn.execute(statement)
    return conn.query("SELECT * FROM lineitem").to_arrow_table()
|
||
|
||
def _create(dataset_uri: str):
    """Ensure the TPC-H dataset exists at ``dataset_uri``, generating it if needed.

    * Nothing can be opened at the URI: generate the data and create the dataset.
    * A dataset with exactly ``NUM_ROWS`` rows exists: nothing to do.
    * An empty dataset exists: generate the data and append it.
    * Any other row count: refuse to touch it.

    Raises:
        Exception: if a dataset with an unexpected row count already exists
            at ``dataset_uri``.
    """
    try:
        ds = lance.dataset(dataset_uri)
    except ValueError:
        # NOTE(review): relies on lance.dataset raising ValueError when no
        # dataset exists at the URI, as the original code did - confirm.
        # Only the open is guarded, so a ValueError from counting or
        # appending below is not mistaken for "dataset missing".
        lance.write_dataset(
            _gen_data(), dataset_uri, mode="create", use_legacy_format=False
        )
        return

    # Count once; the result drives every branch below.
    num_rows = ds.count_rows()
    logging.info("Existing dataset at %s has %d rows", dataset_uri, num_rows)

    if num_rows == NUM_ROWS:
        return
    if num_rows == 0:
        lance.write_dataset(
            _gen_data(), dataset_uri, mode="append", use_legacy_format=False
        )
        return
    raise Exception(
        "Cannot generate TPC-H dataset because a dataset with the URI "
        f"{dataset_uri} already exists and doesn't appear to be the "
        "same dataset"
    )
|
||
|
||
def gen_tcph():
    """Create the ``tpch`` dataset at its URI for the current environment."""
    _create(get_dataset_uri("tpch"))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
# SPDX-License-Identifier: Apache-2.0 | ||
# SPDX-FileCopyrightText: Copyright The Lance Authors | ||
|
||
import logging | ||
from functools import cache | ||
from pathlib import Path | ||
|
||
import requests | ||
|
||
|
||
def _is_on_google() -> bool:
    """Return True if the Google Cloud metadata server answers this host.

    Sends a GET to ``http://metadata.google.internal`` with a 5 second
    timeout and checks that the response carries ``Metadata-Flavor: Google``.
    Any ``requests`` failure, or a response without that header value, is
    treated as "not on Google Cloud".
    """
    logging.info("Testing if running on Google Cloud")
    try:
        rsp = requests.get("http://metadata.google.internal", timeout=5)
    except requests.exceptions.RequestException as ex:
        logging.info("Failed to connect to metadata server: %s", ex)
        return False

    # Use .get(): some other host may answer for this name without sending the
    # header, and indexing would raise a KeyError that nothing here catches.
    flavor = rsp.headers.get("Metadata-Flavor")
    logging.info("Metadata-Flavor: %s", flavor)
    return flavor == "Google"
|
||
|
||
@cache
def _get_base_uri() -> str:
    """Return the base URI (with trailing slash) under which datasets are stored.

    On Google Cloud this is the shared GCS bucket.  Otherwise it is a
    ``lance-benchmarks-ci-datasets`` directory in the user's home, created if
    missing.  The result is cached, so the environment is probed only once.
    """
    if not _is_on_google():
        local_dir = Path.home() / "lance-benchmarks-ci-datasets"
        if not local_dir.exists():
            local_dir.mkdir(parents=True, exist_ok=True)
        logging.info("Running locally, using %s", local_dir)
        return f"{local_dir}/"

    logging.info("Running on Google Cloud, using gs://lance-benchmarks-ci-datasets/")
    return "gs://lance-benchmarks-ci-datasets/"
|
||
|
||
def get_dataset_uri(name: str) -> str:
    """Given a dataset name, return the URI appropriate for the current environment.

    Raises:
        ValueError: if ``name`` is ``"image_eda"`` and we are not running on
            Google Cloud.
    """
    if name != "image_eda":
        return f"{_get_base_uri()}{name}"

    # image_eda is a custom-built dataset, on a unique bucket, that is too big
    # to reproduce locally.
    if _is_on_google():
        return "gs://lance-benchmarks-ci-datasets/image_eda.lance"
    raise ValueError("The image_eda dataset is only available on Google Cloud")