Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fully using new workspace #67

Merged
merged 24 commits into from
Dec 30, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
ed21a60
added (failing) test for test_tpch_data
wangpatrick57 Dec 29, 2024
91a6db3
wrote and passed test_tpch_data
wangpatrick57 Dec 29, 2024
0f00113
tpch data -> tables
wangpatrick57 Dec 29, 2024
74c32e2
now passing test_tpch_workload
wangpatrick57 Dec 29, 2024
8c2c4ac
added and passed test_job_tables
wangpatrick57 Dec 29, 2024
86f62eb
wrote and passed test_job_workload
wangpatrick57 Dec 29, 2024
883b183
fixed bug in workspace where it would always sleep for 1 second
wangpatrick57 Dec 29, 2024
664728d
can now run integtest_benchmark.py fully
wangpatrick57 Dec 29, 2024
e4235fe
gymlib integtests no longer crash when run together with other integt…
wangpatrick57 Dec 29, 2024
c430a90
fixed some paths in gymlib integtests
wangpatrick57 Dec 30, 2024
e0cde22
added and passed test_postgres_build
wangpatrick57 Dec 30, 2024
9d111d3
added and passed test_postgres_dbdata
wangpatrick57 Dec 30, 2024
85f40a0
now passing all integtests
wangpatrick57 Dec 30, 2024
395a6f8
fordpath/fpath/dpath -> path
wangpatrick57 Dec 30, 2024
1dcf669
dbms integtest now uses intended_dbdata_hardware
wangpatrick57 Dec 30, 2024
b5fd76e
fully removed the old open_and_save, save_file, and link_result
wangpatrick57 Dec 30, 2024
49364c9
num_times_created_this_run -> _num_times_created_this_run
wangpatrick57 Dec 30, 2024
2ffa8c7
rm lab.py
wangpatrick57 Dec 30, 2024
17f89ff
deleted all the cur_* functions from workspace fixed all uses of them
wangpatrick57 Dec 30, 2024
e3ac705
deleted append_group
wangpatrick57 Dec 30, 2024
f5c89e7
deleted more functions from workspace.py
wangpatrick57 Dec 30, 2024
e6ed566
refactored everything to use linkname helpers
wangpatrick57 Dec 30, 2024
388aeec
refactored how workspace works in gymlib tests
wangpatrick57 Dec 30, 2024
e3f7bfe
now resetting num times created in integtests
wangpatrick57 Dec 30, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
now passing test_tpch_workload
  • Loading branch information
wangpatrick57 committed Dec 29, 2024
commit 74c32e2cb039163365ba992da619ad821e810029
19 changes: 11 additions & 8 deletions benchmark/job/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,16 @@
from typing import Optional

import click
from gymlib.symlinks_paths import get_tables_dirname
from gymlib.symlinks_paths import (
get_tables_dirname,
get_workload_dirname,
get_workload_suffix,
)

from benchmark.constants import DEFAULT_SCALE_FACTOR
from util.log import DBGYM_LOGGER_NAME
from util.shell import subprocess_run
from util.workspace import (
DBGymWorkspace,
get_workload_name,
is_fully_resolved,
link_result,
)
from util.workspace import DBGymWorkspace, is_fully_resolved, link_result

JOB_TABLES_URL = "https://event.cwi.nl/da/job/imdb.tgz"
JOB_QUERIES_URL = "https://event.cwi.nl/da/job/job.tgz"
Expand Down Expand Up @@ -235,7 +234,11 @@ def _generate_job_workload(
dbgym_workspace: DBGymWorkspace,
query_subset: str,
) -> None:
workload_name = get_workload_name(DEFAULT_SCALE_FACTOR, query_subset)
workload_name = get_workload_dirname(
"job",
DEFAULT_SCALE_FACTOR,
get_workload_suffix("job", query_subset=query_subset),
)
expected_workload_symlink_dpath = dbgym_workspace.cur_symlinks_data_path(
mkdir=True
) / (workload_name + ".link")
Expand Down
44 changes: 38 additions & 6 deletions benchmark/tests/integtest_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,20 @@
import unittest
from pathlib import Path

from gymlib.symlinks_paths import get_tables_symlink_path
from gymlib.symlinks_paths import (
get_tables_symlink_path,
get_workload_suffix,
get_workload_symlink_path,
)

# It's ok to import private functions from the benchmark module because this is an integration test.
from benchmark.tpch.cli import _tpch_tables
from util.workspace import DBGymWorkspace, get_workspace_path_from_config
from benchmark.tpch.cli import _tpch_tables, _tpch_workload
from benchmark.tpch.constants import DEFAULT_TPCH_SEED
from util.workspace import (
DBGymWorkspace,
fully_resolve_path,
get_workspace_path_from_config,
)


class TestBenchmark(unittest.TestCase):
Expand All @@ -15,11 +24,13 @@ class TestBenchmark(unittest.TestCase):
def setUp(self) -> None:
workspace_path = get_workspace_path_from_config(TestBenchmark.DBGYM_CONFIG_PATH)
# Get a clean start each time.
shutil.rmtree(workspace_path)
if workspace_path.exists():
shutil.rmtree(workspace_path)
self.workspace = DBGymWorkspace(workspace_path)

# def tearDown(self) -> None:
# shutil.rmtree(self.workspace.dbgym_workspace_path)
def tearDown(self) -> None:
    """Remove the workspace directory used by the test, if it is still there."""
    workspace_path = self.workspace.dbgym_workspace_path
    # Nothing to clean up when the directory is already gone.
    if not workspace_path.exists():
        return
    shutil.rmtree(workspace_path)

def test_tpch_tables(self) -> None:
scale_factor = 0.01
Expand All @@ -29,6 +40,27 @@ def test_tpch_tables(self) -> None:
self.assertFalse(tables_path.exists())
_tpch_tables(self.workspace, scale_factor)
self.assertTrue(tables_path.exists())
self.assertTrue(fully_resolve_path(tables_path).exists())

def test_tpch_workload(self) -> None:
    """Check that _tpch_workload creates a workload link that fully resolves.

    A single seed (DEFAULT_TPCH_SEED for both start and end), the "all"
    query subset, and scale factor 0.01 are used.
    """
    scale_factor = 0.01
    seed = DEFAULT_TPCH_SEED
    query_subset = "all"

    suffix = get_workload_suffix(
        "tpch", seed_start=seed, seed_end=seed, query_subset=query_subset
    )
    workload_path = get_workload_symlink_path(
        self.workspace.dbgym_workspace_path, "tpch", scale_factor, suffix
    )

    # The link must not exist before generation ...
    self.assertFalse(workload_path.exists())
    _tpch_workload(self.workspace, seed, seed, query_subset, scale_factor)
    # ... and afterwards both the link and what it resolves to must exist.
    self.assertTrue(workload_path.exists())
    self.assertTrue(fully_resolve_path(workload_path).exists())


if __name__ == "__main__":
Expand Down
54 changes: 40 additions & 14 deletions benchmark/tpch/cli.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,21 @@
import logging

import click
from gymlib.symlinks_paths import get_tables_dirname, get_tables_symlink_path
from gymlib.symlinks_paths import (
get_tables_dirname,
get_tables_symlink_path,
get_workload_dirname,
get_workload_suffix,
)

from benchmark.constants import DEFAULT_SCALE_FACTOR
from benchmark.tpch.constants import DEFAULT_TPCH_SEED, NUM_TPCH_QUERIES
from util.log import DBGYM_LOGGER_NAME
from util.shell import subprocess_run
from util.workspace import (
DBGymWorkspace,
fully_resolve_path,
get_scale_factor_string,
get_workload_name,
is_fully_resolved,
link_result,
)
Expand Down Expand Up @@ -68,6 +73,19 @@ def tpch_workload(
query_subset: str,
scale_factor: float,
) -> None:
_tpch_workload(dbgym_workspace, seed_start, seed_end, query_subset, scale_factor)


def _tpch_workload(
dbgym_workspace: DBGymWorkspace,
seed_start: int,
seed_end: int,
query_subset: str,
scale_factor: float,
) -> None:
"""
This function exists as a hook for integration tests.
"""
assert (
seed_start <= seed_end
), f"seed_start ({seed_start}) must be <= seed_end ({seed_end})"
Expand Down Expand Up @@ -124,6 +142,7 @@ def _generate_tpch_queries(
dbgym_workspace.dbgym_this_run_path
/ _get_queries_dirname(seed, scale_factor)
)
queries_parent_path.mkdir(parents=False, exist_ok=False)
for i in range(1, NUM_TPCH_QUERIES + 1):
target_sql = (queries_parent_path / f"{i}.sql").resolve()
subprocess_run(
Expand Down Expand Up @@ -158,7 +177,7 @@ def _generate_tpch_tables(dbgym_workspace: DBGymWorkspace, scale_factor: float)
tables_parent_path = dbgym_workspace.dbgym_this_run_path / get_tables_dirname(
"tpch", scale_factor
)
tables_parent_path.mkdir(parents=True, exist_ok=False)
tables_parent_path.mkdir(parents=False, exist_ok=False)
subprocess_run(f"mv ./*.tbl {tables_parent_path}", cwd=tpch_kit_dpath / "dbgen")

tables_symlink_dpath = dbgym_workspace.link_result(tables_parent_path)
Expand All @@ -175,16 +194,22 @@ def _generate_tpch_workload(
query_subset: str,
scale_factor: float,
) -> None:
symlink_data_dpath = dbgym_workspace.cur_symlinks_data_path(mkdir=True)
workload_name = get_workload_name(
scale_factor, f"{seed_start}_{seed_end}_{query_subset}"
workload_name = get_workload_dirname(
"tpch",
scale_factor,
get_workload_suffix(
"tpch", seed_start=seed_start, seed_end=seed_end, query_subset=query_subset
),
)
expected_workload_symlink_dpath = dbgym_workspace.dbgym_cur_symlinks_path / (
workload_name + ".link"
)
expected_workload_symlink_dpath = symlink_data_dpath / (workload_name + ".link")

logging.getLogger(DBGYM_LOGGER_NAME).info(
f"Generating: {expected_workload_symlink_dpath}"
)
real_dpath = dbgym_workspace.cur_task_runs_data_path(workload_name, mkdir=True)
workload_path = dbgym_workspace.dbgym_this_run_path / workload_name
workload_path.mkdir(parents=False, exist_ok=False)

query_names = None
if query_subset == "all":
Expand All @@ -196,20 +221,21 @@ def _generate_tpch_workload(
else:
assert False

with open(real_dpath / "order.txt", "w") as f:
with open(workload_path / "order.txt", "w") as f:
for seed in range(seed_start, seed_end + 1):
queries_parent_path = dbgym_workspace.dbgym_cur_symlinks_path / (
_get_queries_dirname(seed, scale_factor) + ".link"
)

for qname in query_names:
sql_fpath = (
symlink_data_dpath
/ (_get_queries_dirname(seed, scale_factor) + ".link")
).resolve() / f"{qname}.sql"
sql_fpath = fully_resolve_path(queries_parent_path / f"{qname}.sql")
assert is_fully_resolved(
sql_fpath
), "We should only write existent real absolute paths to a file"
f.write(f"S{seed}-Q{qname},{sql_fpath}\n")
# TODO(WAN): add option to deep-copy the workload.

workload_symlink_dpath = link_result(dbgym_workspace, real_dpath)
workload_symlink_dpath = dbgym_workspace.link_result(workload_path)
assert workload_symlink_dpath == expected_workload_symlink_dpath
logging.getLogger(DBGYM_LOGGER_NAME).info(
f"Generated: {expected_workload_symlink_dpath}"
Expand Down
20 changes: 14 additions & 6 deletions env/tests/gymlib_integtest_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,18 @@
from pathlib import Path
from typing import Optional

# TODO: remove symlinks_paths from the import
from gymlib.symlinks_paths import get_workload_dirname, get_workload_suffix

from benchmark.tpch.constants import DEFAULT_TPCH_SEED
from env.tuning_artifacts import TuningMetadata
from util.workspace import (
DBGymWorkspace,
fully_resolve_path,
get_default_dbdata_parent_dpath,
get_default_pgbin_path,
get_default_pristine_dbdata_snapshot_path,
get_default_workload_name_suffix,
get_default_workload_path,
get_workload_name,
get_workspace_path_from_config,
)

Expand Down Expand Up @@ -75,16 +77,22 @@ def get_dbgym_workspace() -> DBGymWorkspace:
@staticmethod
def get_default_metadata() -> TuningMetadata:
dbgym_workspace = GymlibIntegtestManager.get_dbgym_workspace()
assert GymlibIntegtestManager.BENCHMARK == "tpch"
suffix = get_workload_suffix(
GymlibIntegtestManager.BENCHMARK,
seed_start=DEFAULT_TPCH_SEED,
seed_end=DEFAULT_TPCH_SEED,
query_subset="all",
)
return TuningMetadata(
workload_path=fully_resolve_path(
get_default_workload_path(
dbgym_workspace.dbgym_workspace_path,
GymlibIntegtestManager.BENCHMARK,
get_workload_name(
get_workload_dirname(
GymlibIntegtestManager.BENCHMARK,
GymlibIntegtestManager.SCALE_FACTOR,
get_default_workload_name_suffix(
GymlibIntegtestManager.BENCHMARK
),
suffix,
),
),
),
Expand Down
18 changes: 2 additions & 16 deletions env/tests/integtest_workload.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,7 @@
from benchmark.tpch.constants import DEFAULT_TPCH_SEED, NUM_TPCH_QUERIES
from env.tests.gymlib_integtest_util import GymlibIntegtestManager
from env.workload import Workload
from util.workspace import (
fully_resolve_path,
get_default_workload_name_suffix,
get_default_workload_path,
get_workload_name,
)
from util.workspace import fully_resolve_path, get_default_workload_path


class WorkloadTests(unittest.TestCase):
Expand All @@ -17,16 +12,7 @@ def setUpClass() -> None:
GymlibIntegtestManager.set_up_workspace()

def test_workload(self) -> None:
workload_dpath = fully_resolve_path(
get_default_workload_path(
GymlibIntegtestManager.get_dbgym_workspace().dbgym_workspace_path,
GymlibIntegtestManager.BENCHMARK,
get_workload_name(
GymlibIntegtestManager.SCALE_FACTOR,
get_default_workload_name_suffix(GymlibIntegtestManager.BENCHMARK),
),
),
)
workload_dpath = GymlibIntegtestManager.get_default_metadata().workload_path

workload = Workload(
GymlibIntegtestManager.get_dbgym_workspace(), workload_dpath
Expand Down
31 changes: 29 additions & 2 deletions gymlib_package/gymlib/symlinks_paths.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from pathlib import Path
from typing import Any

# TODO: move these into workspace.py and move workspace.py into gymlib.
SYMLINKS_DNAME = "symlinks"
Expand All @@ -20,7 +21,22 @@ def get_scale_factor_string(scale_factor: float | str) -> str:


def get_tables_dirname(benchmark: str, scale_factor: float | str) -> str:
return f"{benchmark}_sf{get_scale_factor_string(scale_factor)}_tables"
return f"tables_{benchmark}_sf{get_scale_factor_string(scale_factor)}"


def get_workload_suffix(benchmark: str, **kwargs: Any) -> str:
    """Return the benchmark-specific suffix used in workload directory names.

    Args:
        benchmark: Either "tpch" or "job".
        **kwargs: Exactly the keyword arguments the benchmark requires:
            "tpch" takes ``seed_start``, ``seed_end`` and ``query_subset``;
            "job" takes only ``query_subset``.

    Returns:
        The required values formatted and joined with underscores, e.g.
        ``"15721_15721_all"`` for tpch or ``"all"`` for job.

    Raises:
        AssertionError: If ``benchmark`` is not supported or the keyword
            names do not match exactly what the benchmark requires.
    """
    # Required keyword names per benchmark, in the order they appear in the suffix.
    required_keys_by_benchmark = {
        "tpch": ("seed_start", "seed_end", "query_subset"),
        "job": ("query_subset",),
    }

    # Raise explicitly instead of using `assert`: assert statements are removed
    # under `python -O`, which would let an unknown benchmark fall through and
    # return None. AssertionError is kept so the exception type is unchanged.
    if benchmark not in required_keys_by_benchmark:
        raise AssertionError(
            f"Unsupported benchmark {benchmark!r}; "
            f"expected one of {sorted(required_keys_by_benchmark)}"
        )

    required_keys = required_keys_by_benchmark[benchmark]
    if kwargs.keys() != set(required_keys):
        raise AssertionError(
            f"Benchmark {benchmark!r} requires exactly the keyword arguments "
            f"{sorted(required_keys)}, got {sorted(kwargs)}"
        )

    return "_".join(f"{kwargs[key]}" for key in required_keys)


def get_workload_dirname(benchmark: str, scale_factor: float | str, suffix: str) -> str:
    """Build the workload directory name for a benchmark, scale factor and suffix.

    The result has the form ``workload_<benchmark>_sf<scale>_<suffix>``, where
    ``<scale>`` is whatever get_scale_factor_string() returns for ``scale_factor``.
    """
    scale_factor_string = get_scale_factor_string(scale_factor)
    return f"workload_{benchmark}_sf{scale_factor_string}_{suffix}"


def get_tables_symlink_path(
Expand All @@ -30,5 +46,16 @@ def get_tables_symlink_path(
workspace_path
/ SYMLINKS_DNAME
/ DBGYM_APP_NAME
/ get_tables_dirname(benchmark, scale_factor)
/ (get_tables_dirname(benchmark, scale_factor) + ".link")
)


def get_workload_symlink_path(
    workspace_path: Path, benchmark: str, scale_factor: float | str, suffix: str
) -> Path:
    """Return the path of a workload's ``.link`` entry inside the workspace.

    The entry lives under ``workspace_path / SYMLINKS_DNAME / DBGYM_APP_NAME``
    and is named after get_workload_dirname() with ".link" appended.
    """
    symlinks_parent_path = workspace_path / SYMLINKS_DNAME / DBGYM_APP_NAME
    link_name = get_workload_dirname(benchmark, scale_factor, suffix) + ".link"
    return symlinks_parent_path / link_name
20 changes: 0 additions & 20 deletions util/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,22 +53,6 @@ def get_latest_run_path_from_workspace_path(workspace_path: Path) -> Path:
DEFAULT_BOOT_CONFIG_FPATH = POSTGRES_PATH / "default_boot_config.yaml"


# Generally useful functions


def get_default_workload_name_suffix(benchmark_name: str) -> str:
if benchmark_name == "tpch":
return f"{DEFAULT_TPCH_SEED}_{DEFAULT_TPCH_SEED}_all"
if benchmark_name == "job":
return "all"
else:
assert False


# Standard names of files/directories. These can refer to either the actual file/directory or a link to the file/directory.
# Since they can refer to either the actual or the link, they do not have ".link" in them.


# Paths of dependencies in the workspace. These are named "*_path" because they will be an absolute path
# The reason these _cannot_ be relative paths is because relative paths are relative to the codebase root, not the workspace root
# Note that it's okay to hardcode the codebase paths (like dbgym_dbms_postgres) here. In the worst case, we'll just break an
Expand Down Expand Up @@ -110,10 +94,6 @@ def get_dbdata_tgz_filename(benchmark_name: str, scale_factor: float | str) -> s
return f"{benchmark_name}_sf{get_scale_factor_string(scale_factor)}_pristine_dbdata.tgz"


def get_workload_name(scale_factor: float | str, suffix: str) -> str:
return f"workload_sf{get_scale_factor_string(scale_factor)}_{suffix}"


def get_default_pristine_dbdata_snapshot_path(
workspace_path: Path, benchmark_name: str, scale_factor: float | str
) -> Path:
Expand Down