Redesigned logging (cmu-db#40)
**Summary**: Redesigned logging across the whole project. Also fixed
some bugs to pass CI.

**Demo**:
![Screenshot 2024-10-20 at 12 37 33](https://github.com/user-attachments/assets/6052f7e0-74cf-4593-be56-c57b7a636b49)
The log files are archived in `run_*/dbgym/artifacts/`. There are logs from
dbgym and from third-party libraries. You can see that all logs, even
info-level logs, are captured in `dbgym.log`.

**Logging Design**:
* I was having difficulty debugging some bugs because the console was
too cluttered. This motivated me to redesign logging.
* I removed all class-name loggers. All of these loggers behaved the same
(at least as far as I could tell), so it's simpler to have them all use the
same logger.
* We use the loggers `dbgym`, `dbgym.output`, `dbgym.replay`, and the
root logger.
* `dbgym` is the "base" logger and should be used most of the time. It
outputs errors to the console and all logs to the file
`run_*/dbgym/artifacts/dbgym.log`.
* `dbgym.output` is used when you actually want to output something to
show the user. It just outputs the message straight to the console
without any extra metadata. As a child of `dbgym`, anything logged here
will also be propagated to `dbgym` and thus archived in `dbgym.log`.
* `dbgym.replay` is specific to Proto-X and is where Proto-X stores log
information that is only relevant to replay. By making it its own logger, we
insulate it from any changes to the main logging system.
* The root logger is used to help debug unit tests. Unit tests are
isolated from the main logging system for simplicity. See
`test_clean.py` for an example of this.
* Certain third-party loggers like `ray` are redirected to a file to
reduce console clutter.
* I kept the ray dashboard in the console though because it's pretty
useful.
* `print()` is reserved for actual debugging.
* I redirected `warnings` to a separate file too to further reduce
clutter.
* I added special handling to eliminate the warnings that show up every
time tensorflow is imported (see `task.py` for an example of this). A sketch
of the overall logging setup follows this list.
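
A minimal sketch of what this logger setup might look like in code. The constant names (`DBGYM_LOGGER_NAME`, `DBGYM_OUTPUT_LOGGER_NAME`) match the ones used in the diff below, but the handler and formatter details are illustrative assumptions, not a verbatim copy of `util/log.py`:

```python
import logging
from pathlib import Path

DBGYM_LOGGER_NAME = "dbgym"
DBGYM_OUTPUT_LOGGER_NAME = f"{DBGYM_LOGGER_NAME}.output"


def set_up_loggers(artifacts_dpath: Path) -> None:
    # "Base" logger: errors go to the console, everything goes to dbgym.log.
    base_logger = logging.getLogger(DBGYM_LOGGER_NAME)
    base_logger.setLevel(logging.DEBUG)

    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.ERROR)
    base_logger.addHandler(console_handler)

    file_handler = logging.FileHandler(artifacts_dpath / "dbgym.log")
    file_handler.setFormatter(
        logging.Formatter("%(asctime)s %(name)s %(levelname)s %(message)s")
    )
    base_logger.addHandler(file_handler)

    # Output logger: plain messages straight to the console, no extra metadata.
    # Because it is a child of "dbgym", its records also propagate up and land
    # in dbgym.log (the base console handler only shows ERROR and above, so
    # they are not printed twice).
    output_logger = logging.getLogger(DBGYM_OUTPUT_LOGGER_NAME)
    output_console_handler = logging.StreamHandler()
    output_console_handler.setFormatter(logging.Formatter("%(message)s"))
    output_logger.addHandler(output_console_handler)

    # Redirect noisy third-party loggers (e.g. ray) to their own files so they
    # don't clutter the console.
    for third_party_name in ["ray"]:
        third_party_logger = logging.getLogger(third_party_name)
        third_party_logger.handlers.clear()
        third_party_logger.addHandler(
            logging.FileHandler(artifacts_dpath / f"{third_party_name}.log")
        )
        third_party_logger.propagate = False

    # Route warnings.warn() through the "py.warnings" logger and into a file.
    logging.captureWarnings(True)
    warnings_logger = logging.getLogger("py.warnings")
    warnings_logger.addHandler(logging.FileHandler(artifacts_dpath / "warnings.log"))
    warnings_logger.propagate = False
```

Modules then log through `logging.getLogger(DBGYM_LOGGER_NAME).info(...)` (as the diff below shows), and user-facing output goes through `DBGYM_OUTPUT_LOGGER_NAME`. For the tensorflow import-time warnings, one common approach (again an assumption, not necessarily exactly what `task.py` does) is:

```python
import os
import warnings

# Hide TensorFlow's C++ INFO/WARNING chatter; must be set before the import.
os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "2")
with warnings.catch_warnings():
    warnings.simplefilter("ignore")  # hide Python-level warnings raised during import
    import tensorflow  # noqa: F401
```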

**Other Details**:
* Upgraded nccl to version 2.20.* in requirements.txt to fix an import
error.
* Embedding datagen was not working. I added additional unit tests to
help me debug this.
* Made workload_tests.py more robust by checking fields other than the class
mapping. This is done by saving reference `Workload` and `IndexSpace`
objects as `.pkl` files (a sketch of this approach follows this list).
* Verified that replay still works (since it relies on log files).
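
A hedged sketch of the reference-object approach used in `workload_tests.py`. The `Workload` dataclass, fixture path, and field names here are placeholders, not the project's real classes:

```python
import pickle
import unittest
from dataclasses import dataclass
from pathlib import Path

REFERENCE_FPATH = Path("tests/fixtures/reference_workload.pkl")  # placeholder path


@dataclass
class Workload:  # stand-in for the real Workload/IndexSpace objects
    queries: list[str]
    scale_factor: float


def save_reference(workload: Workload) -> None:
    # Run once to (re)generate the reference .pkl fixture.
    REFERENCE_FPATH.parent.mkdir(parents=True, exist_ok=True)
    with REFERENCE_FPATH.open("wb") as f:
        pickle.dump(workload, f)


class WorkloadTests(unittest.TestCase):
    def test_matches_reference(self) -> None:
        built = Workload(queries=["S1-Q1", "S1-Q2"], scale_factor=1.0)
        with REFERENCE_FPATH.open("rb") as f:
            reference = pickle.load(f)
        # Dataclass equality compares every field, not just the class mapping.
        self.assertEqual(built, reference)
```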
wangpatrick57 authored Oct 23, 2024
1 parent ac849f8 commit 9e652d1
Showing 178 changed files with 679 additions and 1,689 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/tests_ci.yml
@@ -38,7 +38,7 @@ jobs:
- name: Static type checking
run: |
mypy --config-file scripts/mypy.ini .
./scripts/mypy.sh
- name: Run unit tests
run: |
@@ -47,7 +47,7 @@ jobs:
- name: Run integration tests
# Delete the workspace. Run once with a clean workspace. Run again from the existing workspace.
# Need to run with a non-root user in order to start Postgres.
# Note that we need to run with a non-root user in order to start Postgres.
run: |
. "$HOME/.cargo/env"
rm -rf ../dbgym_integtest_workspace
37 changes: 22 additions & 15 deletions benchmark/tpch/cli.py
@@ -9,11 +9,9 @@
link_result,
workload_name_fn,
)
from util.log import DBGYM_LOGGER_NAME
from util.shell import subprocess_run

benchmark_tpch_logger = logging.getLogger("benchmark/tpch")
benchmark_tpch_logger.setLevel(logging.INFO)


@click.group(name="tpch")
@click.pass_obj
@@ -75,17 +73,19 @@ def _clone(dbgym_cfg: DBGymConfig) -> None:
dbgym_cfg.cur_symlinks_build_path(mkdir=True) / "tpch-kit.link"
)
if expected_symlink_dpath.exists():
benchmark_tpch_logger.info(f"Skipping clone: {expected_symlink_dpath}")
logging.getLogger(DBGYM_LOGGER_NAME).info(
f"Skipping clone: {expected_symlink_dpath}"
)
return

benchmark_tpch_logger.info(f"Cloning: {expected_symlink_dpath}")
logging.getLogger(DBGYM_LOGGER_NAME).info(f"Cloning: {expected_symlink_dpath}")
real_build_path = dbgym_cfg.cur_task_runs_build_path()
subprocess_run(
f"./tpch_setup.sh {real_build_path}", cwd=dbgym_cfg.cur_source_path()
)
symlink_dpath = link_result(dbgym_cfg, real_build_path / "tpch-kit")
assert expected_symlink_dpath.samefile(symlink_dpath)
benchmark_tpch_logger.info(f"Cloned: {expected_symlink_dpath}")
logging.getLogger(DBGYM_LOGGER_NAME).info(f"Cloned: {expected_symlink_dpath}")


def _get_tpch_kit_dpath(dbgym_cfg: DBGymConfig) -> Path:
@@ -103,7 +103,7 @@ def _generate_queries(
) -> None:
tpch_kit_dpath = _get_tpch_kit_dpath(dbgym_cfg)
data_path = dbgym_cfg.cur_symlinks_data_path(mkdir=True)
benchmark_tpch_logger.info(
logging.getLogger(DBGYM_LOGGER_NAME).info(
f"Generating queries: {data_path} [{seed_start}, {seed_end}]"
)
for seed in range(seed_start, seed_end + 1):
@@ -125,7 +125,7 @@ def _generate_queries(
)
queries_symlink_dpath = link_result(dbgym_cfg, real_dir)
assert queries_symlink_dpath.samefile(expected_queries_symlink_dpath)
benchmark_tpch_logger.info(
logging.getLogger(DBGYM_LOGGER_NAME).info(
f"Generated queries: {data_path} [{seed_start}, {seed_end}]"
)

@@ -137,12 +137,14 @@ def _generate_data(dbgym_cfg: DBGymConfig, scale_factor: float) -> None:
data_path / f"tables_sf{get_scale_factor_string(scale_factor)}.link"
)
if expected_tables_symlink_dpath.exists():
benchmark_tpch_logger.info(
logging.getLogger(DBGYM_LOGGER_NAME).info(
f"Skipping generation: {expected_tables_symlink_dpath}"
)
return

benchmark_tpch_logger.info(f"Generating: {expected_tables_symlink_dpath}")
logging.getLogger(DBGYM_LOGGER_NAME).info(
f"Generating: {expected_tables_symlink_dpath}"
)
subprocess_run(f"./dbgen -vf -s {scale_factor}", cwd=tpch_kit_dpath / "dbgen")
real_dir = dbgym_cfg.cur_task_runs_data_path(
f"tables_sf{get_scale_factor_string(scale_factor)}", mkdir=True
@@ -151,7 +153,9 @@ def _generate_data(dbgym_cfg: DBGymConfig, scale_factor: float) -> None:

tables_symlink_dpath = link_result(dbgym_cfg, real_dir)
assert tables_symlink_dpath.samefile(expected_tables_symlink_dpath)
benchmark_tpch_logger.info(f"Generated: {expected_tables_symlink_dpath}")
logging.getLogger(DBGYM_LOGGER_NAME).info(
f"Generated: {expected_tables_symlink_dpath}"
)


def _generate_workload(
@@ -165,7 +169,9 @@ def _generate_workload(
workload_name = workload_name_fn(scale_factor, seed_start, seed_end, query_subset)
expected_workload_symlink_dpath = symlink_data_dpath / (workload_name + ".link")

benchmark_tpch_logger.info(f"Generating: {expected_workload_symlink_dpath}")
logging.getLogger(DBGYM_LOGGER_NAME).info(
f"Generating: {expected_workload_symlink_dpath}"
)
real_dpath = dbgym_cfg.cur_task_runs_data_path(workload_name, mkdir=True)

queries = None
@@ -190,10 +196,11 @@
and not sql_fpath.is_symlink()
and sql_fpath.is_absolute()
), "We should only write existent real absolute paths to a file"
output = ",".join([f"S{seed}-Q{qnum}", str(sql_fpath)])
print(output, file=f)
f.write(f"S{seed}-Q{qnum},{sql_fpath}\n")
# TODO(WAN): add option to deep-copy the workload.

workload_symlink_dpath = link_result(dbgym_cfg, real_dpath)
assert workload_symlink_dpath == expected_workload_symlink_dpath
benchmark_tpch_logger.info(f"Generated: {expected_workload_symlink_dpath}")
logging.getLogger(DBGYM_LOGGER_NAME).info(
f"Generated: {expected_workload_symlink_dpath}"
)
18 changes: 11 additions & 7 deletions dbms/postgres/cli.py
@@ -29,6 +29,7 @@
open_and_save,
save_file,
)
from util.log import DBGYM_LOGGER_NAME
from util.pg import (
DBGYM_POSTGRES_DBNAME,
DBGYM_POSTGRES_PASS,
@@ -42,9 +43,6 @@
)
from util.shell import subprocess_run

dbms_postgres_logger = logging.getLogger("dbms/postgres")
dbms_postgres_logger.setLevel(logging.INFO)


@click.group(name="postgres")
@click.pass_obj
@@ -142,12 +140,14 @@ def _get_repo_symlink_path(dbgym_cfg: DBGymConfig) -> Path:
def _build_repo(dbgym_cfg: DBGymConfig, rebuild: bool) -> None:
expected_repo_symlink_dpath = _get_repo_symlink_path(dbgym_cfg)
if not rebuild and expected_repo_symlink_dpath.exists():
dbms_postgres_logger.info(
logging.getLogger(DBGYM_LOGGER_NAME).info(
f"Skipping _build_repo: {expected_repo_symlink_dpath}"
)
return

dbms_postgres_logger.info(f"Setting up repo in {expected_repo_symlink_dpath}")
logging.getLogger(DBGYM_LOGGER_NAME).info(
f"Setting up repo in {expected_repo_symlink_dpath}"
)
repo_real_dpath = dbgym_cfg.cur_task_runs_build_path("repo", mkdir=True)
subprocess_run(
f"./build_repo.sh {repo_real_dpath}", cwd=dbgym_cfg.cur_source_path()
@@ -156,7 +156,9 @@ def _build_repo(dbgym_cfg: DBGymConfig, rebuild: bool) -> None:
# only link at the end so that the link only ever points to a complete repo
repo_symlink_dpath = link_result(dbgym_cfg, repo_real_dpath)
assert expected_repo_symlink_dpath.samefile(repo_symlink_dpath)
dbms_postgres_logger.info(f"Set up repo in {expected_repo_symlink_dpath}")
logging.getLogger(DBGYM_LOGGER_NAME).info(
f"Set up repo in {expected_repo_symlink_dpath}"
)


def _create_dbdata(
@@ -207,7 +209,9 @@ def _create_dbdata(
# Create symlink.
# Only link at the end so that the link only ever points to a complete dbdata.
dbdata_tgz_symlink_path = link_result(dbgym_cfg, dbdata_tgz_real_fpath)
dbms_postgres_logger.info(f"Created dbdata in {dbdata_tgz_symlink_path}")
logging.getLogger(DBGYM_LOGGER_NAME).info(
f"Created dbdata in {dbdata_tgz_symlink_path}"
)


def _generic_dbdata_setup(dbgym_cfg: DBGymConfig) -> None:
6 changes: 5 additions & 1 deletion dependencies/requirements.txt
@@ -1,5 +1,6 @@
absl-py==2.1.0
aiosignal==1.3.1
astroid==3.2.4
astunparse==1.6.3
async-timeout==4.0.3
attrs==23.2.0
@@ -11,6 +12,7 @@ click==8.1.7
cloudpickle==3.0.0
cmake==3.28.1
cramjam==2.8.1
dill==0.3.8
distlib==0.3.8
faiss-gpu==1.7.2
Farama-Notifications==0.0.4
@@ -42,6 +44,7 @@ libclang==16.0.6
lit==17.0.6
Markdown==3.5.2
MarkupSafe==2.1.4
mccabe==0.7.0
ml-dtypes==0.2.0
mpmath==1.3.0
msgpack==1.0.7
@@ -67,7 +70,7 @@ nvidia-cusolver-cu11==11.4.0.1
nvidia-cusolver-cu12==11.4.5.107
nvidia-cusparse-cu11==11.7.4.91
nvidia-cusparse-cu12==12.1.0.106
nvidia-nccl-cu11==2.14.3
nvidia-nccl-cu11==2.20.5
nvidia-nccl-cu12==2.20.5
nvidia-nvjitlink-cu12==12.3.101
nvidia-nvtx-cu11==11.7.91
@@ -116,6 +119,7 @@ tensorflow-io-gcs-filesystem==0.36.0
termcolor==2.4.0
threadpoolctl==3.2.0
tomli==2.0.1
tomlkit==0.13.2
torch==2.4.0
tqdm==4.66.1
triton==3.0.0
27 changes: 27 additions & 0 deletions experiments/protox_tpch_sf1/main.sh
@@ -0,0 +1,27 @@
#!/bin/bash

set -euxo pipefail

SCALE_FACTOR=1
INTENDED_DBDATA_HARDWARE=ssd
. ./experiments/load_per_machine_envvars.sh

# space for testing. uncomment this to run individual commands from the script (copy pasting is harder because there are envvars)
python3 task.py tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --max-concurrent 4 --tune-duration-during-hpo 1 --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH --build-space-good-for-boot
exit 0

# benchmark
python3 task.py benchmark tpch data $SCALE_FACTOR
python3 task.py benchmark tpch workload --scale-factor $SCALE_FACTOR

# postgres
python3 task.py dbms postgres build
python3 task.py dbms postgres dbdata tpch --scale-factor $SCALE_FACTOR --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH

# embedding
python3 task.py tune protox embedding datagen tpch --scale-factor $SCALE_FACTOR --override-sample-limits "lineitem,32768" --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH
python3 task.py tune protox embedding train tpch --scale-factor $SCALE_FACTOR --train-max-concurrent 10

# agent
python3 task.py tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --max-concurrent 4 --tune-duration-during-hpo 4 --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH --build-space-good-for-boot
python3 task.py tune protox agent tune tpch --scale-factor $SCALE_FACTOR
10 changes: 4 additions & 6 deletions manage/cli.py
@@ -13,9 +13,7 @@
is_child_path,
parent_dpath_of_path,
)

task_logger = logging.getLogger("task")
task_logger.setLevel(logging.INFO)
from util.log import DBGYM_LOGGER_NAME, DBGYM_OUTPUT_LOGGER_NAME


# This is used in test_clean.py. It's defined here to avoid a circular import.
@@ -49,7 +47,7 @@ def manage_clean(dbgym_cfg: DBGymConfig, mode: str) -> None:
@click.pass_obj
def manage_count(dbgym_cfg: DBGymConfig) -> None:
num_files = _count_files_in_workspace(dbgym_cfg)
print(
logging.getLogger(DBGYM_OUTPUT_LOGGER_NAME).info(
f"The workspace ({dbgym_cfg.dbgym_workspace_path}) has {num_files} total files/dirs/symlinks."
)

@@ -184,10 +182,10 @@ def clean_workspace(
ending_num_files = _count_files_in_workspace(dbgym_cfg)

if verbose:
task_logger.info(
logging.getLogger(DBGYM_LOGGER_NAME).info(
f"Removed {starting_num_files - ending_num_files} out of {starting_num_files} files"
)
task_logger.info(
logging.getLogger(DBGYM_LOGGER_NAME).info(
f"Workspace went from {starting_num_files - ending_num_files} to {starting_num_files}"
)

11 changes: 4 additions & 7 deletions manage/tests/test_clean.py
@@ -11,19 +11,16 @@

# This is here instead of on `if __name__ == "__main__"` because we often run individual tests, which
# does not go through the `if __name__ == "__main__"` codepath.
# Make it DEBUG to see logs from verify_structure(). Make it INFO to not see logs.
logging.basicConfig(level=logging.INFO)
# Make it DEBUG to see logs from verify_structure(). Make it CRITICAL to not see any logs.
# We use the root logger for unit tests to keep it separate from the standard logging subsystem which
# uses the dbgym.* loggers.
logging.basicConfig(level=logging.CRITICAL)


FilesystemStructure = NewType("FilesystemStructure", dict[str, Any])


class CleanTests(unittest.TestCase):
"""
I deemed "clean" important enough to write extensive unit tests for because a bug could lead to
losing important files.
"""

scratchspace_path: Path = Path()

@staticmethod
8 changes: 5 additions & 3 deletions misc/utils.py
@@ -1,3 +1,4 @@
import logging
import os
import shutil
import subprocess
@@ -9,6 +10,7 @@
import redis
import yaml

from util.log import DBGYM_LOGGER_NAME
from util.shell import subprocess_run

# Enums
@@ -107,8 +109,8 @@ def get_dbdata_tgz_name(benchmark_name: str, scale_factor: float | str) -> str:
# - If a name already has the workload_name, I omit scale factor. This is because the workload_name includes the scale factor
# - By convention, symlinks should end with ".link". The bug that motivated this decision involved replaying a tuning run. When
# replaying a tuning run, you read the tuning_steps/ folder of the tuning run. Earlier, I created a symlink to that tuning_steps/
# folder called run_*/dbgym_agent_protox_tune/tuning_steps. However, replay itself generates an output.log file, which goes in
# run_*/dbgym_agent_protox_tune/tuning_steps/. The bug was that my replay function was overwriting the output.log file of the
# folder called run_*/dbgym_agent_protox_tune/tuning_steps. However, replay itself generates an replay_info.log file, which goes in
# run_*/dbgym_agent_protox_tune/tuning_steps/. The bug was that my replay function was overwriting the replay_info.log file of the
# tuning run. By naming all symlinks "*.link", we avoid the possibility of subtle bugs like this happening.
default_traindata_path: Callable[[Path, str, str], Path] = (
lambda workspace_path, benchmark_name, workload_name: get_symlinks_path_from_workspace_path(
@@ -674,5 +676,5 @@ def is_ssd(path: Path) -> bool:
return is_ssd
return False
except Exception as e:
print(f"An error occurred: {e}")
logging.getLogger(DBGYM_LOGGER_NAME).error(f"An error occurred: {e}")
return False
2 changes: 2 additions & 0 deletions scripts/mypy.sh
@@ -0,0 +1,2 @@
#!/bin/bash
mypy --config-file scripts/mypy.ini .
6 changes: 1 addition & 5 deletions scripts/pat_test.sh
@@ -7,9 +7,7 @@ INTENDED_DBDATA_HARDWARE=ssd
. ./experiments/load_per_machine_envvars.sh

# space for testing. uncomment this to run individual commands from the script (copy pasting is harder because there are envvars)
python3 task.py tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --num-samples 2 --max-concurrent 2 --workload-timeout 15 --query-timeout 1 --tune-duration-during-hpo 0.01 --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH --build-space-good-for-boot
python3 task.py tune protox agent tune tpch --scale-factor $SCALE_FACTOR --tune-duration-during-tune 0.02
python3 task.py tune protox agent replay tpch --scale-factor $SCALE_FACTOR
python3 task.py tune protox embedding train tpch --scale-factor $SCALE_FACTOR --iterations-per-epoch 1 --num-points-to-sample 1 --num-batches 1 --batch-size 64 --start-epoch 15 --num-samples 4 --train-max-concurrent 4 --num-curate 2
exit 0

# benchmark
@@ -20,8 +18,6 @@ python3 task.py benchmark tpch workload --scale-factor $SCALE_FACTOR
python3 task.py dbms postgres build
python3 task.py dbms postgres dbdata tpch --scale-factor $SCALE_FACTOR --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH

exit 0

# embedding
# python3 task.py tune protox embedding datagen tpch --scale-factor $SCALE_FACTOR --default-sample-limit 64 --file-limit 64 --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH # short datagen for testing
python3 task.py tune protox embedding datagen tpch --scale-factor $SCALE_FACTOR --override-sample-limits "lineitem,32768" --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH # long datagen so that train doesn't crash
15 changes: 9 additions & 6 deletions scripts/read_parquet.py
@@ -1,21 +1,24 @@
import logging
import sys
from pathlib import Path

import pandas as pd

from util.log import DBGYM_OUTPUT_LOGGER_NAME

def read_and_print_parquet(file_path: Path) -> None:

def read_and_output_parquet(file_path: Path) -> None:
# Read the Parquet file into a DataFrame
df = pd.read_parquet(file_path)

# Print the DataFrame
print("DataFrame:")
print(df)
# Output the DataFrame
logging.getLogger(DBGYM_OUTPUT_LOGGER_NAME).info("DataFrame:")
logging.getLogger(DBGYM_OUTPUT_LOGGER_NAME).info(df)


if __name__ == "__main__":
# Specify the path to the Parquet file
parquet_file_path = Path(sys.argv[0])

# Call the function to read and print the Parquet file
read_and_print_parquet(parquet_file_path)
# Call the function to read and output the Parquet file
read_and_output_parquet(parquet_file_path)