merged with main
wangpatrick57 committed Jul 8, 2024
2 parents 3dbeac0 + 2f17bd4 commit 2a838cb
Showing 43 changed files with 1,508 additions and 838 deletions.
87 changes: 86 additions & 1 deletion README.md
@@ -1 +1,86 @@
# Database Gym
# 🛢️ Database Gym 🏋️
[\[Slides\]](http://www.cidrdb.org/cidr2023/slides/p27-lim-slides.pdf) [\[Paper\]](https://www.cidrdb.org/cidr2023/papers/p27-lim.pdf)

*An end-to-end research vehicle for the field of self-driving DBMSs.*

## Quickstart

These steps were tested on a fresh repository clone running Ubuntu 22.04.

```
# Setup dependencies.
# You may want to create a Python virtual environment (e.g. with conda) before doing this.
./dependency/install_dependencies.sh
# Compile a custom fork of PostgreSQL, load TPC-H (SF 0.01), train the Proto-X agent, and tune.
./scripts/quickstart.sh postgres tpch 0.01 protox
```

## Overview

Autonomous DBMS research often involves more engineering than research.
As new advances in the state of the art are made, it is common to find that each one
reimplements the database tuning pipeline from scratch: workload capture, database setup,
training data collection, model creation, model deployment, and more.
Moreover, these bespoke pipelines make it difficult to combine different techniques even when they
should be independent (e.g., using a different operator latency model in a tuning algorithm).

The database gym project is our attempt to standardize the APIs between these disparate tasks,
allowing researchers to mix and match the different pipeline components.
It draws inspiration from the Farama Foundation's Gymnasium (formerly OpenAI Gym), which
accelerates the development and comparison of reinforcement learning algorithms by providing a set
of agents, environments, and a standardized API for communicating between them.
Through the database gym, we hope to save other people time and reimplementation effort by
providing an extensible open-source platform for autonomous DBMS research.
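
For readers unfamiliar with that API, the contract Gymnasium standardizes looks roughly like the following minimal sketch (plain Gymnasium code using its CartPole example, not database gym code):

```
import gymnasium as gym

# Standard Gymnasium loop: the agent and environment communicate only
# through reset()/step(), the style of interface this project adopts.
env = gym.make("CartPole-v1")
obs, info = env.reset(seed=42)
for _ in range(100):
    action = env.action_space.sample()  # a real agent would choose the action here
    obs, reward, terminated, truncated, info = env.step(action)
    if terminated or truncated:
        obs, info = env.reset()
env.close()
```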

This project is under active development.
Currently, we decompose the database tuning pipeline into the following components:

1. Workload: collection, forecasting, synthesis
2. Database: database loading, instrumentation, orchestrating workload execution
3. Agent: identifying tuning actions, suggesting an action

## Repository Structure

`task.py` is the entrypoint for all tasks.
The tasks are grouped into categories that correspond to the top-level directories of the repository:

- `benchmark` - tasks to generate data and queries for different benchmarks (e.g., TPC-H, JOB)
- `dbms` - tasks to build and start DBMSs (e.g., PostgreSQL)
- `tune` - tasks to train autonomous database tuning agents
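
As a hypothetical sketch of how these categories might be wired together (the actual composition lives in `task.py`; only `tpch_group` below comes from this repository's `benchmark/tpch/cli.py`, the other names are illustrative):

```
import click

# Hypothetical sketch: compose per-directory click groups under one
# entrypoint. Only tpch_group is taken from the repository; task and
# benchmark_group are assumptions for illustration.
from benchmark.tpch.cli import tpch_group

@click.group()
def task():
    """Entrypoint for all tasks."""

@task.group(name="benchmark")
def benchmark_group():
    """Tasks that generate benchmark data and queries."""

benchmark_group.add_command(tpch_group)

if __name__ == "__main__":
    task()
```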

## Credits

The Database Gym project rose from the ashes of the [NoisePage](https://db.cs.cmu.edu/projects/noisepage/) self-driving DBMS project.

The first prototype was written by [Patrick Wang](https://github.com/wangpatrick57), integrating [Boot (VLDB 2024)](https://github.com/lmwnshn/boot) and [Proto-X (VLDB 2024)](https://github.com/17zhangw/protox) into a cohesive system.

## Citing This Repository

If you use this repository in an academic paper, please cite:

```
@inproceedings{lim23,
author = {Lim, Wan Shen and Butrovich, Matthew and Zhang, William and Crotty, Andrew and Ma, Lin and Xu, Peijing and Gehrke, Johannes and Pavlo, Andrew},
title = {Database Gyms},
booktitle = {{CIDR} 2023, Conference on Innovative Data Systems Research},
year = {2023},
url = {https://db.cs.cmu.edu/papers/2023/p27-lim.pdf},
}
```

Additionally, please cite any module-specific paper that is relevant to your use.

**Accelerating Training Data Generation**

```
(citation pending)
Boot, appearing at VLDB 2024.
```

**Simultaneously Tuning Multiple Configuration Spaces with Proto Actions**

```
(citation pending)
Proto-X, appearing at VLDB 2024.
```
78 changes: 40 additions & 38 deletions benchmark/tpch/cli.py
@@ -4,7 +4,7 @@

import click

from misc.utils import DBGymConfig, get_scale_factor_string, workload_name_fn
from misc.utils import DBGymConfig, get_scale_factor_string, link_result, workload_name_fn
from util.shell import subprocess_run
from util.pg import *

@@ -21,8 +21,8 @@ def tpch_group(dbgym_cfg: DBGymConfig):
@tpch_group.command(name="data")
@click.argument("scale-factor", type=float)
@click.pass_obj
# The reason generate-data is separate from create-pgdata is because generate-data is generic
# to all DBMSs while create-pgdata is specific to Postgres.
# The reason generate-data is separate from create-dbdata is that generate-data is generic
# to all DBMSs while create-dbdata is specific to a single DBMS.
def tpch_data(dbgym_cfg: DBGymConfig, scale_factor: float):
_clone(dbgym_cfg)
_generate_data(dbgym_cfg, scale_factor)
@@ -56,68 +56,71 @@ def _get_queries_dname(seed: int, scale_factor: float) -> str:


def _clone(dbgym_cfg: DBGymConfig):
symlink_dir = dbgym_cfg.cur_symlinks_build_path("tpch-kit")
if symlink_dir.exists():
benchmark_tpch_logger.info(f"Skipping clone: {symlink_dir}")
expected_symlink_dpath = dbgym_cfg.cur_symlinks_build_path(mkdir=True) / "tpch-kit.link"
if expected_symlink_dpath.exists():
benchmark_tpch_logger.info(f"Skipping clone: {expected_symlink_dpath}")
return

benchmark_tpch_logger.info(f"Cloning: {symlink_dir}")
benchmark_tpch_logger.info(f"Cloning: {expected_symlink_dpath}")
real_build_path = dbgym_cfg.cur_task_runs_build_path()
subprocess_run(
f"./tpch_setup.sh {real_build_path}", cwd=dbgym_cfg.cur_source_path()
)
subprocess_run(
f"ln -s {real_build_path / 'tpch-kit'} {dbgym_cfg.cur_symlinks_build_path(mkdir=True)}"
)
benchmark_tpch_logger.info(f"Cloned: {symlink_dir}")
symlink_dpath = link_result(dbgym_cfg, real_build_path / "tpch-kit")
assert os.path.samefile(expected_symlink_dpath, symlink_dpath)
benchmark_tpch_logger.info(f"Cloned: {expected_symlink_dpath}")


def _generate_queries(dbgym_cfg: DBGymConfig, seed_start: int, seed_end: int, scale_factor: float):
build_path = dbgym_cfg.cur_symlinks_build_path()
assert build_path.exists()
def _get_tpch_kit_dpath(dbgym_cfg: DBGymConfig) -> Path:
tpch_kit_dpath = (dbgym_cfg.cur_symlinks_build_path() / "tpch-kit.link").resolve()
assert tpch_kit_dpath.exists() and tpch_kit_dpath.is_absolute() and not tpch_kit_dpath.is_symlink()
return tpch_kit_dpath


def _generate_queries(dbgym_cfg: DBGymConfig, seed_start: int, seed_end: int, scale_factor: float):
tpch_kit_dpath = _get_tpch_kit_dpath(dbgym_cfg)
data_path = dbgym_cfg.cur_symlinks_data_path(mkdir=True)
benchmark_tpch_logger.info(
f"Generating queries: {data_path} [{seed_start}, {seed_end}]"
)
for seed in range(seed_start, seed_end + 1):
symlinked_seed = data_path / _get_queries_dname(seed, scale_factor)
if symlinked_seed.exists():
expected_queries_symlink_dpath = data_path / (_get_queries_dname(seed, scale_factor) + ".link")
if expected_queries_symlink_dpath.exists():
continue

real_dir = dbgym_cfg.cur_task_runs_data_path(_get_queries_dname(seed, scale_factor), mkdir=True)
for i in range(1, 22 + 1):
target_sql = (real_dir / f"{i}.sql").resolve()
subprocess_run(
f"DSS_QUERY=./queries ./qgen {i} -r {seed} -s {scale_factor} > {target_sql}",
cwd=build_path / "tpch-kit" / "dbgen",
cwd=tpch_kit_dpath / "dbgen",
verbose=False,
)
subprocess_run(f"ln -s {real_dir} {data_path}", verbose=False)
queries_symlink_dpath = link_result(dbgym_cfg, real_dir)
assert os.path.samefile(queries_symlink_dpath, expected_queries_symlink_dpath)
benchmark_tpch_logger.info(
f"Generated queries: {data_path} [{seed_start}, {seed_end}]"
)


def _generate_data(dbgym_cfg: DBGymConfig, scale_factor: float):
build_path = dbgym_cfg.cur_symlinks_build_path()
assert build_path.exists()

tpch_kit_dpath = _get_tpch_kit_dpath(dbgym_cfg)
data_path = dbgym_cfg.cur_symlinks_data_path(mkdir=True)
symlink_dir = data_path / f"tables_sf{get_scale_factor_string(scale_factor)}"
if symlink_dir.exists():
benchmark_tpch_logger.info(f"Skipping generation: {symlink_dir}")
expected_tables_symlink_dpath = data_path / f"tables_sf{get_scale_factor_string(scale_factor)}.link"
if expected_tables_symlink_dpath.exists():
benchmark_tpch_logger.info(f"Skipping generation: {expected_tables_symlink_dpath}")
return

benchmark_tpch_logger.info(f"Generating: {symlink_dir}")
benchmark_tpch_logger.info(f"Generating: {expected_tables_symlink_dpath}")
subprocess_run(
f"./dbgen -vf -s {scale_factor}", cwd=build_path / "tpch-kit" / "dbgen"
f"./dbgen -vf -s {scale_factor}", cwd=tpch_kit_dpath / "dbgen"
)
real_dir = dbgym_cfg.cur_task_runs_data_path(f"tables_sf{get_scale_factor_string(scale_factor)}", mkdir=True)
subprocess_run(f"mv ./*.tbl {real_dir}", cwd=build_path / "tpch-kit" / "dbgen")
subprocess_run(f"mv ./*.tbl {real_dir}", cwd=tpch_kit_dpath / "dbgen")

subprocess_run(f"ln -s {real_dir} {data_path}")
benchmark_tpch_logger.info(f"Generated: {symlink_dir}")
tables_symlink_dpath = link_result(dbgym_cfg, real_dir)
assert os.path.samefile(tables_symlink_dpath, expected_tables_symlink_dpath)
benchmark_tpch_logger.info(f"Generated: {expected_tables_symlink_dpath}")


def _generate_workload(
@@ -129,9 +132,9 @@ def _generate_workload(
):
symlink_data_dir = dbgym_cfg.cur_symlinks_data_path(mkdir=True)
workload_name = workload_name_fn(scale_factor, seed_start, seed_end, query_subset)
workload_symlink_path = symlink_data_dir / workload_name
expected_workload_symlink_dpath = symlink_data_dir / (workload_name + ".link")

benchmark_tpch_logger.info(f"Generating: {workload_symlink_path}")
benchmark_tpch_logger.info(f"Generating: {expected_workload_symlink_dpath}")
real_dir = dbgym_cfg.cur_task_runs_data_path(
workload_name, mkdir=True
)
@@ -147,13 +150,12 @@ def _generate_workload(
with open(real_dir / "order.txt", "w") as f:
for seed in range(seed_start, seed_end + 1):
for qnum in queries:
sqlfile = symlink_data_dir / _get_queries_dname(seed, scale_factor) / f"{qnum}.sql"
assert sqlfile.exists()
output = ",".join([f"S{seed}-Q{qnum}", str(sqlfile)])
sql_fpath = (symlink_data_dir / (_get_queries_dname(seed, scale_factor) + ".link")).resolve() / f"{qnum}.sql"
assert sql_fpath.exists() and not sql_fpath.is_symlink() and sql_fpath.is_absolute(), "We should only write existent real absolute paths to a file"
output = ",".join([f"S{seed}-Q{qnum}", str(sql_fpath)])
print(output, file=f)
# TODO(WAN): add option to deep-copy the workload.

if workload_symlink_path.exists():
os.remove(workload_symlink_path)
subprocess_run(f"ln -s {real_dir} {workload_symlink_path}")
benchmark_tpch_logger.info(f"Generated: {workload_symlink_path}")
workload_symlink_dpath = link_result(dbgym_cfg, real_dir)
assert workload_symlink_dpath == expected_workload_symlink_dpath
benchmark_tpch_logger.info(f"Generated: {expected_workload_symlink_dpath}")
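
The recurring pattern in the rewritten helpers above is: write real outputs under the task-runs tree, publish them through `link_result` as a `<name>.link` symlink, then assert the returned path matches the expected one. A hypothetical reconstruction of `link_result` (the real helper lives in `misc/utils.py`; the run-directory-to-symlink-directory mapping here is an assumption):

```
from pathlib import Path

# Hypothetical sketch of link_result: publish real_dpath under the symlinks
# tree as "<name>.link" and return the symlink's path. The choice of
# cur_symlinks_data_path as the destination is an assumption; the real
# helper presumably derives it from where real_dpath lives.
def link_result(dbgym_cfg, real_dpath: Path) -> Path:
    symlink_dir = dbgym_cfg.cur_symlinks_data_path(mkdir=True)
    symlink_dpath = symlink_dir / (real_dpath.name + ".link")
    if symlink_dpath.is_symlink() or symlink_dpath.exists():
        symlink_dpath.unlink()
    symlink_dpath.symlink_to(real_dpath.resolve())
    return symlink_dpath
```

Consumers then `.resolve()` the `.link` path and assert the result is an existent, absolute, non-symlink path, as `_get_tpch_kit_dpath` does.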
15 changes: 7 additions & 8 deletions benchmark/tpch/load_info.py
@@ -1,5 +1,5 @@
from dbms.load_info_base_class import LoadInfoBaseClass
from misc.utils import get_scale_factor_string
from misc.utils import DBGymConfig, get_scale_factor_string


TPCH_SCHEMA_FNAME = "tpch_schema.sql"
@@ -22,7 +22,7 @@ class TpchLoadInfo(LoadInfoBaseClass):
"lineitem",
]

def __init__(self, dbgym_cfg, scale_factor):
def __init__(self, dbgym_cfg: DBGymConfig, scale_factor: float):
# schema and constraints
schema_root_dpath = dbgym_cfg.dbgym_repo_path
for component in TpchLoadInfo.CODEBASE_PATH_COMPONENTS[
@@ -39,13 +39,12 @@ def __init__(self, dbgym_cfg, scale_factor):
), f"self._constraints_fpath ({self._constraints_fpath}) does not exist"

# tables
data_root_dpath = (
dbgym_cfg.dbgym_symlinks_path / TpchLoadInfo.CODEBASE_DNAME / "data"
)
tables_dpath = data_root_dpath / f"tables_sf{get_scale_factor_string(scale_factor)}"
data_root_dpath = dbgym_cfg.dbgym_symlinks_path / TpchLoadInfo.CODEBASE_DNAME / "data"
tables_symlink_dpath = data_root_dpath / f"tables_sf{get_scale_factor_string(scale_factor)}.link"
tables_dpath = tables_symlink_dpath.resolve()
assert (
tables_dpath.exists()
), f"tables_dpath ({tables_dpath}) does not exist. Make sure you have generated the TPC-H data"
tables_dpath.exists() and tables_dpath.is_absolute() and not tables_dpath.is_symlink()
), f"tables_dpath ({tables_dpath}) should be an existent real absolute path. Make sure you have generated the TPC-H data"
self._tables_and_fpaths = []
for table in TpchLoadInfo.TABLES:
table_fpath = tables_dpath / f"{table}.tbl"
47 changes: 24 additions & 23 deletions benchmark/tpch/tpch_constraints.sql
@@ -7,26 +7,27 @@ ALTER TABLE orders ADD CONSTRAINT orders_o_custkey_fkey FOREIGN KEY (o_custkey) REFERENCES customer (c_custkey) ON DELETE CASCADE;
ALTER TABLE lineitem ADD CONSTRAINT lineitem_l_orderkey_fkey FOREIGN KEY (l_orderkey) REFERENCES orders (o_orderkey) ON DELETE CASCADE;
ALTER TABLE lineitem ADD CONSTRAINT lineitem_l_partkey_l_suppkey_fkey FOREIGN KEY (l_partkey, l_suppkey) REFERENCES partsupp (ps_partkey, ps_suppkey) ON DELETE CASCADE;

CREATE UNIQUE INDEX r_rk ON region (r_regionkey ASC);
CREATE UNIQUE INDEX n_nk ON nation (n_nationkey ASC);
CREATE INDEX n_rk ON nation (n_regionkey ASC);
CREATE UNIQUE INDEX p_pk ON part (p_partkey ASC);
CREATE UNIQUE INDEX s_sk ON supplier (s_suppkey ASC);
CREATE INDEX s_nk ON supplier (s_nationkey ASC);
CREATE INDEX ps_pk ON partsupp (ps_partkey ASC);
CREATE INDEX ps_sk ON partsupp (ps_suppkey ASC);
CREATE UNIQUE INDEX ps_pk_sk ON partsupp (ps_partkey ASC, ps_suppkey ASC);
CREATE UNIQUE INDEX ps_sk_pk ON partsupp (ps_suppkey ASC, ps_partkey ASC);
CREATE UNIQUE INDEX c_ck ON customer (c_custkey ASC);
CREATE INDEX c_nk ON customer (c_nationkey ASC);
CREATE UNIQUE INDEX o_ok ON orders (o_orderkey ASC);
CREATE INDEX o_ck ON orders (o_custkey ASC);
CREATE INDEX o_od ON orders (o_orderdate ASC);
CREATE INDEX l_ok ON lineitem (l_orderkey ASC);
CREATE INDEX l_pk ON lineitem (l_partkey ASC);
CREATE INDEX l_sk ON lineitem (l_suppkey ASC);
CREATE INDEX l_sd ON lineitem (l_shipdate ASC);
CREATE INDEX l_cd ON lineitem (l_commitdate ASC);
CREATE INDEX l_rd ON lineitem (l_receiptdate ASC);
CREATE INDEX l_pk_sk ON lineitem (l_partkey ASC, l_suppkey ASC);
CREATE INDEX l_sk_pk ON lineitem (l_suppkey ASC, l_partkey ASC);
-- We don't create any indexes so that there's a clean slate for tuning
-- CREATE UNIQUE INDEX r_rk ON region (r_regionkey ASC);
-- CREATE UNIQUE INDEX n_nk ON nation (n_nationkey ASC);
-- CREATE INDEX n_rk ON nation (n_regionkey ASC);
-- CREATE UNIQUE INDEX p_pk ON part (p_partkey ASC);
-- CREATE UNIQUE INDEX s_sk ON supplier (s_suppkey ASC);
-- CREATE INDEX s_nk ON supplier (s_nationkey ASC);
-- CREATE INDEX ps_pk ON partsupp (ps_partkey ASC);
-- CREATE INDEX ps_sk ON partsupp (ps_suppkey ASC);
-- CREATE UNIQUE INDEX ps_pk_sk ON partsupp (ps_partkey ASC, ps_suppkey ASC);
-- CREATE UNIQUE INDEX ps_sk_pk ON partsupp (ps_suppkey ASC, ps_partkey ASC);
-- CREATE UNIQUE INDEX c_ck ON customer (c_custkey ASC);
-- CREATE INDEX c_nk ON customer (c_nationkey ASC);
-- CREATE UNIQUE INDEX o_ok ON orders (o_orderkey ASC);
-- CREATE INDEX o_ck ON orders (o_custkey ASC);
-- CREATE INDEX o_od ON orders (o_orderdate ASC);
-- CREATE INDEX l_ok ON lineitem (l_orderkey ASC);
-- CREATE INDEX l_pk ON lineitem (l_partkey ASC);
-- CREATE INDEX l_sk ON lineitem (l_suppkey ASC);
-- CREATE INDEX l_sd ON lineitem (l_shipdate ASC);
-- CREATE INDEX l_cd ON lineitem (l_commitdate ASC);
-- CREATE INDEX l_rd ON lineitem (l_receiptdate ASC);
-- CREATE INDEX l_pk_sk ON lineitem (l_partkey ASC, l_suppkey ASC);
-- CREATE INDEX l_sk_pk ON lineitem (l_suppkey ASC, l_partkey ASC);
18 changes: 9 additions & 9 deletions dbms/postgres/build_repo.sh
@@ -4,34 +4,34 @@ set -euxo pipefail

REPO_REAL_PARENT_DPATH="$1"

# download and make postgres from the boot repository
# Download and make postgres from the boot repository.
mkdir -p "${REPO_REAL_PARENT_DPATH}"
cd "${REPO_REAL_PARENT_DPATH}"
git clone git@github.com:lmwnshn/boot.git --single-branch --branch boot --depth 1
git clone git@github.com:lmwnshn/boot.git --single-branch --branch vldb_2024 --depth 1
cd ./boot
./cmudb/build/configure.sh release "${REPO_REAL_PARENT_DPATH}/boot/build/postgres"
make clean
make install-world-bin -j4

# download and make bytejack
cd ./cmudb/extension/bytejack_rs/
# Download and make boot.
cd ./cmudb/extension/boot_rs/
cargo build --release
cbindgen . -o target/bytejack_rs.h --lang c
cbindgen . -o target/boot_rs.h --lang c
cd "${REPO_REAL_PARENT_DPATH}/boot"

cd ./cmudb/extension/bytejack/
cd ./cmudb/extension/boot/
make clean
make install -j
cd "${REPO_REAL_PARENT_DPATH}/boot"

# download and make hypopg
# Download and make hypopg.
git clone git@github.com:HypoPG/hypopg.git
cd ./hypopg
PG_CONFIG="${REPO_REAL_PARENT_DPATH}/boot/build/postgres/bin/pg_config" make install
cd "${REPO_REAL_PARENT_DPATH}/boot"

# download and make pg_hint_plan
# we need -L to follow links
# Download and make pg_hint_plan.
# We need -L to follow links.
curl -L https://github.com/ossc-db/pg_hint_plan/archive/refs/tags/REL15_1_5_1.tar.gz -o REL15_1_5_1.tar.gz
tar -xzf REL15_1_5_1.tar.gz
rm REL15_1_5_1.tar.gz