Add README. (#33)
**Summary**: Wrote **Quickstart** and **Overview** sections. Go
[here](https://github.com/wangpatrick57/dbgym/tree/readme) to see the README
as rendered on GitHub.

**Details**:
* **Overview** summarizes the research motivation behind the project,
giving background as necessary.
* **Quickstart** gives a single shell script which compiles Postgres
with Boot, generates data, builds a Proto-X embedding, and trains a
Proto-X agent.
* I renamed all occurrences of "pgdata" to "dbdata" to match the
project's vision of working for multiple DBMSs (as described in the
README).
* I removed the startup check.
* I got rid of the `ssd_checker` dependency since it's a very small
repository, replacing it with an `is_ssd` helper in `misc.utils` (see the
sketch after this list).
* Fixed Postgres compilation code to work with the new `vldb_2024`
branch of Boot.
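
A minimal sketch of what an inlined SSD check might look like on Linux. The
actual `is_ssd` in `misc.utils` is not shown in this diff, so the logic below
is an assumption, not the committed implementation:

```
import os
import pathlib

def is_ssd(path: pathlib.Path) -> bool:
    # Map the path to its backing block device via sysfs, then read the
    # kernel's rotational flag (0 = SSD, 1 = spinning disk).
    dev = os.stat(path).st_dev
    sys_path = pathlib.Path(f"/sys/dev/block/{os.major(dev)}:{os.minor(dev)}").resolve()
    for rotational in (
        sys_path / "queue" / "rotational",         # whole device
        sys_path.parent / "queue" / "rotational",  # partition -> parent device
    ):
        if rotational.exists():
            return rotational.read_text().strip() == "0"
    return False  # unknown layout (e.g., device mapper); assume not SSD
```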

---------

Co-authored-by: Wan Shen Lim <wanshen.lim@gmail.com>
wangpatrick57 and lmwnshn committed Jul 7, 2024
1 parent 3aecdd1 commit d5cc4c2
Showing 22 changed files with 387 additions and 284 deletions.
87 changes: 86 additions & 1 deletion README.md
@@ -1 +1,86 @@
# Database Gym
# 🛢️ Database Gym 🏋️
[\[Slides\]](http://www.cidrdb.org/cidr2023/slides/p27-lim-slides.pdf) [\[Paper\]](https://www.cidrdb.org/cidr2023/papers/p27-lim.pdf)

*An end-to-end research vehicle for the field of self-driving DBMSs.*

## Quickstart

These steps were tested on a fresh clone of the repository on Ubuntu 22.04.

```
# Setup dependencies.
# You may want to create a Python virtual environment (e.g. with conda) before doing this.
./dependency/install_dependencies.sh
# Compile a custom fork of PostgreSQL, load TPC-H (SF 0.01), train the Proto-X agent, and tune.
./scripts/quickstart.sh postgres tpch 0.01 protox
```

## Overview

Autonomous DBMS research often involves more engineering than research.
As new advances are made to the state of the art, it is common to find that each new system has
reimplemented the database tuning pipeline from scratch: workload capture, database setup,
training data collection, model creation, model deployment, and more.
Moreover, these bespoke pipelines make it difficult to combine different techniques even when they
should be independent (e.g., using a different operator latency model in a tuning algorithm).

The database gym project is our attempt at standardizing the APIs between these disparate tasks,
allowing researchers to mix and match different pipeline components.
It draws inspiration from the Farama Foundation's Gymnasium (formerly OpenAI Gym), which
accelerates the development and comparison of reinforcement learning algorithms by providing a set
of agents, environments, and a standardized API for communicating between them.
Through the database gym, we hope to save other people time and reimplementation effort by
providing an extensible open-source platform for autonomous DBMS research.
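
For readers unfamiliar with that API, here is a minimal Gymnasium loop. This
exercises Gymnasium itself, not the database gym; it is included only to
illustrate the agent/environment interface the project draws from:

```
import gymnasium as gym

env = gym.make("CartPole-v1")
obs, info = env.reset(seed=42)
for _ in range(100):
    action = env.action_space.sample()  # a real agent would pick an action here
    obs, reward, terminated, truncated, info = env.step(action)
    if terminated or truncated:
        obs, info = env.reset()
env.close()
```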

This project is under active development.
Currently, we decompose the database tuning pipeline into the following components:

1. Workload: collection, forecasting, synthesis
2. Database: database loading, instrumentation, orchestrating workload execution
3. Agent: identifying tuning actions, suggesting an action

## Repository Structure

`task.py` is the entrypoint for all tasks.
The tasks are grouped into categories that correspond to the top-level directories of the repository:

- `benchmark` - tasks to generate data and queries for different benchmarks (e.g., TPC-H, JOB)
- `dbms` - tasks to build and start DBMSs (e.g., PostgreSQL)
- `tune` - tasks to train autonomous database tuning agents
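
For example, based on the `click` command groups touched in this commit,
task invocations plausibly look like the following (a sketch; the exact
subcommand spellings are inferred from the code rather than documented):

```
# Generate TPC-H data at scale factor 0.01.
python task.py benchmark tpch data 0.01

# Download and build Postgres and its extensions (does not create dbdata).
python task.py dbms postgres build

# Pack a dbdata .tgz for TPC-H, asserting that it lives on an SSD.
python task.py dbms postgres dbdata tpch --scale-factor 0.01 --intended-dbdata-hardware ssd
```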

## Credits

The Database Gym project rose from the ashes of the [NoisePage](https://db.cs.cmu.edu/projects/noisepage/) self-driving DBMS project.

The first prototype was written by [Patrick Wang](https://github.com/wangpatrick57), integrating [Boot (VLDB 2024)](https://github.com/lmwnshn/boot) and [Proto-X (VLDB 2024)](https://github.com/17zhangw/protox) into a cohesive system.

## Citing This Repository

If you use this repository in an academic paper, please cite:

```
@inproceedings{lim23,
  author    = {Lim, Wan Shen and Butrovich, Matthew and Zhang, William and Crotty, Andrew and Ma, Lin and Xu, Peijing and Gehrke, Johannes and Pavlo, Andrew},
  title     = {Database Gyms},
  booktitle = {{CIDR} 2023, Conference on Innovative Data Systems Research},
  year      = {2023},
  url       = {https://db.cs.cmu.edu/papers/2023/p27-lim.pdf},
}
```

Additionally, please cite any module-specific paper that is relevant to your use.

**Accelerating Training Data Generation**

```
(citation pending)
Boot, appearing at VLDB 2024.
```

**Simultaneously Tuning Multiple Configuration Spaces with Proto Actions**

```
(citation pending)
Proto-X, appearing at VLDB 2024.
```
4 changes: 2 additions & 2 deletions benchmark/tpch/cli.py
@@ -21,8 +21,8 @@ def tpch_group(dbgym_cfg: DBGymConfig):
@tpch_group.command(name="data")
@click.argument("scale-factor", type=float)
@click.pass_obj
# The reason generate-data is separate from create-pgdata is because generate-data is generic
# to all DBMSs while create-pgdata is specific to Postgres.
# The reason generate data is separate from create dbdata is because generate-data is generic
# to all DBMSs while create dbdata is specific to a single DBMS.
def tpch_data(dbgym_cfg: DBGymConfig, scale_factor: float):
_clone(dbgym_cfg)
_generate_data(dbgym_cfg, scale_factor)
18 changes: 9 additions & 9 deletions dbms/postgres/build_repo.sh
@@ -4,34 +4,34 @@ set -euxo pipefail

REPO_REAL_PARENT_DPATH="$1"

# download and make postgres from the boot repository
# Download and make postgres from the boot repository.
mkdir -p "${REPO_REAL_PARENT_DPATH}"
cd "${REPO_REAL_PARENT_DPATH}"
git clone git@github.com:lmwnshn/boot.git --single-branch --branch boot --depth 1
git clone git@github.com:lmwnshn/boot.git --single-branch --branch vldb_2024 --depth 1
cd ./boot
./cmudb/build/configure.sh release "${REPO_REAL_PARENT_DPATH}/boot/build/postgres"
make clean
make install-world-bin -j4

# download and make bytejack
cd ./cmudb/extension/bytejack_rs/
# Download and make boot.
cd ./cmudb/extension/boot_rs/
cargo build --release
cbindgen . -o target/bytejack_rs.h --lang c
cbindgen . -o target/boot_rs.h --lang c
cd "${REPO_REAL_PARENT_DPATH}/boot"

cd ./cmudb/extension/bytejack/
cd ./cmudb/extension/boot/
make clean
make install -j
cd "${REPO_REAL_PARENT_DPATH}/boot"

# download and make hypopg
# Download and make hypopg.
git clone git@github.com:HypoPG/hypopg.git
cd ./hypopg
PG_CONFIG="${REPO_REAL_PARENT_DPATH}/boot/build/postgres/bin/pg_config" make install
cd "${REPO_REAL_PARENT_DPATH}/boot"

# download and make pg_hint_plan
# we need -L to follow links
# Download and make pg_hint_plan.
# We need -L to follow links.
curl -L https://github.com/ossc-db/pg_hint_plan/archive/refs/tags/REL15_1_5_1.tar.gz -o REL15_1_5_1.tar.gz
tar -xzf REL15_1_5_1.tar.gz
rm REL15_1_5_1.tar.gz
117 changes: 58 additions & 59 deletions dbms/postgres/cli.py
@@ -1,5 +1,5 @@
"""
At a high level, this file's goal is to (1) install+build postgres and (2) create pgdata.
At a high level, this file's goal is to (1) build postgres and (2) create dbdata (aka pgdata).
On the other hand, the goal of tune.protox.env.util.postgres is to provide helpers to manage
a Postgres instance during agent tuning.
util.pg provides helpers used by *both* of the above files (as well as other files).
@@ -10,11 +10,10 @@
import subprocess
from pathlib import Path
import click
import ssd_checker

from benchmark.tpch.load_info import TpchLoadInfo
from dbms.load_info_base_class import LoadInfoBaseClass
from misc.utils import DBGymConfig, conv_inputpath_to_realabspath, link_result, open_and_save, save_file, get_pgdata_tgz_name, default_pgbin_path, WORKSPACE_PATH_PLACEHOLDER, default_pgdata_parent_dpath
from misc.utils import DBGymConfig, conv_inputpath_to_realabspath, link_result, open_and_save, save_file, get_dbdata_tgz_name, default_pgbin_path, WORKSPACE_PATH_PLACEHOLDER, default_dbdata_parent_dpath, is_ssd
from util.shell import subprocess_run
from sqlalchemy import Connection
from util.pg import SHARED_PRELOAD_LIBRARIES, conn_execute, sql_file_execute, DBGYM_POSTGRES_DBNAME, create_conn, DEFAULT_POSTGRES_PORT, DBGYM_POSTGRES_USER, DBGYM_POSTGRES_PASS, DEFAULT_POSTGRES_DBNAME
@@ -32,7 +31,7 @@ def postgres_group(dbgym_cfg: DBGymConfig):

@postgres_group.command(
name="build",
help="Download and build the Postgres repository and all necessary extensions/shared libraries. Does not create pgdata.",
help="Download and build the Postgres repository and all necessary extensions/shared libraries. Does not create dbdata.",
)
@click.pass_obj
@click.option("--rebuild", is_flag=True, help="Include this flag to rebuild Postgres even if it already exists.")
@@ -41,46 +40,46 @@ def postgres_build(dbgym_cfg: DBGymConfig, rebuild: bool):


@postgres_group.command(
name="pgdata",
help="Build a .tgz file of pgdata with various specifications for its contents.",
name="dbdata",
help="Build a .tgz file of dbdata with various specifications for its contents.",
)
@click.pass_obj
@click.argument("benchmark_name", type=str)
@click.option("--scale-factor", type=float, default=1)
@click.option("--pgbin-path", type=Path, default=None, help=f"The path to the bin containing Postgres executables. The default is {default_pgbin_path(WORKSPACE_PATH_PLACEHOLDER)}.")
@click.option(
"--intended-pgdata-hardware",
"--intended-dbdata-hardware",
type=click.Choice(["hdd", "ssd"]),
default="hdd",
help=f"The intended hardware pgdata should be on. Used as a sanity check for --pgdata-parent-dpath.",
help=f"The intended hardware dbdata should be on. Used as a sanity check for --dbdata-parent-dpath.",
)
@click.option(
"--pgdata-parent-dpath",
"--dbdata-parent-dpath",
default=None,
type=Path,
help=f"The path to the parent directory of the pgdata which will be actively tuned. The default is {default_pgdata_parent_dpath(WORKSPACE_PATH_PLACEHOLDER)}.",
help=f"The path to the parent directory of the dbdata which will be actively tuned. The default is {default_dbdata_parent_dpath(WORKSPACE_PATH_PLACEHOLDER)}.",
)
def postgres_pgdata(dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float, pgbin_path: Path, intended_pgdata_hardware: str, pgdata_parent_dpath: Path):
def postgres_dbdata(dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float, pgbin_path: Path, intended_dbdata_hardware: str, dbdata_parent_dpath: Path):
# Set args to defaults programmatically (do this before doing anything else in the function)
if pgbin_path == None:
pgbin_path = default_pgbin_path(dbgym_cfg.dbgym_workspace_path)
if pgdata_parent_dpath == None:
pgdata_parent_dpath = default_pgdata_parent_dpath(dbgym_cfg.dbgym_workspace_path)
if dbdata_parent_dpath == None:
dbdata_parent_dpath = default_dbdata_parent_dpath(dbgym_cfg.dbgym_workspace_path)

# Convert all input paths to absolute paths
pgbin_path = conv_inputpath_to_realabspath(dbgym_cfg, pgbin_path)
pgdata_parent_dpath = conv_inputpath_to_realabspath(dbgym_cfg, pgdata_parent_dpath)
dbdata_parent_dpath = conv_inputpath_to_realabspath(dbgym_cfg, dbdata_parent_dpath)

# Check assertions on args
if intended_pgdata_hardware == "hdd":
assert not ssd_checker.is_ssd(pgdata_parent_dpath), f"Intended hardware is HDD but pgdata_parent_dpath ({pgdata_parent_dpath}) is an SSD"
elif intended_pgdata_hardware == "ssd":
assert ssd_checker.is_ssd(pgdata_parent_dpath), f"Intended hardware is SSD but pgdata_parent_dpath ({pgdata_parent_dpath}) is an HDD"
if intended_dbdata_hardware == "hdd":
assert not is_ssd(dbdata_parent_dpath), f"Intended hardware is HDD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an SSD"
elif intended_dbdata_hardware == "ssd":
assert is_ssd(dbdata_parent_dpath), f"Intended hardware is SSD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an HDD"
else:
assert False

# Create pgdata
_create_pgdata(dbgym_cfg, benchmark_name, scale_factor, pgbin_path, pgdata_parent_dpath)
# Create dbdata
_create_dbdata(dbgym_cfg, benchmark_name, scale_factor, pgbin_path, dbdata_parent_dpath)


def _get_pgbin_symlink_path(dbgym_cfg: DBGymConfig) -> Path:
@@ -109,52 +108,52 @@ def _build_repo(dbgym_cfg: DBGymConfig, rebuild):
dbms_postgres_logger.info(f"Set up repo in {expected_repo_symlink_dpath}")


def _create_pgdata(dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float, pgbin_path: Path, pgdata_parent_dpath: Path) -> None:
def _create_dbdata(dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float, pgbin_path: Path, dbdata_parent_dpath: Path) -> None:
"""
I chose *not* for this function to skip by default if pgdata_tgz_symlink_path already exists. This
I chose *not* for this function to skip by default if dbdata_tgz_symlink_path already exists. This
is because, while the generated data is deterministic given benchmark_name and scale_factor, any
change in the _create_pgdata() function would result in a different pgdata. Since _create_pgdata()
change in the _create_dbdata() function would result in a different dbdata. Since _create_dbdata()
may change somewhat frequently, I decided to get rid of the footgun of having changes to
_create_pgdata() not propagate to [pgdata].tgz by default.
_create_dbdata() not propagate to [dbdata].tgz by default.
"""

# It's ok for the pgdata/ directory to be temporary. It just matters that the .tgz is saved in a safe place.
pgdata_dpath = pgdata_parent_dpath / "pgdata_being_created"
# We might be reusing the same pgdata_parent_dpath, so delete pgdata_dpath if it already exists
if pgdata_dpath.exists():
shutil.rmtree(pgdata_dpath)
# It's ok for the dbdata/ directory to be temporary. It just matters that the .tgz is saved in a safe place.
dbdata_dpath = dbdata_parent_dpath / "dbdata_being_created"
# We might be reusing the same dbdata_parent_dpath, so delete dbdata_dpath if it already exists
if dbdata_dpath.exists():
shutil.rmtree(dbdata_dpath)

# Call initdb.
# Save any script we call from pgbin_symlink_dpath because they are dependencies generated from another task run.
save_file(dbgym_cfg, pgbin_path / "initdb")
subprocess_run(f'./initdb -D "{pgdata_dpath}"', cwd=pgbin_path)
subprocess_run(f'./initdb -D "{dbdata_dpath}"', cwd=pgbin_path)

# Start Postgres (all other pgdata setup requires postgres to be started).
# Start Postgres (all other dbdata setup requires postgres to be started).
# Note that subprocess_run() never returns when running "pg_ctl start", so I'm using subprocess.run() instead.
start_postgres(dbgym_cfg, pgbin_path, pgdata_dpath)
start_postgres(dbgym_cfg, pgbin_path, dbdata_dpath)

# Set up Postgres.
_generic_pgdata_setup(dbgym_cfg)
_load_benchmark_into_pgdata(dbgym_cfg, benchmark_name, scale_factor)
_generic_dbdata_setup(dbgym_cfg)
_load_benchmark_into_dbdata(dbgym_cfg, benchmark_name, scale_factor)

# Stop Postgres so that we don't "leak" processes.
stop_postgres(dbgym_cfg, pgbin_path, pgdata_dpath)
stop_postgres(dbgym_cfg, pgbin_path, dbdata_dpath)

# Create .tgz file.
# Note that you can't pass "[pgdata].tgz" as an arg to cur_task_runs_data_path() because that would create "[pgdata].tgz" as a dir.
pgdata_tgz_real_fpath = dbgym_cfg.cur_task_runs_data_path(
# Note that you can't pass "[dbdata].tgz" as an arg to cur_task_runs_data_path() because that would create "[dbdata].tgz" as a dir.
dbdata_tgz_real_fpath = dbgym_cfg.cur_task_runs_data_path(
mkdir=True
) / get_pgdata_tgz_name(benchmark_name, scale_factor)
# We need to cd into pgdata_dpath so that the tar file does not contain folders for the whole path of pgdata_dpath.
subprocess_run(f"tar -czf {pgdata_tgz_real_fpath} .", cwd=pgdata_dpath)
) / get_dbdata_tgz_name(benchmark_name, scale_factor)
# We need to cd into dbdata_dpath so that the tar file does not contain folders for the whole path of dbdata_dpath.
subprocess_run(f"tar -czf {dbdata_tgz_real_fpath} .", cwd=dbdata_dpath)

# Create symlink.
# Only link at the end so that the link only ever points to a complete pgdata.
pgdata_tgz_symlink_path = link_result(dbgym_cfg, pgdata_tgz_real_fpath)
dbms_postgres_logger.info(f"Created pgdata in {pgdata_tgz_symlink_path}")
# Only link at the end so that the link only ever points to a complete dbdata.
dbdata_tgz_symlink_path = link_result(dbgym_cfg, dbdata_tgz_real_fpath)
dbms_postgres_logger.info(f"Created dbdata in {dbdata_tgz_symlink_path}")


def _generic_pgdata_setup(dbgym_cfg: DBGymConfig):
def _generic_dbdata_setup(dbgym_cfg: DBGymConfig):
# get necessary vars
pgbin_real_dpath = _get_pgbin_symlink_path(dbgym_cfg).resolve()
assert pgbin_real_dpath.exists()
@@ -182,29 +181,29 @@ def _generic_pgdata_setup(dbgym_cfg: DBGymConfig):
cwd=pgbin_real_dpath,
)

# Create the dbgym database. since one pgdata dir maps to one benchmark, all benchmarks will use the same database
# as opposed to using databases named after the benchmark
# Create the dbgym database. Since one dbdata dir maps to one benchmark, all benchmarks will use the same database
# as opposed to using databases named after the benchmark.
subprocess_run(
f"./psql -c \"create database {DBGYM_POSTGRES_DBNAME} with owner = '{dbgym_pguser}'\" {DEFAULT_POSTGRES_DBNAME} -p {pgport} -h localhost",
cwd=pgbin_real_dpath,
)


def _load_benchmark_into_pgdata(
def _load_benchmark_into_dbdata(
dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float
):
with create_conn(use_psycopg=False) as conn:
if benchmark_name == "tpch":
load_info = TpchLoadInfo(dbgym_cfg, scale_factor)
else:
raise AssertionError(
f"_load_benchmark_into_pgdata(): the benchmark of name {benchmark_name} is not implemented"
f"_load_benchmark_into_dbdata(): the benchmark of name {benchmark_name} is not implemented"
)

_load_into_pgdata(dbgym_cfg, conn, load_info)
_load_into_dbdata(dbgym_cfg, conn, load_info)


def _load_into_pgdata(dbgym_cfg: DBGymConfig, conn: Connection, load_info: LoadInfoBaseClass):
def _load_into_dbdata(dbgym_cfg: DBGymConfig, conn: Connection, load_info: LoadInfoBaseClass):
sql_file_execute(dbgym_cfg, conn, load_info.get_schema_fpath())

# truncate all tables first before even loading a single one
@@ -223,29 +222,29 @@ def _load_into_pgdata(dbgym_cfg: DBGymConfig, conn: Connection, load_info: LoadInfoBaseClass):
sql_file_execute(dbgym_cfg, conn, constraints_fpath)


def start_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, pgdata_dpath: Path) -> None:
_start_or_stop_postgres(dbgym_cfg, pgbin_path, pgdata_dpath, True)
def start_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path) -> None:
_start_or_stop_postgres(dbgym_cfg, pgbin_path, dbdata_dpath, True)


def stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, pgdata_dpath: Path) -> None:
_start_or_stop_postgres(dbgym_cfg, pgbin_path, pgdata_dpath, False)
def stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path) -> None:
_start_or_stop_postgres(dbgym_cfg, pgbin_path, dbdata_dpath, False)


def _start_or_stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, pgdata_dpath: Path, is_start: bool) -> None:
def _start_or_stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path, is_start: bool) -> None:
# They should be absolute paths and should exist
assert pgbin_path.is_absolute() and pgbin_path.exists()
assert pgdata_dpath.is_absolute() and pgdata_dpath.exists()
assert dbdata_dpath.is_absolute() and dbdata_dpath.exists()
# The inputs may be symlinks so we need to resolve them first
pgbin_real_dpath = pgbin_path.resolve()
pgdata_dpath = pgdata_dpath.resolve()
dbdata_dpath = dbdata_dpath.resolve()
pgport = DEFAULT_POSTGRES_PORT
save_file(dbgym_cfg, pgbin_real_dpath / "pg_ctl")

if is_start:
# We use subprocess.run() because subprocess_run() never returns when running "pg_ctl start".
# The reason subprocess_run() never returns is because pg_ctl spawns a postgres process so .poll() always returns None.
# On the other hand, subprocess.run() does return normally, like calling `./pg_ctl` on the command line would do.
result = subprocess.run(f"./pg_ctl -D \"{pgdata_dpath}\" -o '-p {pgport}' start", cwd=pgbin_real_dpath, shell=True)
result = subprocess.run(f"./pg_ctl -D \"{dbdata_dpath}\" -o '-p {pgport}' start", cwd=pgbin_real_dpath, shell=True)
result.check_returncode()
else:
subprocess_run(f"./pg_ctl -D \"{pgdata_dpath}\" -o '-p {pgport}' stop", cwd=pgbin_real_dpath)
subprocess_run(f"./pg_ctl -D \"{dbdata_dpath}\" -o '-p {pgport}' stop", cwd=pgbin_real_dpath)
