Commit c45c4b6
replace (almost) all occurrences of pgdata with dbdata
wangpatrick57 committed Jul 7, 2024
1 parent dfad09a commit c45c4b6
Showing 17 changed files with 204 additions and 210 deletions.
README.md (2 changes: 1 addition & 1 deletion)

@@ -11,7 +11,7 @@ These steps were tested on a fresh repository clone, Ubuntu ??.04.
 ./dependency/install_dependencies.sh
 # Compile a custom fork of PostgreSQL, load TPC-H, train the Proto-X agent, and tune.
-./scripts/quickstart.sh postgres path/to/put/pgdata/in tpch 0.01 protox
+./scripts/quickstart.sh postgres dir/to/put/dbdata/in/ tpch 0.01 protox
 ```

 ## Overview
benchmark/tpch/cli.py (4 changes: 2 additions & 2 deletions)

@@ -21,8 +21,8 @@ def tpch_group(dbgym_cfg: DBGymConfig):
 @tpch_group.command(name="data")
 @click.argument("scale-factor", type=float)
 @click.pass_obj
-# The reason generate-data is separate from create-pgdata is because generate-data is generic
-# to all DBMSs while create-pgdata is specific to Postgres.
+# The reason generate data is separate from create dbdata is because generate-data is generic
+# to all DBMSs while create dbdata is specific to a single DBMS.
 def tpch_data(dbgym_cfg: DBGymConfig, scale_factor: float):
     _clone(dbgym_cfg)
     _generate_data(dbgym_cfg, scale_factor)
dbms/postgres/cli.py (116 changes: 58 additions & 58 deletions)

@@ -1,5 +1,5 @@
 """
-At a high level, this file's goal is to (1) install+build postgres and (2) create pgdata.
+At a high level, this file's goal is to (1) build postgres and (2) create dbdata (aka pgdata).
 On the other hand, the goal of tune.protox.env.util.postgres is to provide helpers to manage
 a Postgres instance during agent tuning.
 util.pg provides helpers used by *both* of the above files (as well as other files).
@@ -13,7 +13,7 @@

 from benchmark.tpch.load_info import TpchLoadInfo
 from dbms.load_info_base_class import LoadInfoBaseClass
-from misc.utils import DBGymConfig, conv_inputpath_to_realabspath, link_result, open_and_save, save_file, get_pgdata_tgz_name, default_pgbin_path, WORKSPACE_PATH_PLACEHOLDER, default_pgdata_parent_dpath, is_ssd
+from misc.utils import DBGymConfig, conv_inputpath_to_realabspath, link_result, open_and_save, save_file, get_dbdata_tgz_name, default_pgbin_path, WORKSPACE_PATH_PLACEHOLDER, default_dbdata_parent_dpath, is_ssd
 from util.shell import subprocess_run
 from sqlalchemy import Connection
 from util.pg import SHARED_PRELOAD_LIBRARIES, conn_execute, sql_file_execute, DBGYM_POSTGRES_DBNAME, create_conn, DEFAULT_POSTGRES_PORT, DBGYM_POSTGRES_USER, DBGYM_POSTGRES_PASS, DEFAULT_POSTGRES_DBNAME
@@ -31,7 +31,7 @@ def postgres_group(dbgym_cfg: DBGymConfig):

 @postgres_group.command(
     name="build",
-    help="Download and build the Postgres repository and all necessary extensions/shared libraries. Does not create pgdata.",
+    help="Download and build the Postgres repository and all necessary extensions/shared libraries. Does not create dbdata.",
 )
 @click.pass_obj
 @click.option("--rebuild", is_flag=True, help="Include this flag to rebuild Postgres even if it already exists.")
@@ -40,46 +40,46 @@ def postgres_build(dbgym_cfg: DBGymConfig, rebuild: bool):


 @postgres_group.command(
-    name="pgdata",
-    help="Build a .tgz file of pgdata with various specifications for its contents.",
+    name="dbdata",
+    help="Build a .tgz file of dbdata with various specifications for its contents.",
 )
 @click.pass_obj
 @click.argument("benchmark_name", type=str)
 @click.option("--scale-factor", type=float, default=1)
 @click.option("--pgbin-path", type=Path, default=None, help=f"The path to the bin containing Postgres executables. The default is {default_pgbin_path(WORKSPACE_PATH_PLACEHOLDER)}.")
 @click.option(
-    "--intended-pgdata-hardware",
+    "--intended-dbdata-hardware",
     type=click.Choice(["hdd", "ssd"]),
     default="hdd",
-    help=f"The intended hardware pgdata should be on. Used as a sanity check for --pgdata-parent-dpath.",
+    help=f"The intended hardware dbdata should be on. Used as a sanity check for --dbdata-parent-dpath.",
 )
 @click.option(
-    "--pgdata-parent-dpath",
+    "--dbdata-parent-dpath",
     default=None,
     type=Path,
-    help=f"The path to the parent directory of the pgdata which will be actively tuned. The default is {default_pgdata_parent_dpath(WORKSPACE_PATH_PLACEHOLDER)}.",
+    help=f"The path to the parent directory of the dbdata which will be actively tuned. The default is {default_dbdata_parent_dpath(WORKSPACE_PATH_PLACEHOLDER)}.",
 )
-def postgres_pgdata(dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float, pgbin_path: Path, intended_pgdata_hardware: str, pgdata_parent_dpath: Path):
+def postgres_dbdata(dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float, pgbin_path: Path, intended_dbdata_hardware: str, dbdata_parent_dpath: Path):
     # Set args to defaults programmatically (do this before doing anything else in the function)
     if pgbin_path == None:
         pgbin_path = default_pgbin_path(dbgym_cfg.dbgym_workspace_path)
-    if pgdata_parent_dpath == None:
-        pgdata_parent_dpath = default_pgdata_parent_dpath(dbgym_cfg.dbgym_workspace_path)
+    if dbdata_parent_dpath == None:
+        dbdata_parent_dpath = default_dbdata_parent_dpath(dbgym_cfg.dbgym_workspace_path)

     # Convert all input paths to absolute paths
     pgbin_path = conv_inputpath_to_realabspath(dbgym_cfg, pgbin_path)
-    pgdata_parent_dpath = conv_inputpath_to_realabspath(dbgym_cfg, pgdata_parent_dpath)
+    dbdata_parent_dpath = conv_inputpath_to_realabspath(dbgym_cfg, dbdata_parent_dpath)

     # Check assertions on args
-    if intended_pgdata_hardware == "hdd":
-        assert not is_ssd(pgdata_parent_dpath), f"Intended hardware is HDD but pgdata_parent_dpath ({pgdata_parent_dpath}) is an SSD"
-    elif intended_pgdata_hardware == "ssd":
-        assert is_ssd(pgdata_parent_dpath), f"Intended hardware is SSD but pgdata_parent_dpath ({pgdata_parent_dpath}) is an HDD"
+    if intended_dbdata_hardware == "hdd":
+        assert not is_ssd(dbdata_parent_dpath), f"Intended hardware is HDD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an SSD"
+    elif intended_dbdata_hardware == "ssd":
+        assert is_ssd(dbdata_parent_dpath), f"Intended hardware is SSD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an HDD"
     else:
         assert False

-    # Create pgdata
-    _create_pgdata(dbgym_cfg, benchmark_name, scale_factor, pgbin_path, pgdata_parent_dpath)
+    # Create dbdata
+    _create_dbdata(dbgym_cfg, benchmark_name, scale_factor, pgbin_path, dbdata_parent_dpath)


 def _get_pgbin_symlink_path(dbgym_cfg: DBGymConfig) -> Path:
@@ -108,52 +108,52 @@ def _build_repo(dbgym_cfg: DBGymConfig, rebuild):
     dbms_postgres_logger.info(f"Set up repo in {expected_repo_symlink_dpath}")


-def _create_pgdata(dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float, pgbin_path: Path, pgdata_parent_dpath: Path) -> None:
+def _create_dbdata(dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float, pgbin_path: Path, dbdata_parent_dpath: Path) -> None:
     """
-    I chose *not* for this function to skip by default if pgdata_tgz_symlink_path already exists. This
+    I chose *not* for this function to skip by default if dbdata_tgz_symlink_path already exists. This
     is because, while the generated data is deterministic given benchmark_name and scale_factor, any
-    change in the _create_pgdata() function would result in a different pgdata. Since _create_pgdata()
+    change in the _create_dbdata() function would result in a different dbdata. Since _create_dbdata()
     may change somewhat frequently, I decided to get rid of the footgun of having changes to
-    _create_pgdata() not propagate to [pgdata].tgz by default.
+    _create_dbdata() not propagate to [dbdata].tgz by default.
     """

-    # It's ok for the pgdata/ directory to be temporary. It just matters that the .tgz is saved in a safe place.
-    pgdata_dpath = pgdata_parent_dpath / "pgdata_being_created"
-    # We might be reusing the same pgdata_parent_dpath, so delete pgdata_dpath if it already exists
-    if pgdata_dpath.exists():
-        shutil.rmtree(pgdata_dpath)
+    # It's ok for the dbdata/ directory to be temporary. It just matters that the .tgz is saved in a safe place.
+    dbdata_dpath = dbdata_parent_dpath / "dbdata_being_created"
+    # We might be reusing the same dbdata_parent_dpath, so delete dbdata_dpath if it already exists
+    if dbdata_dpath.exists():
+        shutil.rmtree(dbdata_dpath)

     # Call initdb.
     # Save any script we call from pgbin_symlink_dpath because they are dependencies generated from another task run.
     save_file(dbgym_cfg, pgbin_path / "initdb")
-    subprocess_run(f'./initdb -D "{pgdata_dpath}"', cwd=pgbin_path)
+    subprocess_run(f'./initdb -D "{dbdata_dpath}"', cwd=pgbin_path)

-    # Start Postgres (all other pgdata setup requires postgres to be started).
+    # Start Postgres (all other dbdata setup requires postgres to be started).
     # Note that subprocess_run() never returns when running "pg_ctl start", so I'm using subprocess.run() instead.
-    start_postgres(dbgym_cfg, pgbin_path, pgdata_dpath)
+    start_postgres(dbgym_cfg, pgbin_path, dbdata_dpath)

     # Set up Postgres.
-    _generic_pgdata_setup(dbgym_cfg)
-    _load_benchmark_into_pgdata(dbgym_cfg, benchmark_name, scale_factor)
+    _generic_dbdata_setup(dbgym_cfg)
+    _load_benchmark_into_dbdata(dbgym_cfg, benchmark_name, scale_factor)

     # Stop Postgres so that we don't "leak" processes.
-    stop_postgres(dbgym_cfg, pgbin_path, pgdata_dpath)
+    stop_postgres(dbgym_cfg, pgbin_path, dbdata_dpath)

     # Create .tgz file.
-    # Note that you can't pass "[pgdata].tgz" as an arg to cur_task_runs_data_path() because that would create "[pgdata].tgz" as a dir.
-    pgdata_tgz_real_fpath = dbgym_cfg.cur_task_runs_data_path(
+    # Note that you can't pass "[dbdata].tgz" as an arg to cur_task_runs_data_path() because that would create "[dbdata].tgz" as a dir.
+    dbdata_tgz_real_fpath = dbgym_cfg.cur_task_runs_data_path(
         mkdir=True
-    ) / get_pgdata_tgz_name(benchmark_name, scale_factor)
-    # We need to cd into pgdata_dpath so that the tar file does not contain folders for the whole path of pgdata_dpath.
-    subprocess_run(f"tar -czf {pgdata_tgz_real_fpath} .", cwd=pgdata_dpath)
+    ) / get_dbdata_tgz_name(benchmark_name, scale_factor)
+    # We need to cd into dbdata_dpath so that the tar file does not contain folders for the whole path of dbdata_dpath.
+    subprocess_run(f"tar -czf {dbdata_tgz_real_fpath} .", cwd=dbdata_dpath)

     # Create symlink.
-    # Only link at the end so that the link only ever points to a complete pgdata.
-    pgdata_tgz_symlink_path = link_result(dbgym_cfg, pgdata_tgz_real_fpath)
-    dbms_postgres_logger.info(f"Created pgdata in {pgdata_tgz_symlink_path}")
+    # Only link at the end so that the link only ever points to a complete dbdata.
+    dbdata_tgz_symlink_path = link_result(dbgym_cfg, dbdata_tgz_real_fpath)
+    dbms_postgres_logger.info(f"Created dbdata in {dbdata_tgz_symlink_path}")


-def _generic_pgdata_setup(dbgym_cfg: DBGymConfig):
+def _generic_dbdata_setup(dbgym_cfg: DBGymConfig):
     # get necessary vars
     pgbin_real_dpath = _get_pgbin_symlink_path(dbgym_cfg).resolve()
     assert pgbin_real_dpath.exists()
@@ -181,29 +181,29 @@ def _generic_pgdata_setup(dbgym_cfg: DBGymConfig):
         cwd=pgbin_real_dpath,
     )

-    # Create the dbgym database. since one pgdata dir maps to one benchmark, all benchmarks will use the same database
-    # as opposed to using databases named after the benchmark
+    # Create the dbgym database. Since one dbdata dir maps to one benchmark, all benchmarks will use the same database
+    # as opposed to using databases named after the benchmark.
     subprocess_run(
         f"./psql -c \"create database {DBGYM_POSTGRES_DBNAME} with owner = '{dbgym_pguser}'\" {DEFAULT_POSTGRES_DBNAME} -p {pgport} -h localhost",
         cwd=pgbin_real_dpath,
     )


-def _load_benchmark_into_pgdata(
+def _load_benchmark_into_dbdata(
     dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float
 ):
     with create_conn(use_psycopg=False) as conn:
         if benchmark_name == "tpch":
             load_info = TpchLoadInfo(dbgym_cfg, scale_factor)
         else:
             raise AssertionError(
-                f"_load_benchmark_into_pgdata(): the benchmark of name {benchmark_name} is not implemented"
+                f"_load_benchmark_into_dbdata(): the benchmark of name {benchmark_name} is not implemented"
             )

-        _load_into_pgdata(dbgym_cfg, conn, load_info)
+        _load_into_dbdata(dbgym_cfg, conn, load_info)


-def _load_into_pgdata(dbgym_cfg: DBGymConfig, conn: Connection, load_info: LoadInfoBaseClass):
+def _load_into_dbdata(dbgym_cfg: DBGymConfig, conn: Connection, load_info: LoadInfoBaseClass):
     sql_file_execute(dbgym_cfg, conn, load_info.get_schema_fpath())

     # truncate all tables first before even loading a single one
@@ -222,29 +222,29 @@ def _load_into_pgdata(dbgym_cfg: DBGymConfig, conn: Connection, load_info: LoadI
         sql_file_execute(dbgym_cfg, conn, constraints_fpath)


-def start_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, pgdata_dpath: Path) -> None:
-    _start_or_stop_postgres(dbgym_cfg, pgbin_path, pgdata_dpath, True)
+def start_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path) -> None:
+    _start_or_stop_postgres(dbgym_cfg, pgbin_path, dbdata_dpath, True)


-def stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, pgdata_dpath: Path) -> None:
-    _start_or_stop_postgres(dbgym_cfg, pgbin_path, pgdata_dpath, False)
+def stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path) -> None:
+    _start_or_stop_postgres(dbgym_cfg, pgbin_path, dbdata_dpath, False)


-def _start_or_stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, pgdata_dpath: Path, is_start: bool) -> None:
+def _start_or_stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path, is_start: bool) -> None:
     # They should be absolute paths and should exist
     assert pgbin_path.is_absolute() and pgbin_path.exists()
-    assert pgdata_dpath.is_absolute() and pgdata_dpath.exists()
+    assert dbdata_dpath.is_absolute() and dbdata_dpath.exists()
     # The inputs may be symlinks so we need to resolve them first
     pgbin_real_dpath = pgbin_path.resolve()
-    pgdata_dpath = pgdata_dpath.resolve()
+    dbdata_dpath = dbdata_dpath.resolve()
     pgport = DEFAULT_POSTGRES_PORT
     save_file(dbgym_cfg, pgbin_real_dpath / "pg_ctl")

     if is_start:
         # We use subprocess.run() because subprocess_run() never returns when running "pg_ctl start".
         # The reason subprocess_run() never returns is because pg_ctl spawns a postgres process so .poll() always returns None.
         # On the other hand, subprocess.run() does return normally, like calling `./pg_ctl` on the command line would do.
-        result = subprocess.run(f"./pg_ctl -D \"{pgdata_dpath}\" -o '-p {pgport}' start", cwd=pgbin_real_dpath, shell=True)
+        result = subprocess.run(f"./pg_ctl -D \"{dbdata_dpath}\" -o '-p {pgport}' start", cwd=pgbin_real_dpath, shell=True)
         result.check_returncode()
     else:
-        subprocess_run(f"./pg_ctl -D \"{pgdata_dpath}\" -o '-p {pgport}' stop", cwd=pgbin_real_dpath)
+        subprocess_run(f"./pg_ctl -D \"{dbdata_dpath}\" -o '-p {pgport}' stop", cwd=pgbin_real_dpath)
experiments/load_per_machine_envvars.sh (4 changes: 2 additions & 2 deletions)

@@ -2,9 +2,9 @@
 host=$(hostname)

 if [ "$host" == "dev4" ]; then
-    export PGDATA_PARENT_DPATH=/mnt/nvme1n1/phw2/dbgym_tmp/
+    export DBDATA_PARENT_DPATH=/mnt/nvme1n1/phw2/dbgym_tmp/
 elif [ "$host" == "dev6" ]; then
-    export PGDATA_PARENT_DPATH=/mnt/nvme0n1/phw2/dbgym_tmp/
+    export DBDATA_PARENT_DPATH=/mnt/nvme0n1/phw2/dbgym_tmp/
 else
     echo "Did not recognize host \"$host\""
     exit 1
experiments/protox_tpch_sf0point1/main.sh (12 changes: 6 additions & 6 deletions)

@@ -3,12 +3,12 @@
 set -euxo pipefail

 SCALE_FACTOR=0.1
-INTENDED_PGDATA_HARDWARE=ssd
+INTENDED_DBDATA_HARDWARE=ssd
 . ./experiments/load_per_machine_envvars.sh
-echo $PGDATA_PARENT_DPATH
+echo $DBDATA_PARENT_DPATH

 # space for testing. uncomment this to run individual commands from the script (copy pasting is harder because there are envvars)
-# python3 task.py tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --num-samples 4 --max-concurrent 4 --workload-timeout 100 --query-timeout 15 --tune-duration-during-hpo 0.1 --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH
+# python3 task.py tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --num-samples 4 --max-concurrent 4 --workload-timeout 100 --query-timeout 15 --tune-duration-during-hpo 0.1 --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH
 python3 task.py tune protox agent tune tpch --scale-factor $SCALE_FACTOR --tune-duration-during-tune 0.2
 python3 task.py tune protox agent replay tpch --scale-factor $SCALE_FACTOR
 exit 0
@@ -19,15 +19,15 @@ python3 task.py benchmark tpch workload --scale-factor $SCALE_FACTOR

 # postgres
 python3 task.py dbms postgres build
-python3 task.py dbms postgres pgdata tpch --scale-factor $SCALE_FACTOR --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH
+python3 task.py dbms postgres dbdata tpch --scale-factor $SCALE_FACTOR --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH

 exit 0

 # embedding
-python3 task.py tune protox embedding datagen tpch --scale-factor $SCALE_FACTOR --override-sample-limits "lineitem,32768" --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH # long datagen so that train doesn't crash
+python3 task.py tune protox embedding datagen tpch --scale-factor $SCALE_FACTOR --override-sample-limits "lineitem,32768" --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH # long datagen so that train doesn't crash
 python3 task.py tune protox embedding train tpch --scale-factor $SCALE_FACTOR --iterations-per-epoch 1 --num-points-to-sample 1 --num-batches 1 --batch-size 64 --start-epoch 15 --num-samples 4 --train-max-concurrent 4 --num-curate 2

 # agent
-python3 task.py tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --num-samples 4 --max-concurrent 4 --workload-timeout 100 --query-timeout 15 --tune-duration-during-hpo 1 --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH --build-space-good-for-boot
+python3 task.py tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --num-samples 4 --max-concurrent 4 --workload-timeout 100 --query-timeout 15 --tune-duration-during-hpo 1 --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH --build-space-good-for-boot
 python3 task.py tune protox agent tune tpch --scale-factor $SCALE_FACTOR
 python3 task.py tune protox agent replay tpch --scale-factor $SCALE_FACTOR
