Commit c45c4b6
replace (almost) all occurrences of pgdata with dbdata
wangpatrick57 committed Jul 7, 2024
1 parent dfad09a commit c45c4b6
Showing 17 changed files with 204 additions and 210 deletions.
README.md (2 changes: 1 addition & 1 deletion)

@@ -11,7 +11,7 @@ These steps were tested on a fresh repository clone, Ubuntu ??.04.
 ./dependency/install_dependencies.sh
 # Compile a custom fork of PostgreSQL, load TPC-H, train the Proto-X agent, and tune.
-./scripts/quickstart.sh postgres path/to/put/pgdata/in tpch 0.01 protox
+./scripts/quickstart.sh postgres dir/to/put/dbdata/in/ tpch 0.01 protox
 ```

 ## Overview
benchmark/tpch/cli.py (4 changes: 2 additions & 2 deletions)

@@ -21,8 +21,8 @@ def tpch_group(dbgym_cfg: DBGymConfig):
 @tpch_group.command(name="data")
 @click.argument("scale-factor", type=float)
 @click.pass_obj
-# The reason generate-data is separate from create-pgdata is because generate-data is generic
-# to all DBMSs while create-pgdata is specific to Postgres.
+# The reason generate data is separate from create dbdata is because generate-data is generic
+# to all DBMSs while create dbdata is specific to a single DBMS.
 def tpch_data(dbgym_cfg: DBGymConfig, scale_factor: float):
     _clone(dbgym_cfg)
     _generate_data(dbgym_cfg, scale_factor)
dbms/postgres/cli.py (116 changes: 58 additions & 58 deletions)

@@ -1,5 +1,5 @@
 """
-At a high level, this file's goal is to (1) install+build postgres and (2) create pgdata.
+At a high level, this file's goal is to (1) build postgres and (2) create dbdata (aka pgdata).
 On the other hand, the goal of tune.protox.env.util.postgres is to provide helpers to manage
 a Postgres instance during agent tuning.
 util.pg provides helpers used by *both* of the above files (as well as other files).
@@ -13,7 +13,7 @@

 from benchmark.tpch.load_info import TpchLoadInfo
 from dbms.load_info_base_class import LoadInfoBaseClass
-from misc.utils import DBGymConfig, conv_inputpath_to_realabspath, link_result, open_and_save, save_file, get_pgdata_tgz_name, default_pgbin_path, WORKSPACE_PATH_PLACEHOLDER, default_pgdata_parent_dpath, is_ssd
+from misc.utils import DBGymConfig, conv_inputpath_to_realabspath, link_result, open_and_save, save_file, get_dbdata_tgz_name, default_pgbin_path, WORKSPACE_PATH_PLACEHOLDER, default_dbdata_parent_dpath, is_ssd
 from util.shell import subprocess_run
 from sqlalchemy import Connection
 from util.pg import SHARED_PRELOAD_LIBRARIES, conn_execute, sql_file_execute, DBGYM_POSTGRES_DBNAME, create_conn, DEFAULT_POSTGRES_PORT, DBGYM_POSTGRES_USER, DBGYM_POSTGRES_PASS, DEFAULT_POSTGRES_DBNAME
@@ -31,7 +31,7 @@ def postgres_group(dbgym_cfg: DBGymConfig):

 @postgres_group.command(
     name="build",
-    help="Download and build the Postgres repository and all necessary extensions/shared libraries. Does not create pgdata.",
+    help="Download and build the Postgres repository and all necessary extensions/shared libraries. Does not create dbdata.",
 )
 @click.pass_obj
 @click.option("--rebuild", is_flag=True, help="Include this flag to rebuild Postgres even if it already exists.")
@@ -40,46 +40,46 @@ def postgres_build(dbgym_cfg: DBGymConfig, rebuild: bool):


 @postgres_group.command(
-    name="pgdata",
-    help="Build a .tgz file of pgdata with various specifications for its contents.",
+    name="dbdata",
+    help="Build a .tgz file of dbdata with various specifications for its contents.",
 )
 @click.pass_obj
 @click.argument("benchmark_name", type=str)
 @click.option("--scale-factor", type=float, default=1)
 @click.option("--pgbin-path", type=Path, default=None, help=f"The path to the bin containing Postgres executables. The default is {default_pgbin_path(WORKSPACE_PATH_PLACEHOLDER)}.")
 @click.option(
-    "--intended-pgdata-hardware",
+    "--intended-dbdata-hardware",
     type=click.Choice(["hdd", "ssd"]),
     default="hdd",
-    help=f"The intended hardware pgdata should be on. Used as a sanity check for --pgdata-parent-dpath.",
+    help=f"The intended hardware dbdata should be on. Used as a sanity check for --dbdata-parent-dpath.",
 )
 @click.option(
-    "--pgdata-parent-dpath",
+    "--dbdata-parent-dpath",
     default=None,
     type=Path,
-    help=f"The path to the parent directory of the pgdata which will be actively tuned. The default is {default_pgdata_parent_dpath(WORKSPACE_PATH_PLACEHOLDER)}.",
+    help=f"The path to the parent directory of the dbdata which will be actively tuned. The default is {default_dbdata_parent_dpath(WORKSPACE_PATH_PLACEHOLDER)}.",
 )
-def postgres_pgdata(dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float, pgbin_path: Path, intended_pgdata_hardware: str, pgdata_parent_dpath: Path):
+def postgres_dbdata(dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float, pgbin_path: Path, intended_dbdata_hardware: str, dbdata_parent_dpath: Path):
     # Set args to defaults programmatically (do this before doing anything else in the function)
     if pgbin_path == None:
         pgbin_path = default_pgbin_path(dbgym_cfg.dbgym_workspace_path)
-    if pgdata_parent_dpath == None:
-        pgdata_parent_dpath = default_pgdata_parent_dpath(dbgym_cfg.dbgym_workspace_path)
+    if dbdata_parent_dpath == None:
+        dbdata_parent_dpath = default_dbdata_parent_dpath(dbgym_cfg.dbgym_workspace_path)

     # Convert all input paths to absolute paths
     pgbin_path = conv_inputpath_to_realabspath(dbgym_cfg, pgbin_path)
-    pgdata_parent_dpath = conv_inputpath_to_realabspath(dbgym_cfg, pgdata_parent_dpath)
+    dbdata_parent_dpath = conv_inputpath_to_realabspath(dbgym_cfg, dbdata_parent_dpath)

     # Check assertions on args
-    if intended_pgdata_hardware == "hdd":
-        assert not is_ssd(pgdata_parent_dpath), f"Intended hardware is HDD but pgdata_parent_dpath ({pgdata_parent_dpath}) is an SSD"
-    elif intended_pgdata_hardware == "ssd":
-        assert is_ssd(pgdata_parent_dpath), f"Intended hardware is SSD but pgdata_parent_dpath ({pgdata_parent_dpath}) is an HDD"
+    if intended_dbdata_hardware == "hdd":
+        assert not is_ssd(dbdata_parent_dpath), f"Intended hardware is HDD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an SSD"
+    elif intended_dbdata_hardware == "ssd":
+        assert is_ssd(dbdata_parent_dpath), f"Intended hardware is SSD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an HDD"
     else:
         assert False

-    # Create pgdata
-    _create_pgdata(dbgym_cfg, benchmark_name, scale_factor, pgbin_path, pgdata_parent_dpath)
+    # Create dbdata
+    _create_dbdata(dbgym_cfg, benchmark_name, scale_factor, pgbin_path, dbdata_parent_dpath)


 def _get_pgbin_symlink_path(dbgym_cfg: DBGymConfig) -> Path:
@@ -108,52 +108,52 @@ def _build_repo(dbgym_cfg: DBGymConfig, rebuild):
     dbms_postgres_logger.info(f"Set up repo in {expected_repo_symlink_dpath}")


-def _create_pgdata(dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float, pgbin_path: Path, pgdata_parent_dpath: Path) -> None:
+def _create_dbdata(dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float, pgbin_path: Path, dbdata_parent_dpath: Path) -> None:
     """
-    I chose *not* for this function to skip by default if pgdata_tgz_symlink_path already exists. This
+    I chose *not* for this function to skip by default if dbdata_tgz_symlink_path already exists. This
     is because, while the generated data is deterministic given benchmark_name and scale_factor, any
-    change in the _create_pgdata() function would result in a different pgdata. Since _create_pgdata()
+    change in the _create_dbdata() function would result in a different dbdata. Since _create_dbdata()
     may change somewhat frequently, I decided to get rid of the footgun of having changes to
-    _create_pgdata() not propagate to [pgdata].tgz by default.
+    _create_dbdata() not propagate to [dbdata].tgz by default.
     """

-    # It's ok for the pgdata/ directory to be temporary. It just matters that the .tgz is saved in a safe place.
-    pgdata_dpath = pgdata_parent_dpath / "pgdata_being_created"
-    # We might be reusing the same pgdata_parent_dpath, so delete pgdata_dpath if it already exists
-    if pgdata_dpath.exists():
-        shutil.rmtree(pgdata_dpath)
+    # It's ok for the dbdata/ directory to be temporary. It just matters that the .tgz is saved in a safe place.
+    dbdata_dpath = dbdata_parent_dpath / "dbdata_being_created"
+    # We might be reusing the same dbdata_parent_dpath, so delete dbdata_dpath if it already exists
+    if dbdata_dpath.exists():
+        shutil.rmtree(dbdata_dpath)

     # Call initdb.
     # Save any script we call from pgbin_symlink_dpath because they are dependencies generated from another task run.
     save_file(dbgym_cfg, pgbin_path / "initdb")
-    subprocess_run(f'./initdb -D "{pgdata_dpath}"', cwd=pgbin_path)
+    subprocess_run(f'./initdb -D "{dbdata_dpath}"', cwd=pgbin_path)

-    # Start Postgres (all other pgdata setup requires postgres to be started).
+    # Start Postgres (all other dbdata setup requires postgres to be started).
     # Note that subprocess_run() never returns when running "pg_ctl start", so I'm using subprocess.run() instead.
-    start_postgres(dbgym_cfg, pgbin_path, pgdata_dpath)
+    start_postgres(dbgym_cfg, pgbin_path, dbdata_dpath)

     # Set up Postgres.
-    _generic_pgdata_setup(dbgym_cfg)
-    _load_benchmark_into_pgdata(dbgym_cfg, benchmark_name, scale_factor)
+    _generic_dbdata_setup(dbgym_cfg)
+    _load_benchmark_into_dbdata(dbgym_cfg, benchmark_name, scale_factor)

     # Stop Postgres so that we don't "leak" processes.
-    stop_postgres(dbgym_cfg, pgbin_path, pgdata_dpath)
+    stop_postgres(dbgym_cfg, pgbin_path, dbdata_dpath)

     # Create .tgz file.
-    # Note that you can't pass "[pgdata].tgz" as an arg to cur_task_runs_data_path() because that would create "[pgdata].tgz" as a dir.
-    pgdata_tgz_real_fpath = dbgym_cfg.cur_task_runs_data_path(
+    # Note that you can't pass "[dbdata].tgz" as an arg to cur_task_runs_data_path() because that would create "[dbdata].tgz" as a dir.
+    dbdata_tgz_real_fpath = dbgym_cfg.cur_task_runs_data_path(
         mkdir=True
-    ) / get_pgdata_tgz_name(benchmark_name, scale_factor)
-    # We need to cd into pgdata_dpath so that the tar file does not contain folders for the whole path of pgdata_dpath.
-    subprocess_run(f"tar -czf {pgdata_tgz_real_fpath} .", cwd=pgdata_dpath)
+    ) / get_dbdata_tgz_name(benchmark_name, scale_factor)
+    # We need to cd into dbdata_dpath so that the tar file does not contain folders for the whole path of dbdata_dpath.
+    subprocess_run(f"tar -czf {dbdata_tgz_real_fpath} .", cwd=dbdata_dpath)

     # Create symlink.
-    # Only link at the end so that the link only ever points to a complete pgdata.
-    pgdata_tgz_symlink_path = link_result(dbgym_cfg, pgdata_tgz_real_fpath)
-    dbms_postgres_logger.info(f"Created pgdata in {pgdata_tgz_symlink_path}")
+    # Only link at the end so that the link only ever points to a complete dbdata.
+    dbdata_tgz_symlink_path = link_result(dbgym_cfg, dbdata_tgz_real_fpath)
+    dbms_postgres_logger.info(f"Created dbdata in {dbdata_tgz_symlink_path}")


-def _generic_pgdata_setup(dbgym_cfg: DBGymConfig):
+def _generic_dbdata_setup(dbgym_cfg: DBGymConfig):
     # get necessary vars
     pgbin_real_dpath = _get_pgbin_symlink_path(dbgym_cfg).resolve()
     assert pgbin_real_dpath.exists()
@@ -181,29 +181,29 @@ def _generic_pgdata_setup(dbgym_cfg: DBGymConfig):
         cwd=pgbin_real_dpath,
     )

-    # Create the dbgym database. since one pgdata dir maps to one benchmark, all benchmarks will use the same database
-    # as opposed to using databases named after the benchmark
+    # Create the dbgym database. Since one dbdata dir maps to one benchmark, all benchmarks will use the same database
+    # as opposed to using databases named after the benchmark.
     subprocess_run(
         f"./psql -c \"create database {DBGYM_POSTGRES_DBNAME} with owner = '{dbgym_pguser}'\" {DEFAULT_POSTGRES_DBNAME} -p {pgport} -h localhost",
         cwd=pgbin_real_dpath,
     )


-def _load_benchmark_into_pgdata(
+def _load_benchmark_into_dbdata(
     dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float
 ):
     with create_conn(use_psycopg=False) as conn:
         if benchmark_name == "tpch":
             load_info = TpchLoadInfo(dbgym_cfg, scale_factor)
         else:
             raise AssertionError(
-                f"_load_benchmark_into_pgdata(): the benchmark of name {benchmark_name} is not implemented"
+                f"_load_benchmark_into_dbdata(): the benchmark of name {benchmark_name} is not implemented"
             )

-        _load_into_pgdata(dbgym_cfg, conn, load_info)
+        _load_into_dbdata(dbgym_cfg, conn, load_info)


-def _load_into_pgdata(dbgym_cfg: DBGymConfig, conn: Connection, load_info: LoadInfoBaseClass):
+def _load_into_dbdata(dbgym_cfg: DBGymConfig, conn: Connection, load_info: LoadInfoBaseClass):
     sql_file_execute(dbgym_cfg, conn, load_info.get_schema_fpath())

     # truncate all tables first before even loading a single one
@@ -222,29 +222,29 @@ def _load_into_pgdata(dbgym_cfg: DBGymConfig, conn: Connection, load_info: LoadI
         sql_file_execute(dbgym_cfg, conn, constraints_fpath)


-def start_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, pgdata_dpath: Path) -> None:
-    _start_or_stop_postgres(dbgym_cfg, pgbin_path, pgdata_dpath, True)
+def start_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path) -> None:
+    _start_or_stop_postgres(dbgym_cfg, pgbin_path, dbdata_dpath, True)


-def stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, pgdata_dpath: Path) -> None:
-    _start_or_stop_postgres(dbgym_cfg, pgbin_path, pgdata_dpath, False)
+def stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path) -> None:
+    _start_or_stop_postgres(dbgym_cfg, pgbin_path, dbdata_dpath, False)


-def _start_or_stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, pgdata_dpath: Path, is_start: bool) -> None:
+def _start_or_stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path, is_start: bool) -> None:
     # They should be absolute paths and should exist
     assert pgbin_path.is_absolute() and pgbin_path.exists()
-    assert pgdata_dpath.is_absolute() and pgdata_dpath.exists()
+    assert dbdata_dpath.is_absolute() and dbdata_dpath.exists()
     # The inputs may be symlinks so we need to resolve them first
     pgbin_real_dpath = pgbin_path.resolve()
-    pgdata_dpath = pgdata_dpath.resolve()
+    dbdata_dpath = dbdata_dpath.resolve()
     pgport = DEFAULT_POSTGRES_PORT
     save_file(dbgym_cfg, pgbin_real_dpath / "pg_ctl")

     if is_start:
         # We use subprocess.run() because subprocess_run() never returns when running "pg_ctl start".
         # The reason subprocess_run() never returns is because pg_ctl spawns a postgres process so .poll() always returns None.
         # On the other hand, subprocess.run() does return normally, like calling `./pg_ctl` on the command line would do.
-        result = subprocess.run(f"./pg_ctl -D \"{pgdata_dpath}\" -o '-p {pgport}' start", cwd=pgbin_real_dpath, shell=True)
+        result = subprocess.run(f"./pg_ctl -D \"{dbdata_dpath}\" -o '-p {pgport}' start", cwd=pgbin_real_dpath, shell=True)
         result.check_returncode()
     else:
-        subprocess_run(f"./pg_ctl -D \"{pgdata_dpath}\" -o '-p {pgport}' stop", cwd=pgbin_real_dpath)
+        subprocess_run(f"./pg_ctl -D \"{dbdata_dpath}\" -o '-p {pgport}' stop", cwd=pgbin_real_dpath)
experiments/load_per_machine_envvars.sh (4 changes: 2 additions & 2 deletions)

@@ -2,9 +2,9 @@
 host=$(hostname)

 if [ "$host" == "dev4" ]; then
-    export PGDATA_PARENT_DPATH=/mnt/nvme1n1/phw2/dbgym_tmp/
+    export DBDATA_PARENT_DPATH=/mnt/nvme1n1/phw2/dbgym_tmp/
 elif [ "$host" == "dev6" ]; then
-    export PGDATA_PARENT_DPATH=/mnt/nvme0n1/phw2/dbgym_tmp/
+    export DBDATA_PARENT_DPATH=/mnt/nvme0n1/phw2/dbgym_tmp/
 else
     echo "Did not recognize host \"$host\""
     exit 1
experiments/protox_tpch_sf0point1/main.sh (12 changes: 6 additions & 6 deletions)

@@ -3,12 +3,12 @@
 set -euxo pipefail

 SCALE_FACTOR=0.1
-INTENDED_PGDATA_HARDWARE=ssd
+INTENDED_DBDATA_HARDWARE=ssd
 . ./experiments/load_per_machine_envvars.sh
-echo $PGDATA_PARENT_DPATH
+echo $DBDATA_PARENT_DPATH

 # space for testing. uncomment this to run individual commands from the script (copy pasting is harder because there are envvars)
-# python3 task.py tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --num-samples 4 --max-concurrent 4 --workload-timeout 100 --query-timeout 15 --tune-duration-during-hpo 0.1 --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH
+# python3 task.py tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --num-samples 4 --max-concurrent 4 --workload-timeout 100 --query-timeout 15 --tune-duration-during-hpo 0.1 --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH
 python3 task.py tune protox agent tune tpch --scale-factor $SCALE_FACTOR --tune-duration-during-tune 0.2
 python3 task.py tune protox agent replay tpch --scale-factor $SCALE_FACTOR
 exit 0
@@ -19,15 +19,15 @@ python3 task.py benchmark tpch workload --scale-factor $SCALE_FACTOR

 # postgres
 python3 task.py dbms postgres build
-python3 task.py dbms postgres pgdata tpch --scale-factor $SCALE_FACTOR --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH
+python3 task.py dbms postgres dbdata tpch --scale-factor $SCALE_FACTOR --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH

 exit 0

 # embedding
-python3 task.py tune protox embedding datagen tpch --scale-factor $SCALE_FACTOR --override-sample-limits "lineitem,32768" --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH # long datagen so that train doesn't crash
+python3 task.py tune protox embedding datagen tpch --scale-factor $SCALE_FACTOR --override-sample-limits "lineitem,32768" --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH # long datagen so that train doesn't crash
 python3 task.py tune protox embedding train tpch --scale-factor $SCALE_FACTOR --iterations-per-epoch 1 --num-points-to-sample 1 --num-batches 1 --batch-size 64 --start-epoch 15 --num-samples 4 --train-max-concurrent 4 --num-curate 2

 # agent
-python3 task.py tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --num-samples 4 --max-concurrent 4 --workload-timeout 100 --query-timeout 15 --tune-duration-during-hpo 1 --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH --build-space-good-for-boot
+python3 task.py tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --num-samples 4 --max-concurrent 4 --workload-timeout 100 --query-timeout 15 --tune-duration-during-hpo 1 --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH --build-space-good-for-boot
 python3 task.py tune protox agent tune tpch --scale-factor $SCALE_FACTOR
 python3 task.py tune protox agent replay tpch --scale-factor $SCALE_FACTOR
