Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Integrated Boot into Proto-X #31

Merged
merged 29 commits into from
Apr 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
d858d04
now logging to artifacts/ instead of artifacts/artifacts/
wangpatrick57 Apr 6, 2024
e3fc4af
small change
wangpatrick57 Apr 6, 2024
a1a8ff3
boot is now set up
wangpatrick57 Apr 6, 2024
4f99c34
set up boot
wangpatrick57 Apr 9, 2024
1341680
tpch sf1 exp
wangpatrick57 Apr 9, 2024
aa7ce95
faster pgdata creation on ssd
wangpatrick57 Apr 9, 2024
75d0a2b
ray and boot redis port
wangpatrick57 Apr 9, 2024
70010e8
now starting redis correctly
wangpatrick57 Apr 9, 2024
0cc9401
now using execute() instead of psql() to set up boot
wangpatrick57 Apr 9, 2024
276b180
now saving postgresql.auto.conf
wangpatrick57 Apr 10, 2024
6b6d7bf
centralized where shared preload libs were defined
wangpatrick57 Apr 11, 2024
6450b9f
added use boot option
wangpatrick57 Apr 11, 2024
409dbd4
added use boot during hpo
wangpatrick57 Apr 11, 2024
7fe2447
Merge branch 'integrate-boot' of github.com:wangpatrick57/dbgym into …
wangpatrick57 Apr 11, 2024
643674f
hpoed_params -> hpo_params in all but tune.py
wangpatrick57 Apr 11, 2024
4adc7fb
Merge branch 'integrate-boot' of github.com:wangpatrick57/dbgym into …
wangpatrick57 Apr 11, 2024
6c23215
added boot config fpath
wangpatrick57 Apr 11, 2024
0e8cd76
Merge branch 'integrate-boot' of github.com:wangpatrick57/dbgym into …
wangpatrick57 Apr 11, 2024
34c814b
added use boot option
wangpatrick57 Apr 11, 2024
4aba7de
Merge branch 'integrate-boot' of github.com:wangpatrick57/dbgym into …
wangpatrick57 Apr 11, 2024
00e8f38
now passing boot settings
wangpatrick57 Apr 11, 2024
11d7e41
added rebuild command for postgres build
wangpatrick57 Apr 11, 2024
3b50f74
fixed multiple shared preload libraries
wangpatrick57 Apr 11, 2024
86b3699
now saving boot config path when opening
wangpatrick57 Apr 11, 2024
2a3dc35
fixed create_pgdata to use link_result and work if there's already pg…
wangpatrick57 Apr 11, 2024
9eaeafc
added args for tune boot
wangpatrick57 Apr 11, 2024
640b0d7
added enable boot option to tune
wangpatrick57 Apr 15, 2024
334740c
merge
wangpatrick57 Apr 15, 2024
41e82ca
explained testing space more
wangpatrick57 Apr 17, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions config.yaml
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
dbgym_workspace_path: ../dbgym_workspace
boot_redis_port: 6379
ray_gcs_port: 6380
32 changes: 21 additions & 11 deletions dbms/postgres/build_repo.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,30 +2,40 @@

set -euxo pipefail

REPO_REAL_DPATH="$1"
REPO_REAL_PARENT_DPATH="$1"

# download and make postgres with boot
mkdir -p "${REPO_REAL_DPATH}"
cd "${REPO_REAL_DPATH}"
# download and make postgres from the boot repository
mkdir -p "${REPO_REAL_PARENT_DPATH}"
cd "${REPO_REAL_PARENT_DPATH}"
git clone git@github.com:lmwnshn/boot.git --single-branch --branch boot --depth 1
cd ./boot
./cmudb/build/configure.sh release "${REPO_REAL_DPATH}/boot/build/postgres"
./cmudb/build/configure.sh release "${REPO_REAL_PARENT_DPATH}/boot/build/postgres"
make clean
make install-world-bin -j4
cd ../

# download and make bytejack
cd ./cmudb/extension/bytejack_rs/
cargo build --release
cbindgen . -o target/bytejack_rs.h --lang c
cd "${REPO_REAL_PARENT_DPATH}/boot"

cd ./cmudb/extension/bytejack/
make clean
make install -j
cd "${REPO_REAL_PARENT_DPATH}/boot"

# download and make hypopg
git clone git@github.com:HypoPG/hypopg.git
cd ./hypopg
PG_CONFIG=../boot/build/postgres/bin/pg_config make install
cd ../
PG_CONFIG="${REPO_REAL_PARENT_DPATH}/boot/build/postgres/bin/pg_config" make install
cd "${REPO_REAL_PARENT_DPATH}/boot"

# download and make pg_hint_plan
# we need -L so that curl follows HTTP redirects (the GitHub archive URL redirects to the actual download)
curl -L https://github.com/ossc-db/pg_hint_plan/archive/refs/tags/REL15_1_5_1.tar.gz -o REL15_1_5_1.tar.gz
tar -xzf REL15_1_5_1.tar.gz
rm REL15_1_5_1.tar.gz
cd ./pg_hint_plan-REL15_1_5_1
PATH="${REPO_REAL_DPATH}/boot/build/postgres/bin:$PATH" make
PATH="${REPO_REAL_DPATH}/boot/build/postgres/bin:$PATH" make install
cp ./pg_hint_plan.so ${REPO_REAL_DPATH}/boot/build/postgres/lib
PATH="${REPO_REAL_PARENT_DPATH}/boot/build/postgres/bin:$PATH" make
PATH="${REPO_REAL_PARENT_DPATH}/boot/build/postgres/bin:$PATH" make install
cp ./pg_hint_plan.so ${REPO_REAL_PARENT_DPATH}/boot/build/postgres/lib
130 changes: 68 additions & 62 deletions dbms/postgres/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,18 @@
'''
import logging
import os
import shutil
import subprocess
from pathlib import Path
import click
import shutil
import ssd_checker

from benchmark.tpch.load_info import TpchLoadInfo
from dbms.load_info_base_class import LoadInfoBaseClass
from misc.utils import DBGymConfig, conv_inputpath_to_realabspath, open_and_save, save_file, get_pgdata_tgz_name, default_pgbin_path, WORKSPACE_PATH_PLACEHOLDER
from misc.utils import DBGymConfig, conv_inputpath_to_realabspath, link_result, open_and_save, save_file, get_pgdata_tgz_name, default_pgbin_path, WORKSPACE_PATH_PLACEHOLDER, default_pgdata_parent_dpath
from util.shell import subprocess_run
from sqlalchemy import Connection
from util.pg import conn_execute, sql_file_execute, DBGYM_POSTGRES_DBNAME, create_conn, DEFAULT_POSTGRES_PORT, DBGYM_POSTGRES_USER, DBGYM_POSTGRES_PASS, DEFAULT_POSTGRES_DBNAME
from util.pg import SHARED_PRELOAD_LIBRARIES, conn_execute, sql_file_execute, DBGYM_POSTGRES_DBNAME, create_conn, DEFAULT_POSTGRES_PORT, DBGYM_POSTGRES_USER, DBGYM_POSTGRES_PASS, DEFAULT_POSTGRES_DBNAME


dbms_postgres_logger = logging.getLogger("dbms/postgres")
Expand All @@ -34,8 +35,9 @@ def postgres_group(dbgym_cfg: DBGymConfig):
help="Download and build the Postgres repository and all necessary extensions/shared libraries. Does not create pgdata.",
)
@click.pass_obj
def postgres_build(dbgym_cfg: DBGymConfig):
_build_repo(dbgym_cfg)
@click.option("--rebuild", is_flag=True, help="Include this flag to rebuild Postgres even if it already exists.")
def postgres_build(dbgym_cfg: DBGymConfig, rebuild: bool):
_build_repo(dbgym_cfg, rebuild)


@postgres_group.command(
Expand All @@ -46,16 +48,39 @@ def postgres_build(dbgym_cfg: DBGymConfig):
@click.argument("benchmark_name", type=str)
@click.option("--scale-factor", type=float, default=1)
@click.option("--pgbin-path", type=Path, default=None, help=f"The path to the bin containing Postgres executables. The default is {default_pgbin_path(WORKSPACE_PATH_PLACEHOLDER)}.")
def postgres_pgdata(dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float, pgbin_path: Path):
@click.option(
"--intended-pgdata-hardware",
type=click.Choice(["hdd", "ssd"]),
default="hdd",
help=f"The intended hardware pgdata should be on. Used as a sanity check for --pgdata-parent-dpath.",
)
@click.option(
"--pgdata-parent-dpath",
default=None,
type=Path,
help=f"The path to the parent directory of the pgdata which will be actively tuned. The default is {default_pgdata_parent_dpath(WORKSPACE_PATH_PLACEHOLDER)}.",
)
def postgres_pgdata(dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float, pgbin_path: Path, intended_pgdata_hardware: str, pgdata_parent_dpath: Path):
# Set args to defaults programmatically (do this before doing anything else in the function)
if pgbin_path == None:
pgbin_path = default_pgbin_path(dbgym_cfg.dbgym_workspace_path)
if pgdata_parent_dpath == None:
pgdata_parent_dpath = default_pgdata_parent_dpath(dbgym_cfg.dbgym_workspace_path)

# Convert all input paths to absolute paths
pgbin_path = conv_inputpath_to_realabspath(dbgym_cfg, pgbin_path)
pgdata_parent_dpath = conv_inputpath_to_realabspath(dbgym_cfg, pgdata_parent_dpath)

# Check assertions on args
if intended_pgdata_hardware == "hdd":
assert not ssd_checker.is_ssd(pgdata_parent_dpath), f"Intended hardware is HDD but pgdata_parent_dpath ({pgdata_parent_dpath}) is an SSD"
elif intended_pgdata_hardware == "ssd":
assert ssd_checker.is_ssd(pgdata_parent_dpath), f"Intended hardware is SSD but pgdata_parent_dpath ({pgdata_parent_dpath}) is an HDD"
else:
assert False

# Create pgdata
_create_pgdata(dbgym_cfg, benchmark_name, scale_factor, pgbin_path)
_create_pgdata(dbgym_cfg, benchmark_name, scale_factor, pgbin_path, pgdata_parent_dpath)


def _get_pgbin_symlink_path(dbgym_cfg: DBGymConfig) -> Path:
Expand All @@ -66,35 +91,25 @@ def _get_repo_symlink_path(dbgym_cfg: DBGymConfig) -> Path:
return dbgym_cfg.cur_symlinks_build_path("repo")


def _get_pgdata_tgz_symlink_path(
dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float
) -> Path:
# you can't pass "[pgdata].tgz" as an arg to cur_task_runs_data_path() because that would create "[pgdata].tgz" as a dir
return dbgym_cfg.cur_symlinks_data_path(".", mkdir=True) / get_pgdata_tgz_name(
benchmark_name, scale_factor
)


def _build_repo(dbgym_cfg: DBGymConfig):
repo_symlink_dpath = _get_repo_symlink_path(dbgym_cfg)
if repo_symlink_dpath.exists():
dbms_postgres_logger.info(f"Skipping _build_repo: {repo_symlink_dpath}")
def _build_repo(dbgym_cfg: DBGymConfig, rebuild):
expected_repo_symlink_dpath = _get_repo_symlink_path(dbgym_cfg)
if not rebuild and expected_repo_symlink_dpath.exists():
dbms_postgres_logger.info(f"Skipping _build_repo: {expected_repo_symlink_dpath}")
return

dbms_postgres_logger.info(f"Setting up repo in {repo_symlink_dpath}")
dbms_postgres_logger.info(f"Setting up repo in {expected_repo_symlink_dpath}")
repo_real_dpath = dbgym_cfg.cur_task_runs_build_path("repo", mkdir=True)
subprocess_run(
f"./build_repo.sh {repo_real_dpath}", cwd=dbgym_cfg.cur_source_path()
)

# only link at the end so that the link only ever points to a complete repo
subprocess_run(
f"ln -s {repo_real_dpath} {dbgym_cfg.cur_symlinks_build_path(mkdir=True)}"
)
dbms_postgres_logger.info(f"Set up repo in {repo_symlink_dpath}")
repo_symlink_dpath = link_result(dbgym_cfg, repo_real_dpath)
assert os.path.samefile(expected_repo_symlink_dpath, repo_symlink_dpath)
dbms_postgres_logger.info(f"Set up repo in {expected_repo_symlink_dpath}")


def _create_pgdata(dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float, pgbin_path: Path) -> None:
def _create_pgdata(dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float, pgbin_path: Path, pgdata_parent_dpath: Path) -> None:
"""
I chose *not* for this function to skip by default if pgdata_tgz_symlink_path already exists. This
is because, while the generated data is deterministic given benchmark_name and scale_factor, any
Expand All @@ -103,48 +118,39 @@ def _create_pgdata(dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: fl
_create_pgdata() not propagate to [pgdata].tgz by default.
"""

# Create a temporary dir for this pgdata
# It's ok for the pgdata/ directory to be temporary. It just matters that the .tgz is saved in a safe place
pgdata_dpath = dbgym_cfg.dbgym_tmp_path / "pgdata"
# It's ok for the pgdata/ directory to be temporary. It just matters that the .tgz is saved in a safe place.
pgdata_dpath = pgdata_parent_dpath / "pgdata_being_created"
# We might be reusing the same pgdata_parent_dpath, so delete pgdata_dpath if it already exists
if pgdata_dpath.exists():
shutil.rmtree(pgdata_dpath)

# initdb
# save any script we call from pgbin_symlink_dpath because they are dependencies generated from another task run
# Call initdb.
# Save any script we call from pgbin_symlink_dpath because they are dependencies generated from another task run.
save_file(dbgym_cfg, pgbin_path / "initdb")
subprocess_run(f'./initdb -D "{pgdata_dpath}"', cwd=pgbin_path)

# start postgres (all other pgdata setup requires postgres to be started)
# note that subprocess_run() never returns when running "pg_ctl start", so I'm using subprocess.run() instead
# Start Postgres (all other pgdata setup requires postgres to be started).
# Note that subprocess_run() never returns when running "pg_ctl start", so I'm using subprocess.run() instead.
start_postgres(dbgym_cfg, pgbin_path, pgdata_dpath)

# setup
# Set up Postgres.
_generic_pgdata_setup(dbgym_cfg)
_load_benchmark_into_pgdata(dbgym_cfg, benchmark_name, scale_factor)

# stop postgres so that we don't "leak" processes
# Stop Postgres so that we don't "leak" processes.
stop_postgres(dbgym_cfg, pgbin_path, pgdata_dpath)

# create .tgz file
# you can't pass "[pgdata].tgz" as an arg to cur_task_runs_data_path() because that would create "[pgdata].tgz" as a dir
# Create .tgz file.
# Note that you can't pass "[pgdata].tgz" as an arg to cur_task_runs_data_path() because that would create "[pgdata].tgz" as a dir.
pgdata_tgz_real_fpath = dbgym_cfg.cur_task_runs_data_path(
".", mkdir=True
) / get_pgdata_tgz_name(benchmark_name, scale_factor)
# we need to cd into pgdata_dpath so that the tar file does not contain folders for the whole path of pgdata_dpath
# We need to cd into pgdata_dpath so that the tar file does not contain folders for the whole path of pgdata_dpath.
subprocess_run(f"tar -czf {pgdata_tgz_real_fpath} .", cwd=pgdata_dpath)

# create symlink
# only link at the end so that the link only ever points to a complete pgdata
pgdata_tgz_symlink_path = _get_pgdata_tgz_symlink_path(
dbgym_cfg, benchmark_name, scale_factor
)
if pgdata_tgz_symlink_path.exists():
os.remove(pgdata_tgz_symlink_path)
subprocess_run(
f"ln -s {pgdata_tgz_real_fpath} {dbgym_cfg.cur_symlinks_data_path(mkdir=True)}"
)
assert (
pgdata_tgz_symlink_path.exists()
) # basically asserts that pgdata_tgz_symlink_path matches dbgym_cfg.cur_symlinks_data_path(mkdir=True) / "[pgdata].tgz"

# Create symlink.
# Only link at the end so that the link only ever points to a complete pgdata.
pgdata_tgz_symlink_path = link_result(dbgym_cfg, pgdata_tgz_real_fpath)
dbms_postgres_logger.info(f"Created pgdata in {pgdata_tgz_symlink_path}")


Expand All @@ -156,7 +162,7 @@ def _generic_pgdata_setup(dbgym_cfg: DBGymConfig):
dbgym_pgpass = DBGYM_POSTGRES_PASS
pgport = DEFAULT_POSTGRES_PORT

# create user
# Create user
save_file(dbgym_cfg, pgbin_symlink_dpath / "psql")
subprocess_run(
f"./psql -c \"create user {dbgym_pguser} with superuser password '{dbgym_pgpass}'\" {DEFAULT_POSTGRES_DBNAME} -p {pgport} -h localhost",
Expand All @@ -167,16 +173,16 @@ def _generic_pgdata_setup(dbgym_cfg: DBGymConfig):
cwd=pgbin_symlink_dpath,
)

# load shared preload libraries
shared_preload_libraries_fpath = (
dbgym_cfg.cur_source_path() / "shared_preload_libraries.sql"
)
subprocess_run(
f"./psql -f {shared_preload_libraries_fpath} {DEFAULT_POSTGRES_DBNAME} -p {pgport} -h localhost",
cwd=pgbin_symlink_dpath,
)
# Load shared preload libraries
if SHARED_PRELOAD_LIBRARIES:
subprocess_run(
# You have to use TO and you can't put single quotes around the libraries (https://postgrespro.com/list/thread-id/2580120)
# The method I wrote here works for both one library and multiple libraries
f"./psql -c \"ALTER SYSTEM SET shared_preload_libraries TO {SHARED_PRELOAD_LIBRARIES};\" {DEFAULT_POSTGRES_DBNAME} -p {pgport} -h localhost",
cwd=pgbin_symlink_dpath,
)

# create the dbgym database. since one pgdata dir maps to one benchmark, all benchmarks will use the same database
# Create the dbgym database. Since one pgdata dir maps to one benchmark, all benchmarks will use the same database
# as opposed to using databases named after the benchmark.
subprocess_run(
f"./psql -c \"create database {DBGYM_POSTGRES_DBNAME} with owner = '{dbgym_pguser}'\" {DEFAULT_POSTGRES_DBNAME} -p {pgport} -h localhost",
Expand Down
11 changes: 11 additions & 0 deletions dbms/postgres/default_boot_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Macro accelerator
intelligent_cache: true

# Micro accelerator
early_stop: true
seq_sample: true
seq_sample_pct: 50
seq_sample_seed: 15721
mu_hyp_opt: 0.01
mu_hyp_time: 100000
mu_hyp_stdev: 1.0
1 change: 0 additions & 1 deletion dbms/postgres/shared_preload_libraries.sql

This file was deleted.

3 changes: 3 additions & 0 deletions dependency/apt_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@ flex
libreadline-dev
rpm
zlib1g-dev
cbindgen
redis-server
redis-tools
1 change: 1 addition & 0 deletions dependency/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -123,3 +123,4 @@ Werkzeug==3.0.1
wrapt==1.14.1
zipp==3.17.0
ssd_checker==1.0.3
redis==5.0.3
2 changes: 2 additions & 0 deletions dependency/rust.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/bin/bash
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
12 changes: 9 additions & 3 deletions experiments/protox_tpch_sf10/main.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,25 @@
set -euxo pipefail

SCALE_FACTOR=10
INTENDED_PGDATA_HARDWARE=ssd
PGDATA_PARENT_DPATH=/mnt/nvme1n1/phw2/dbgym_tmp/

# Space for testing. Uncomment the lines below to run individual commands from the script (copy-pasting them into a shell is harder because they rely on envvars).
python3 task.py --no-startup-check tune protox agent tune tpch --scale-factor $SCALE_FACTOR --enable-boot-during-tune
exit 0

# benchmark
python3 task.py --no-startup-check benchmark tpch data $SCALE_FACTOR
python3 task.py --no-startup-check benchmark tpch workload --scale-factor $SCALE_FACTOR

# postgres
python3 task.py --no-startup-check dbms postgres build
python3 task.py --no-startup-check dbms postgres pgdata tpch --scale-factor $SCALE_FACTOR
python3 task.py --no-startup-check dbms postgres pgdata tpch --scale-factor $SCALE_FACTOR --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH

# embedding
python3 task.py --no-startup-check tune protox embedding datagen tpch --scale-factor $SCALE_FACTOR --override-sample-limits "lineitem,32768" --intended-pgdata-hardware ssd --pgdata-parent-dpath /mnt/nvme1n1/phw2/dbgym_tmp/
python3 task.py --no-startup-check tune protox embedding datagen tpch --scale-factor $SCALE_FACTOR --override-sample-limits "lineitem,32768" --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH
python3 task.py --no-startup-check tune protox embedding train tpch --scale-factor $SCALE_FACTOR --train-max-concurrent 10

# agent
python3 task.py --no-startup-check tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --max-concurrent 4 --duration 4 --intended-pgdata-hardware ssd --pgdata-parent-dpath /mnt/nvme1n1/phw2/dbgym_tmp/
python3 task.py --no-startup-check tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --max-concurrent 4 --duration 4 --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH --enable-boot-during-hpo
python3 task.py --no-startup-check tune protox agent tune tpch --scale-factor $SCALE_FACTOR
Loading
Loading