Skip to content

Commit

Permalink
Automated workspace pruning/cleaning (#35)
Browse files Browse the repository at this point in the history
**Summary**: A command to find and remove unneeded files from the
workspace directory.

**Demo**:
All 25 unit tests pass. They cover many edge cases: each test creates
files/directories, calls `clean_workspace()`, and then verifies the
workspace's contents.
![Screenshot 2024-07-07 at 17 05
07](https://github.com/cmu-db/dbgym/assets/20631215/ed2edeae-062a-40ae-b38e-e2ad3718d6b3)

**Details**
* "Aggressive" mode removes all task_runs/\*/ directories that are not
directly pointed to by a symlink in symlinks/.
* "Safe" mode also keeps task_runs/\*/ directories which are indirectly
pointed to by a symlink. This can happen if a symlink points to a
task_runs/\*/ directory which has a symlink in it that points to another
task_runs/*/ directory.
* I chose to write so many unit tests because this operation must be
bug-free.
  • Loading branch information
wangpatrick57 committed Jul 15, 2024
1 parent 2f17bd4 commit 3245aab
Show file tree
Hide file tree
Showing 251 changed files with 1,030 additions and 130 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
__pycache__/
.conda/
.idea/
test_clean_scratchspace/

workspace/
default_*_benchbase_config_*.xml
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ These steps were tested on a fresh repository clone, Ubuntu 22.04.

```
# Setup dependencies.
# You may want to create a Python virtual environment (e.g. with conda) before doing this.
# You may want to create a Python 3.10 virtual environment (e.g. with conda) before doing this.
./dependency/install_dependencies.sh
# Compile a custom fork of PostgreSQL, load TPC-H (SF 0.01), train the Proto-X agent, and tune.
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
2 changes: 2 additions & 0 deletions dependencies/rust.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/bin/bash
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
File renamed without changes.
217 changes: 217 additions & 0 deletions manage/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
import shutil
from typing import List, Set
import click
import yaml
import logging
from pathlib import Path
from misc.utils import DBGymConfig, is_child_path, parent_dpath_of_path
from itertools import chain
import os


# Module-wide logger used by the "manage" commands to report which config file was
# read/updated and how many files a workspace clean removed. INFO is enabled here so
# those messages are not filtered out by this logger's level.
task_logger = logging.getLogger("task")
task_logger.setLevel(logging.INFO)


# Root click group named "manage". It has no behavior of its own; the subcommands defined
# in this file are attached to it with manage_group.add_command(...) at the bottom of the file.
@click.group(name="manage")
def manage_group():
    # Intentionally empty: the group only exists as a container for subcommands.
    pass


@click.command(name="show")
@click.argument("keys", nargs=-1)
@click.pass_obj
def manage_show(dbgym_cfg, keys):
    # Print the config value reached by following `keys` (zero or more nested YAML keys),
    # then log which config file was read.
    # NOTE: documented with comments rather than a docstring because click would surface a
    # docstring as this command's --help text.
    config_path = dbgym_cfg.path

    # Descend one level of the YAML per key; with no keys the whole document is selected.
    selected = dbgym_cfg.yaml
    for key in keys:
        selected = selected[key]

    if type(selected) == dict:
        # Mappings are rendered as block-style YAML. When the mapping is nested (at least
        # one key was given) every line is indented, and trailing whitespace is trimmed.
        rendered = yaml.dump(selected, default_flow_style=False)
        if len(keys) > 0:
            rendered = " " + rendered.replace("\n", "\n ")
        output_str = rendered.rstrip()
    else:
        # Anything that is not a dict is printed as-is.
        output_str = selected
    print(output_str)

    task_logger.info(f"Read: {Path(config_path)}")


@click.command(name="write")
@click.argument("keys", nargs=-1)
@click.argument("value_type")
@click.argument("value")
@click.pass_obj
def manage_write(dbgym_cfg, keys, value_type, value):
    # Overwrite one non-dict value in the config YAML and write the file back.
    #
    #   keys:       path of nested YAML keys; the last key names the value to replace.
    #   value_type: name of a builtin callable (e.g. "int", "float", "str") used to
    #               convert `value`.
    #   value:      the new value, as the string received from the command line.
    #
    # NOTE: documented with comments rather than a docstring because click would surface a
    # docstring as this command's --help text.

    # `__builtins__` is only the builtins *module* inside __main__; in an imported module it
    # is normally that module's __dict__, on which getattr(..., "int") raises AttributeError.
    # The `builtins` module behaves the same way everywhere.
    import builtins

    assert len(keys) > 0, "At least one key is required to locate the value to write."

    config_path = dbgym_cfg.path
    config_yaml = dbgym_cfg.yaml

    # Traverse the YAML down to the mapping that holds the target key.
    root_yaml = config_yaml
    for key in keys[:-1]:
        config_yaml = config_yaml[key]

    # Only leaf values may be replaced; refuse to overwrite a whole sub-mapping.
    leaf_key = keys[-1]
    assert not isinstance(config_yaml[leaf_key], dict), f"{leaf_key} is a mapping, not a single value."

    # NOTE(review): value_type can name *any* builtin callable (including eval/exec). That is
    # tolerable only because the arguments come from the person running the CLI; do not feed
    # this command untrusted input.
    config_yaml[leaf_key] = getattr(builtins, value_type)(value)

    # Serialize from the root so the whole document, not just the modified branch, is written.
    new_yaml = yaml.dump(root_yaml, default_flow_style=False).rstrip()
    Path(config_path).write_text(new_yaml)

    task_logger.info(f"Updated: {Path(config_path)}")


@click.command(name="standardize")
@click.pass_obj
def manage_standardize(dbgym_cfg):
    # Rewrite the config file from its already-parsed YAML so that it ends up in
    # yaml.dump's block-style formatting, without a trailing newline.
    # NOTE: documented with comments rather than a docstring because click would surface a
    # docstring as this command's --help text.
    config_path = dbgym_cfg.path
    normalized_text = yaml.dump(dbgym_cfg.yaml, default_flow_style=False).rstrip()
    Path(config_path).write_text(normalized_text)

    task_logger.info(f"Updated: {Path(config_path)}")


@click.command("clean")
@click.pass_obj
@click.option(
    "--mode",
    type=click.Choice(["safe", "aggressive"]),
    default="safe",
    help=(
        'The mode to clean the workspace (default="safe"). '
        '"aggressive" means "only keep run_*/ folders referenced by a file in symlinks/". '
        '"safe" means "in addition to that, recursively keep any run_*/ folders '
        'referenced by any symlinks in run_*/ folders we are keeping."'
    ),
)
def manage_clean(dbgym_cfg: DBGymConfig, mode: str):
    # CLI entry point for workspace cleaning: forwards the chosen mode to clean_workspace()
    # and always asks it to log a summary (verbose=True).
    # NOTE: documented with comments rather than a docstring because click would surface a
    # docstring as this command's --help text.
    clean_workspace(dbgym_cfg, mode=mode, verbose=True)


@click.command("count")
@click.pass_obj
def manage_count(dbgym_cfg: DBGymConfig):
    # Print how many entries _count_files_in_workspace() finds under the workspace directory.
    # NOTE: documented with comments rather than a docstring because click would surface a
    # docstring as this command's --help text.
    total_entries = _count_files_in_workspace(dbgym_cfg)
    summary = f"The workspace ({dbgym_cfg.dbgym_workspace_path}) has {total_entries} total files/dirs/symlinks."
    print(summary)


def add_symlinks_in_dpath(symlinks_stack: List[Path], root_dpath: Path, processed_symlinks: Set[Path]) -> None:
    """
    Find every symlink located anywhere under root_dpath and queue the ones not seen before.

    Both symlinks_stack and processed_symlinks are modified in place: each newly discovered
    symlink path is appended to symlinks_stack and recorded in processed_symlinks, so a later
    call will not queue it again. Nothing is returned.
    """
    for current_dir, subdir_names, other_names in os.walk(root_dpath):
        parent = Path(current_dir)
        # A symlink may be reported among the directory names (when it points to a directory)
        # or among the file names, so both lists have to be inspected.
        for entry_name in chain(subdir_names, other_names):
            candidate = parent / entry_name
            if not candidate.is_symlink():
                continue
            if candidate in processed_symlinks:
                continue
            processed_symlinks.add(candidate)
            symlinks_stack.append(candidate)


def _count_files_in_workspace(dbgym_cfg: DBGymConfig) -> int:
    """
    Count every entry (regular file, directory, or symlink) found under the workspace directory.

    The workspace directory itself is not counted. A symlink is counted exactly once, as a
    single entry, whether it points to a file or to a directory; whatever a symlinked
    directory points to is never descended into or counted.
    """
    total_count = 0
    # With followlinks=False, os.walk still *lists* symlinks that point to directories in
    # dirnames but does not walk into them. So no manual pruning is needed to avoid
    # following links, and pruning them before counting (as this function used to do) would
    # wrongly leave directory symlinks out of the total.
    for _dirpath, dirnames, filenames in os.walk(dbgym_cfg.dbgym_workspace_path, followlinks=False):
        total_count += len(dirnames) + len(filenames)

    return total_count


def clean_workspace(dbgym_cfg: DBGymConfig, mode: str="safe", verbose=False) -> None:
    """
    Clean all [workspace]/task_runs/run_*/ directories that are not referenced by any "active symlinks".
    If mode is "aggressive", "active symlinks" means *only* the symlinks directly in [workspace]/symlinks/.
    If mode is "safe", "active symlinks" means the symlinks directly in [workspace]/symlinks/ as well as
        any symlinks referenced in task_runs/run_*/ directories we have already decided to keep.

    Args:
        dbgym_cfg: Provides dbgym_symlinks_path, dbgym_runs_path and (for the file counts)
            dbgym_workspace_path.
        mode: Either "safe" or "aggressive" (see above).
        verbose: If true, log how many files were removed and the before/after totals.

    Raises:
        ValueError: If mode is not "safe" or "aggressive". Without this check a misspelled
            mode would silently behave like "aggressive" and delete more than intended.
        AssertionError: If a processed symlink points to another symlink (or otherwise does
            not point directly at its fully resolved target), which is currently disallowed.
    """
    if mode not in ("safe", "aggressive"):
        raise ValueError(f"mode must be \"safe\" or \"aggressive\", but got {mode!r}")

    # This stack holds the symlinks that are left to be processed
    symlink_fpaths_to_process: List[Path] = []
    # This set holds the symlinks that have already been processed to avoid infinite loops
    processed_symlinks: Set[Path] = set()

    # 1. Initialize paths to process
    if dbgym_cfg.dbgym_symlinks_path.exists():
        add_symlinks_in_dpath(symlink_fpaths_to_process, dbgym_cfg.dbgym_symlinks_path, processed_symlinks)

    # 2. Go through symlinks, figuring out which "children of task runs" to keep
    # Based on the rules of the framework, "children of task runs" should be run_*/ directories.
    # However, the user's workspace might happen to break these rules by putting directories not
    # named "run_*/" or files directly in task_runs/. Thus, I use the term "task_run_child_fordpaths"
    # instead of "run_dpaths".
    task_run_child_fordpaths_to_keep: Set[Path] = set()

    if dbgym_cfg.dbgym_runs_path.exists():
        while symlink_fpaths_to_process:
            symlink_fpath: Path = symlink_fpaths_to_process.pop()
            assert symlink_fpath.is_symlink()
            # Path.resolve() resolves all layers of symlinks while os.readlink() only resolves one layer.
            # However, os.readlink() literally reads the string contents of the link, which may be a
            # relative path. A relative target is relative to the directory containing the link, so we
            # join it onto that directory and normalize it before comparing it with the resolved path.
            real_fordpath = symlink_fpath.resolve()
            one_layer_resolved_fordpath = os.path.abspath(
                os.path.join(symlink_fpath.parent, os.readlink(symlink_fpath))
            )
            assert str(real_fordpath) == one_layer_resolved_fordpath, f"symlink_fpath ({symlink_fpath}) seems to point to *another* symlink. This is difficult to handle, so it is currently disallowed. Please resolve this situation manually."

            # If the file doesn't exist, we'll just ignore it.
            if not real_fordpath.exists():
                continue
            # We're only trying to figure out which direct children of task_runs/ to save. If the file isn't
            # even a descendant, we don't care about it.
            if not is_child_path(real_fordpath, dbgym_cfg.dbgym_runs_path):
                continue

            assert not os.path.samefile(real_fordpath, dbgym_cfg.dbgym_runs_path)

            # Walk up from the link's target until we reach the direct child of task_runs/ containing it.
            # If the target already is a direct child, the loop body never runs. Technically symlinks are
            # only supposed to point inside task_runs/run_*/[codebase]/[organization]/, but we won't nuke
            # files just because the workspace doesn't follow that rule for some reason.
            task_run_child_fordpath = real_fordpath
            while not os.path.samefile(parent_dpath_of_path(task_run_child_fordpath), dbgym_cfg.dbgym_runs_path):
                task_run_child_fordpath = parent_dpath_of_path(task_run_child_fordpath)
            assert os.path.samefile(parent_dpath_of_path(task_run_child_fordpath), dbgym_cfg.dbgym_runs_path), f"task_run_child_fordpath ({task_run_child_fordpath}) is not a direct child of dbgym_cfg.dbgym_runs_path"

            # A child that is already being kept has already had its symlinks queued (on safe mode),
            # so there is no need to walk it again.
            if task_run_child_fordpath in task_run_child_fordpaths_to_keep:
                continue
            task_run_child_fordpaths_to_keep.add(task_run_child_fordpath)

            # If on safe mode, add symlinks inside the task_run_child_fordpath to be processed
            if mode == "safe":
                add_symlinks_in_dpath(symlink_fpaths_to_process, task_run_child_fordpath, processed_symlinks)

    # 3. Go through all children of task_runs/*, deleting any that we weren't told to keep
    # It's true that symlinks might link outside of task_runs/*. We'll just not care about those
    starting_num_files = _count_files_in_workspace(dbgym_cfg)
    if dbgym_cfg.dbgym_runs_path.exists():
        # The paths we decided to keep were derived from fully resolved symlink targets, while
        # iterdir() yields paths built on dbgym_runs_path exactly as it was given. If dbgym_runs_path
        # has a symlink somewhere in its ancestry the two spellings differ, so we also look up each
        # child under the resolved task_runs/ path. This can only cause *more* to be kept.
        resolved_runs_dpath = dbgym_cfg.dbgym_runs_path.resolve()
        # Take a snapshot of the listing since we delete entries while going through it.
        for child_fordpath in list(dbgym_cfg.dbgym_runs_path.iterdir()):
            if child_fordpath in task_run_child_fordpaths_to_keep:
                continue
            if resolved_runs_dpath / child_fordpath.name in task_run_child_fordpaths_to_keep:
                continue

            # shutil.rmtree() refuses to operate on a symlink, and is_dir() follows symlinks, so a
            # symlink to a directory has to be removed like a file (only the link itself is removed).
            if child_fordpath.is_dir() and not child_fordpath.is_symlink():
                shutil.rmtree(child_fordpath)
            else:
                os.remove(child_fordpath)
    ending_num_files = _count_files_in_workspace(dbgym_cfg)

    if verbose:
        task_logger.info(f"Removed {starting_num_files - ending_num_files} out of {starting_num_files} files")
        task_logger.info(f"Workspace went from {starting_num_files} to {ending_num_files}")


# Attach every subcommand defined in this file to the "manage" group. A command that is not
# registered here is not reachable through the group.
manage_group.add_command(manage_show)
manage_group.add_command(manage_write)
manage_group.add_command(manage_standardize)
manage_group.add_command(manage_clean)
manage_group.add_command(manage_count)
Empty file added manage/tests/__init__.py
Empty file.
Loading

0 comments on commit 3245aab

Please sign in to comment.