-
Notifications
You must be signed in to change notification settings - Fork 1.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1112 from lzjpaul/23-10-24-v410
Add the implementation for the model selection example
- Loading branch information
Showing
184 changed files
with
18,402 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
58 changes: 58 additions & 0 deletions
58
examples/model_selection/TRAILS-Database-Native-Model-Selection/Dockerfile
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
# Base image for the TRAILS model-selection example. | ||
FROM ubuntu:20.04 | ||
|
||
# Keep apt/debconf from asking interactive questions during the build. | ||
ENV DEBIAN_FRONTEND=noninteractive | ||
|
||
# Install base tooling (software-properties-common, wget, gnupg2, lsb-release, git), | ||
# register the deadsnakes PPA, then install Python 3.6, pip and Vim. | ||
# The apt cache and package lists are removed afterwards to keep the layer small. | ||
# NOTE(review): the SINGA wheel installed later in this file is a cp38 build, | ||
# so python3.6 may be unused -- confirm whether it is still needed. | ||
RUN apt-get update && \ | ||
apt-get install -y software-properties-common wget gnupg2 lsb-release git && \ | ||
add-apt-repository ppa:deadsnakes/ppa && \ | ||
apt-get install -y python3.6 python3-pip vim && \ | ||
apt-get clean && \ | ||
rm -rf /var/lib/apt/lists/* | ||
|
||
# Install build dependencies for PostgreSQL and Rust | ||
# (pkg-config, OpenSSL / libpq / libclang development files, curl). | ||
RUN apt-get update && \ | ||
apt-get install -y pkg-config libssl-dev libpq-dev libclang-dev curl && \ | ||
apt-get clean && \ | ||
rm -rf /var/lib/apt/lists/* | ||
|
||
# Install the extra build dependencies for pgrx (bison, flex, libreadline-dev). | ||
RUN apt-get update && \ | ||
apt-get install -y bison flex libreadline-dev && \ | ||
apt-get clean && \ | ||
rm -rf /var/lib/apt/lists/* | ||
|
||
# Create a password-less 'postgres' user, add it to the 'sudo' group, | ||
# and create /project owned by that user. | ||
USER root | ||
RUN adduser --disabled-password --gecos "" postgres && \ | ||
mkdir /project && \ | ||
adduser postgres sudo && \ | ||
chown -R postgres:postgres /project | ||
|
||
# Switch to the postgres user, install Rust with rustup, then install and | ||
# initialise cargo-pgrx (pinned to 0.9.7). Everything lands in the postgres | ||
# user's home directory ($HOME/.cargo, $HOME/.pgrx). | ||
USER postgres | ||
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y && \ | ||
echo 'source $HOME/.cargo/env' >> $HOME/.bashrc && \ | ||
/bin/bash -c "source $HOME/.cargo/env && cargo install cargo-pgrx --version '0.9.7' --locked" && \ | ||
/bin/bash -c "source $HOME/.cargo/env && cargo pgrx init" | ||
|
||
# Set environment variables for Rust and Python. | ||
# Rust was installed by the postgres user above, so cargo's binaries live in | ||
# /home/postgres/.cargo/bin; /root/.cargo/bin does not contain them. | ||
ENV PATH="/home/postgres/.cargo/bin:${PATH}" | ||
ENV PYTHONPATH="${PYTHONPATH}:/project/TRAILS/internal/ml/model_selection" | ||
|
||
# Install the Python requirements of the model-selection code. | ||
# The COPY source path is resolved relative to the docker build context. | ||
WORKDIR /project | ||
COPY ./internal/ml/model_selection/requirement.txt ./requirement.txt | ||
RUN pip install -r requirement.txt | ||
|
||
# Install the prebuilt SINGA 3.1.0 wheel (cp38, manylinux2014 x86_64) from its URL. | ||
RUN pip install https://www.comp.nus.edu.sg/~zhaojing/files/singa-3.1.0-cp38-cp38-manylinux2014_x86_64.whl | ||
|
||
# Install the PostgreSQL client tools; apt needs root, so switch user temporarily. | ||
USER root | ||
RUN apt-get update && apt-get install -y \ | ||
postgresql-client && \ | ||
apt-get clean && \ | ||
rm -rf /var/lib/apt/lists/* | ||
|
||
# Drop back to the unprivileged postgres user for the running container. | ||
USER postgres | ||
|
||
# Keep the container alive so it can be entered with `docker exec`. | ||
CMD ["tail", "-f", "/dev/null"] |
148 changes: 148 additions & 0 deletions
148
examples/model_selection/TRAILS-Database-Native-Model-Selection/README.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,148 @@ | ||
# Database-Native Model Selection | ||
|
||
-- based on Singa | ||
|
||
|
||
|
||
![image-20231020174425377](documents/image-20231020174425377.png) | ||
|
||
## Build Docker Image | ||
|
||
```bash | ||
git clone https://github.com/apache/singa.git | ||
cd singa/examples/model_selection/TRAILS-Database-Native-Model-Selection/ | ||
docker build -t trails-singa . | ||
``` | ||
|
||
## Run Docker Image | ||
Download `exp_data.zip` from https://www.dropbox.com/scl/fi/xz4teosklwmfc5j4x2ug6/exp_data.zip?rlkey=5fk2ttib0zt49suyppcjhsrn2&dl=0 | ||
and unzip it so that the `exp_data/` folder is in a directory of your choice (referred to below as `path_to_exp_data_folder`): | ||
```bash | ||
docker run -d --name trails-singa \ | ||
--network="host" \ | ||
-v path_to_exp_data_folder:/project/exp_data \ | ||
trails-singa | ||
``` | ||
|
||
## Start PostgreSQL Instance | ||
|
||
```bash | ||
# 1. Open a shell inside the running docker container | ||
docker exec -it trails-singa bash | ||
# 2. Clone the code | ||
cd ~ | ||
git clone https://github.com/apache/singa.git | ||
cd singa/examples/model_selection/TRAILS-Database-Native-Model-Selection/ | ||
# 3. Export PYTHONPATH | ||
export PYTHONPATH=$PYTHONPATH:./internal/ml/model_selection | ||
# 4. Start the RDBMS and then exit | ||
cd internal/pg_extension | ||
cargo pgrx run | ||
exit | ||
cd ../.. | ||
# 5. Load data into RDBMS | ||
bash internal/ml/model_selection/scripts/database/load_data_to_db.sh /project/exp_data/data/structure_data/frappe frappe | ||
# 6. Run database server | ||
cd internal/pg_extension | ||
cargo pgrx run | ||
|
||
``` | ||
|
||
|
||
## Register Stored Procedure | ||
|
||
```sql | ||
CREATE OR REPLACE | ||
PROCEDURE model_selection_sp( | ||
dataset TEXT, --dataset name | ||
selected_columns TEXT[], --used columns | ||
N INTEGER, --number of models to evaluate | ||
batch_size INTEGER, --batch size, for profiling, filtering | ||
config_file TEXT --config file path | ||
) | ||
LANGUAGE plpgsql | ||
AS $$ | ||
DECLARE | ||
-- working variables | ||
projection TEXT; -- comma-separated SELECT list built from selected_columns | ||
filter_sql TEXT; -- dynamically generated filtering query | ||
filter_result TEXT; -- text returned by filtering_phase() | ||
BEGIN | ||
-- Join the requested columns into one SELECT list. | ||
projection := array_to_string(selected_columns, ', '); | ||
|
||
-- Build the filtering query: take batch_size randomly ordered rows from the | ||
-- dataset, serialise them as JSON text and pass them to filtering_phase() | ||
-- with n = N, k = 1 and the given config file. | ||
filter_sql := format(' | ||
WITH batch_rows AS ( | ||
SELECT %s | ||
FROM %I | ||
ORDER BY RANDOM() | ||
LIMIT %s OFFSET 0 | ||
) | ||
SELECT filtering_phase( | ||
json_agg(row_to_json(t))::text, %s, %s, %L | ||
) | ||
FROM batch_rows AS t', projection, dataset, batch_size, N, 1, config_file); | ||
|
||
-- Run it and report what the filtering phase returned. | ||
EXECUTE filter_sql INTO filter_result; | ||
RAISE NOTICE '4. run filtering phase, k models = %', filter_result; | ||
|
||
END; $$; | ||
``` | ||
|
||
## Compile the UDF | ||
|
||
```sql | ||
-- Try to compile the UDF (run this in the database session) | ||
DROP EXTENSION IF EXISTS pg_extension; | ||
CREATE EXTENSION pg_extension; | ||
``` | ||
|
||
If the above fails, open another terminal and enter the container with `docker exec -it trails-singa bash`. | ||
Then run the following: | ||
```bash | ||
rm /home/postgres/.pgrx/14.9/pgrx-install/share/extension/pg_extension--0.1.0.sql | ||
vi /home/postgres/.pgrx/14.9/pgrx-install/share/extension/pg_extension--0.1.0.sql | ||
# Copy the following to the /home/postgres/.pgrx/14.9/pgrx-install/share/extension/pg_extension--0.1.0.sql | ||
-- src/lib.rs:66 | ||
-- pg_extension::filtering_phase | ||
-- SQL declaration of filtering_phase(mini_batch, n, k, config_file) -> TEXT. | ||
-- The implementation is the C symbol filtering_phase_wrapper in the extension's | ||
-- shared library (MODULE_PATHNAME). STRICT means the function is not called and | ||
-- NULL is returned when any argument is NULL. | ||
CREATE FUNCTION "filtering_phase"( | ||
"mini_batch" TEXT, /* alloc::string::String */ | ||
"n" INT, /* i32 */ | ||
"k" INT, /* i32 */ | ||
"config_file" TEXT /* alloc::string::String */ | ||
) RETURNS TEXT /* alloc::string::String */ | ||
IMMUTABLE STRICT PARALLEL SAFE | ||
LANGUAGE c /* Rust */ | ||
AS 'MODULE_PATHNAME', 'filtering_phase_wrapper'; | ||
``` | ||
|
||
Go back to the first terminal and run the following in the database session again: | ||
```sql | ||
-- Try to compile the UDF again | ||
DROP EXTENSION IF EXISTS pg_extension; | ||
CREATE EXTENSION pg_extension; | ||
``` | ||
|
||
## Run Model Selection | ||
|
||
```sql | ||
-- Template for calling 'model_selection_sp' stored procedure | ||
CALL model_selection_sp( | ||
<TABLE_NAME>, -- The name of the table or dataset from which data should be retrieved. | ||
<COLUMN_NAMES_ARRAY>, -- An array of column names to be considered in the model selection process. | ||
<PARAMETER_1>, -- Number of models to explore | ||
<PARAMETER_2>, -- Batch size | ||
<CONFIG_FILE_PATH> -- The file path to a configuration file needed for the process. | ||
); | ||
|
||
|
||
-- For example | ||
CALL model_selection_sp( | ||
'frappe_train', | ||
ARRAY['col1', 'col2', 'col3', 'label'], | ||
10, | ||
32, | ||
'/home/postgres/singa/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/config.ini'); | ||
``` | ||
|
||
## Example Result | ||
|
||
![image-20231020174945226](documents/image-20231020174945226.png) |
Oops, something went wrong.