feat: update demo (#1031)
dl239 authored Jan 13, 2022
1 parent f54f834 commit 1e643c5
Showing 13 changed files with 148 additions and 306 deletions.
115 changes: 95 additions & 20 deletions demo/README.md
@@ -24,25 +24,101 @@ w2 as (PARTITION BY passenger_count ORDER BY pickup_datetime ROWS_RANGE BETWEEN
```
## 2. Demo with the Cluster Mode
> :warning: Requires Docker engine version >= 18.03
**Start docker**
```bash
docker run -it 4pdosc/openmldb:0.4.0 bash
```
**Initialize environment**
```bash
# Pull the docker and start it
docker run -it 4pdosc/openmldb:0.3.2 bash
# Initialize the environment
./init.sh
```
**Create table**
```bash
# Start the OpenMLDB CLI for the cluster mode
../openmldb/bin/openmldb --zk_cluster=127.0.0.1:2181 --zk_root_path=/openmldb --role=sql_client
```
```sql
# The following commands are executed in the CLI
> CREATE DATABASE demo_db;
> USE demo_db;
> CREATE TABLE t1(id string, vendor_id int, pickup_datetime timestamp, dropoff_datetime timestamp, passenger_count int, pickup_longitude double, pickup_latitude double, dropoff_longitude double, dropoff_latitude double, store_and_fwd_flag string, trip_duration int);
```
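The CLI is the demo's documented path, but for reference the same DDL could presumably also be issued from Python: the pre-commit version of script/predict_server.py (further down in this diff) connects through SQLAlchemy with an `openmldb:///...` URL. The untested sketch below reuses that URL pattern, pointed at demo_db, and assumes the database has already been created in the CLI as above.
```python
# Untested sketch: create t1 through the OpenMLDB SQLAlchemy dialect instead
# of the CLI. The connection-URL format is copied from the pre-commit
# script/predict_server.py; demo_db is assumed to already exist.
import sqlalchemy as db

engine = db.create_engine("openmldb:///demo_db?zk=127.0.0.1:2181&zkPath=/openmldb")
connection = engine.connect()
connection.execute(
    "CREATE TABLE t1(id string, vendor_id int, pickup_datetime timestamp, "
    "dropoff_datetime timestamp, passenger_count int, pickup_longitude double, "
    "pickup_latitude double, dropoff_longitude double, dropoff_latitude double, "
    "store_and_fwd_flag string, trip_duration int);"
)
```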
# Run feature extraction and model training. Feature extraction will read offline data from the local file
python3 train.py ./fe.sql /tmp/model.txt
# Import the data to online database
python3 import.py
**Import offline data to OpenMLDB**
```sql
# The following commands are executed in the CLI
> USE demo_db;
> SET @@execute_mode='offline';
> LOAD DATA INFILE '/work/taxi-trip/data/taxi_tour_table_train_simple.snappy.parquet' INTO TABLE t1 options(format='parquet', header=true, mode='append');
# You can check the job status with the following command
> show jobs;
```
**Run offline feature extraction**
```sql
# The following commands are executed in the CLI
> USE demo_db;
> SET @@execute_mode='offline';
> SELECT trip_duration, passenger_count,
sum(pickup_latitude) OVER w AS vendor_sum_pl,
max(pickup_latitude) OVER w AS vendor_max_pl,
min(pickup_latitude) OVER w AS vendor_min_pl,
avg(pickup_latitude) OVER w AS vendor_avg_pl,
sum(pickup_latitude) OVER w2 AS pc_sum_pl,
max(pickup_latitude) OVER w2 AS pc_max_pl,
min(pickup_latitude) OVER w2 AS pc_min_pl,
avg(pickup_latitude) OVER w2 AS pc_avg_pl,
count(vendor_id) OVER w2 AS pc_cnt,
count(vendor_id) OVER w AS vendor_cnt
FROM t1
WINDOW w AS (PARTITION BY vendor_id ORDER BY pickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW),
w2 AS (PARTITION BY passenger_count ORDER BY pickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW) INTO OUTFILE '/tmp/feature_data';
```
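For readers less familiar with `ROWS_RANGE` windows, the snippet below is a rough pandas analogue of what window `w` computes on toy data, assuming `1d PRECEDING AND CURRENT ROW` means "rows of the same partition whose pickup_datetime falls within the preceding day, up to and including the current row". It is illustrative only; boundary handling at exactly one day may differ from OpenMLDB's.
```python
# Rough pandas analogue of: sum(pickup_latitude) OVER w, where
# w = PARTITION BY vendor_id ORDER BY pickup_datetime
#     ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW
# Toy data; this only illustrates the window semantics, not how OpenMLDB runs it.
import pandas as pd

df = pd.DataFrame({
    "vendor_id": [1, 1, 1, 2],
    "pickup_datetime": pd.to_datetime([
        "2016-01-01 10:00", "2016-01-01 18:00",
        "2016-01-03 09:00", "2016-01-01 12:00"]),
    "pickup_latitude": [40.71, 40.73, 40.75, 40.70],
})

df = df.sort_values("pickup_datetime").set_index("pickup_datetime")
# For each row: sum pickup_latitude over rows with the same vendor_id whose
# pickup_datetime lies within the last day, including the current row.
df["vendor_sum_pl"] = (
    df.groupby("vendor_id")["pickup_latitude"]
      .transform(lambda s: s.rolling("1d").sum())
)
print(df.reset_index())
```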
**Train model**
```bash
python3 train.py /tmp/feature_data /tmp/model.txt
```
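train.py ships inside the image and is not part of this diff, so the following is only a hedged sketch of the kind of script it could be: read the exported feature files, fit a LightGBM regressor on trip_duration, and write the model to the given path. The CSV file layout and training parameters here are assumptions, not the real train.py.
```python
# Hypothetical sketch of a training script in the spirit of train.py
# (the real script is not shown in this commit).
import glob
import sys

import lightgbm as lgb
import pandas as pd

feature_dir, model_out = sys.argv[1], sys.argv[2]

# The offline job writes one or more part files under the output directory;
# reading them as headered CSV is an assumption made for this sketch.
frames = [pd.read_csv(f) for f in glob.glob(feature_dir + "/*.csv")]
df = pd.concat(frames, ignore_index=True)

label = df.pop("trip_duration")
train_set = lgb.Dataset(df, label=label)
booster = lgb.train({"objective": "regression"}, train_set, num_boost_round=100)
booster.save_model(model_out)
```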
**Online SQL deployment**
```sql
# The following commands are executed in the CLI
> USE demo_db;
> SET @@execute_mode='online';
> DEPLOY demo SELECT trip_duration, passenger_count,
sum(pickup_latitude) OVER w AS vendor_sum_pl,
max(pickup_latitude) OVER w AS vendor_max_pl,
min(pickup_latitude) OVER w AS vendor_min_pl,
avg(pickup_latitude) OVER w AS vendor_avg_pl,
sum(pickup_latitude) OVER w2 AS pc_sum_pl,
max(pickup_latitude) OVER w2 AS pc_max_pl,
min(pickup_latitude) OVER w2 AS pc_min_pl,
avg(pickup_latitude) OVER w2 AS pc_avg_pl,
count(vendor_id) OVER w2 AS pc_cnt,
count(vendor_id) OVER w AS vendor_cnt
FROM t1
WINDOW w AS (PARTITION BY vendor_id ORDER BY pickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW),
w2 AS (PARTITION BY passenger_count ORDER BY pickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW);
```
:bulb: Note that:
# Start the HTTP service for inference with OpenMLDB
./start_predict_server.sh ./fe.sql /tmp/model.txt
- The SQL used for the online deployment should be the same as that for offline feature extraction.
- Do not insert or import online data into the referenced tables before deploying.
**Import online data to OpenMLDB**
```sql
# The following commands are executed in the CLI
> USE demo_db;
> SET @@execute_mode='online';
> LOAD DATA INFILE 'file:///work/taxi-trip/data/taxi_tour_table_train_simple.csv' INTO TABLE t1 options(format='csv', header=true, mode='append');
# You can check the job status with the following command
> show jobs;
```
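With online data loaded, the deployment created above can be exercised directly against the APIServer (127.0.0.1:9080 in this demo). The request and response shape below mirrors what the updated script/predict_server.py does later in this diff — it posts `{"input": [row, ...]}` to `/dbs/demo_db/deployments/demo` and reads `result["data"]["data"]` — but the sample row values are made up and the exact response layout should be checked against your server.
```python
# Hedged example: call the 'demo' deployment through the APIServer directly,
# mirroring the request format used by script/predict_server.py.
import json
import requests

url = "http://127.0.0.1:9080/dbs/demo_db/deployments/demo"
# One input row in t1's column order (values are made up for illustration).
row = ["id0376262", 1, 1467302350000, 1467304896000, 2,
       -73.873093, 40.774097, -73.926704, 40.856739, "N", 1]
resp = requests.post(url, json={"input": [row]})
result = json.loads(resp.text)
print(result["data"]["data"])  # one feature row per input row
```
The predict server started in the next step wraps exactly this call and adds the LightGBM scoring on top.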
**Start HTTP service for inference with OpenMLDB**
```bash
./start_predict_server.sh 127.0.0.1:9080 /tmp/model.txt
```
**Run inference with an HTTP request**
```bash
# Run inference with an HTTP request
python3 predict.py
# The following output is expected (the numbers might be slightly different)
@@ -52,7 +128,7 @@ python3 predict.py
---------------predict trip_duration -------------
848.014745715936 s
```
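predict.py is also shipped in the image and not shown in this diff. As a rough, hypothetical stand-in, a client like the one below would POST a single record keyed by t1's column names to the demo predict server, which listens on port 8887 according to predict_server.py; the `/predict` route and the sample values are assumptions.
```python
# Hypothetical stand-in for predict.py (the real script is not in this commit):
# send one taxi-trip record to the local predict server and print its reply.
import requests

record = {
    "id": "id0376262", "vendor_id": 1,
    "pickup_datetime": 1467302350000, "dropoff_datetime": 1467304896000,
    "passenger_count": 2,
    "pickup_longitude": -73.873093, "pickup_latitude": 40.774097,
    "dropoff_longitude": -73.926704, "dropoff_latitude": 40.856739,
    "store_and_fwd_flag": "N", "trip_duration": 1,
}
# predict_server.py listens on port 8887; the "/predict" path is assumed.
resp = requests.post("http://127.0.0.1:8887/predict", json=record)
print(resp.text)
```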
:bulb: To read more details about the cluster mode, please refer to the [QuickStart (Cluster Mode)](https://github.com/4paradigm/OpenMLDB/blob/main/docs/en/cluster.md)
:bulb: To read more details about the cluster mode, please refer to the [QuickStart (Cluster Mode)](https://docs.openmldb.ai/content-1/openmldb_quickstart)
## 3. Demo with the Standalone Mode
@@ -61,14 +137,14 @@ python3 predict.py
**Start docker**
```bash
docker run -it 4pdosc/openmldb:0.3.2 bash
docker run -it 4pdosc/openmldb:0.4.0 bash
```
**Initialize environment**
```bash
./init.sh standalone
```
**Create table and import the data to OpenMLDB**
```bash
# Start the OpenMLDB CLI for the standalone mode
@@ -78,14 +154,13 @@ docker run -it 4pdosc/openmldb:0.3.2 bash
# The following commands are executed in the CLI
> CREATE DATABASE demo_db;
> USE demo_db;
> CREATE TABLE t1(id string, vendor_id int, pickup_datetime timestamp, dropoff_datetime timestamp, passenger_count int, pickup_longitude double, pickup_latitude double, dropoff_longitude double, dropoff_latitude double, store_and_fwd_flag string, trip_duration int, INDEX(ts=pickup_datetime));
> CREATE TABLE t1(id string, vendor_id int, pickup_datetime timestamp, dropoff_datetime timestamp, passenger_count int, pickup_longitude double, pickup_latitude double, dropoff_longitude double, dropoff_latitude double, store_and_fwd_flag string, trip_duration int);
> LOAD DATA INFILE './data/taxi_tour.csv' INTO TABLE t1;
```
**Run offline feature extraction**
```sql
# The following commands are executed in the CLI
> SET PERFORMANCE_SENSITIVE = false;
> SELECT trip_duration, passenger_count,
sum(pickup_latitude) OVER w AS vendor_sum_pl,
max(pickup_latitude) OVER w AS vendor_max_pl,
@@ -105,7 +180,7 @@ w2 AS (PARTITION BY passenger_count ORDER BY pickup_datetime ROWS_RANGE BETWEEN
**Train model**
```bash
python3 train_s.py /tmp/feature.csv /tmp/model.txt
python3 train.py /tmp/feature.csv /tmp/model.txt
```
**Online SQL deployment**
@@ -140,7 +215,7 @@ w2 AS (PARTITION BY passenger_count ORDER BY pickup_datetime ROWS_RANGE BETWEEN
**Start HTTP service for inference with OpenMLDB**
```bash
./start_predict_server.sh /tmp/model.txt
./start_predict_server.sh 127.0.0.1:8080 /tmp/model.txt
```
**Run inference with an HTTP request**
@@ -155,5 +230,5 @@ python3 predict.py
880.3688347542294 s
```
:bulb: To read more details about the standalone mode, please refer to the [QuickStart (Standalone Mode)](https://github.com/4paradigm/OpenMLDB/blob/main/docs/en/standalone.md)
:bulb: To read more details about the standalone mode, please refer to the [QuickStart (Standalone Mode)](https://docs.openmldb.ai/content-1/openmldb_quickstart)
3 changes: 2 additions & 1 deletion demo/predict-taxi-trip-duration-nb/Dockerfile
@@ -3,13 +3,14 @@ FROM openjdk:11.0.13-jre-slim-bullseye
LABEL org.opencontainers.image.source https://github.com/4paradigm/OpenMLDB

RUN apt-get update \
&& apt-get install -y --no-install-recommends libgomp1 curl binutils procps python3 python3-pip python3-numpy \
&& apt-get install -y --no-install-recommends libgomp1 curl binutils procps python3 python3-pip python3-numpy vim \
&& rm -rf /var/lib/apt/lists/* \
&& pip install --no-cache-dir py4j==0.10.9 numpy lightgbm tornado requests pandas openmldb

COPY script /work/taxi-trip/

ENV LANG=en_US.UTF-8
ENV SPARK_HOME=/work/openmldb/spark-3.0.0-bin-openmldbspark
ARG OPENMLDB_VERSION=0.3.0

COPY setup_openmldb.sh /
67 changes: 0 additions & 67 deletions demo/predict-taxi-trip-duration-nb/script/import.py

This file was deleted.

19 changes: 3 additions & 16 deletions demo/predict-taxi-trip-duration-nb/script/init.sh
@@ -20,29 +20,16 @@ MODE="cluster"
if [ $# -gt 0 ]; then
MODE=$1
fi
pkill mon
pkill python3
rm -rf /tmp/*
cd /work/openmldb && rm -rf logs* && rm -rf db*
sleep 2
if [[ "$MODE" = "standalone" ]]; then
sed -i "s/.*zk_cluster=.*/#--zk_cluster=127.0.0.1:2181/g" /work/openmldb/conf/nameserver.flags
sed -i "s/.*zk_root_path=.*/#--zk_root_path=\/openmldb/g" /work/openmldb/conf/nameserver.flags
sed -i "s/.*zk_cluster=.*/#--zk_cluster=127.0.0.1:2181/g" /work/openmldb/conf/tablet.flags
sed -i "s/.*zk_root_path=.*/#--zk_root_path=\/openmldb/g" /work/openmldb/conf/tablet.flags
sed -i "s/.*zk_cluster=.*/#--zk_cluster=127.0.0.1:2181/g" /work/openmldb/conf/apiserver.flags
sed -i "s/.*zk_root_path=.*/#--zk_root_path=\/openmldb/g" /work/openmldb/conf/apiserver.flags
python3 convert_data.py < data/taxi_tour_table_train_simple.csv > ./data/taxi_tour.csv
cd /work/openmldb && sh bin/start-all.sh
cd /work/openmldb && ./bin/stop-standalone.sh && ./bin/start-standalone.sh
sleep 1
else
sed -i "s/.*zk_cluster=.*/--zk_cluster=127.0.0.1:2181/g" /work/openmldb/conf/nameserver.flags
sed -i "s/.*zk_root_path=.*/--zk_root_path=\/openmldb/g" /work/openmldb/conf/nameserver.flags
sed -i "s/.*zk_cluster=.*/--zk_cluster=127.0.0.1:2181/g" /work/openmldb/conf/tablet.flags
sed -i "s/.*zk_root_path=.*/--zk_root_path=\/openmldb/g" /work/openmldb/conf/tablet.flags
cd /work/zookeeper-3.4.14 && ./bin/zkServer.sh restart
sleep 1
cd /work/openmldb && ./bin/start.sh start tablet
sleep 1
cd /work/openmldb && ./bin/start.sh start nameserver
sleep 1
cd /work/openmldb && ./bin/stop-all.sh && ./bin/start-all.sh
fi
26 changes: 14 additions & 12 deletions demo/predict-taxi-trip-duration-nb/script/predict_server.py
@@ -20,14 +20,11 @@
import json
import lightgbm as lgb
import sqlalchemy as db
import requests
import argparse

sql = ""
bst = None

engine = db.create_engine('openmldb:///db_test?zk=127.0.0.1:2181&zkPath=/openmldb')
connection = engine.connect()

table_schema = [
("id", "string"),
("vendor_id", "int"),
@@ -42,6 +39,8 @@
("trip_duration", "int"),
]

url = ""

def get_schema():
dict_schema = {}
for i in table_schema:
@@ -64,15 +63,19 @@ class PredictHandler(tornado.web.RequestHandler):
def post(self):
row = json.loads(self.request.body)
data = {}
data["input"] = []
row_data = []
for i in table_schema:
if i[1] == "string":
data[i[0]] = row.get(i[0], "")
row_data.append(row.get(i[0], ""))
elif i[1] == "int" or i[1] == "double" or i[1] == "timestamp" or i[1] == "bigint":
data[i[0]] = row.get(i[0], 0)
row_data.append(row.get(i[0], 0))
else:
data[i[0]] = None
rs = connection.execute(sql, data)
for r in rs:
row_data.append(None)
data["input"].append(row_data)
rs = requests.post(url, json=data)
result = json.loads(rs.text)
for r in result["data"]["data"]:
ins = build_feature(r)
self.write("----------------ins---------------\n")
self.write(str(ins) + "\n")
@@ -93,11 +96,10 @@ def make_app():

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("sql_file", help="specify the sql file")
parser.add_argument("endpoint", help="specify the endpoint of apiserver")
parser.add_argument("model_path", help="specify the model path")
args = parser.parse_args()
with open(args.sql_file, "r") as fd:
sql = fd.read()
url = "http://%s/dbs/demo_db/deployments/demo" % args.endpoint
bst = lgb.Booster(model_file=args.model_path)
app = make_app()
app.listen(8887)
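To summarize the change above: the handler no longer executes SQL through a SQLAlchemy connection; it flattens the incoming JSON record into a list ordered by `table_schema` and forwards it to the APIServer deployment endpoint. A self-contained distillation of that flattening step is sketched below; the columns hidden behind the collapsed hunk are reconstructed from the README's CREATE TABLE statement, so treat the middle of the list as an assumption.
```python
# Standalone distillation of the row-building loop in PredictHandler.post:
# turn a JSON record into a list ordered by table_schema, using "" / 0 / None
# as defaults depending on the column type, then wrap it as {"input": [row]}.
table_schema = [
    ("id", "string"), ("vendor_id", "int"),
    ("pickup_datetime", "timestamp"), ("dropoff_datetime", "timestamp"),
    ("passenger_count", "int"),
    ("pickup_longitude", "double"), ("pickup_latitude", "double"),
    ("dropoff_longitude", "double"), ("dropoff_latitude", "double"),
    ("store_and_fwd_flag", "string"), ("trip_duration", "int"),
]

def build_request_body(record):
    row = []
    for name, col_type in table_schema:
        if col_type == "string":
            row.append(record.get(name, ""))
        elif col_type in ("int", "double", "timestamp", "bigint"):
            row.append(record.get(name, 0))
        else:
            row.append(None)
    return {"input": [row]}

print(build_request_body({"vendor_id": 1, "passenger_count": 2}))
```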