From f820695330564f33d94317bb4d1a503697f53233 Mon Sep 17 00:00:00 2001 From: zhaojing Date: Fri, 27 Oct 2023 17:43:48 +0800 Subject: [PATCH] update the license file and readme --- LICENSE | 55 -- examples/armnet/README.md | 24 - examples/mlp_postgresql/README.md | 23 - .../README.md | 5 +- .../pg_extension/sql/pg_extension--0.1.0.sql | 160 ----- examples/model_selection_psql/README.md | 22 - examples/model_selection_psql/ms_mlp/run.sh | 26 - .../model_selection_psql/ms_mlp/train_cnn.py | 329 ---------- .../model_selection_psql/ms_mlp/train_mlp.py | 588 ------------------ .../model_selection_psql/ms_mlp/train_mpi.py | 91 --- .../ms_mlp/train_ms_model.py | 584 ----------------- .../ms_mlp/train_multiprocess.py | 111 ---- .../ms_model_mlp/model.py | 224 ------- .../ms_model_mlp/native.py | 137 ---- examples/model_selection_psql/msmlp/model.py | 209 ------- examples/model_selection_psql/msmlp/native.py | 137 ---- .../pkg_model_code/model.py | 366 ----------- java/pom.xml | 12 + 18 files changed, 16 insertions(+), 3087 deletions(-) delete mode 100644 examples/armnet/README.md delete mode 100644 examples/mlp_postgresql/README.md delete mode 100644 examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/sql/pg_extension--0.1.0.sql delete mode 100644 examples/model_selection_psql/README.md delete mode 100644 examples/model_selection_psql/ms_mlp/run.sh delete mode 100644 examples/model_selection_psql/ms_mlp/train_cnn.py delete mode 100644 examples/model_selection_psql/ms_mlp/train_mlp.py delete mode 100644 examples/model_selection_psql/ms_mlp/train_mpi.py delete mode 100644 examples/model_selection_psql/ms_mlp/train_ms_model.py delete mode 100644 examples/model_selection_psql/ms_mlp/train_multiprocess.py delete mode 100644 examples/model_selection_psql/ms_model_mlp/model.py delete mode 100644 examples/model_selection_psql/ms_model_mlp/native.py delete mode 100644 examples/model_selection_psql/msmlp/model.py delete mode 100644 examples/model_selection_psql/msmlp/native.py delete mode 100644 examples/model_selection_psql/pkg_model_code/model.py diff --git a/LICENSE b/LICENSE index 9c7ffb4751..1ccf06e13f 100644 --- a/LICENSE +++ b/LICENSE @@ -560,58 +560,3 @@ OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -=============================================================================== -SINGA bundles the following under MIT License: -examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/* - -MIT License - -Portions Copyright 2019-2021 ZomboDB, LLC. -Portions Copyright 2021-2023 Technology Concepts & Design, Inc. -Portions Copyright 2023 PgCentral Foundation, Inc. - -All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -=============================================================================== -SINGA bundles the following under The PostgreSQL License: -examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/* - -The PostgreSQL License - -Portions Copyright (c) 1996-2023, The PostgreSQL Global Development Group - -Portions Copyright (c) 1994, The Regents of the University of California - -Permission to use, copy, modify, and distribute this software and its documentation for any -purpose, without fee, and without a written agreement is hereby granted, provided that the above -copyright notice and this paragraph and the following two paragraphs appear in all copies. - -IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, -SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING -OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF -THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, -BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, -AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, -UPDATES, ENHANCEMENTS, OR MODIFICATIONS. - diff --git a/examples/armnet/README.md b/examples/armnet/README.md deleted file mode 100644 index 698161fe1b..0000000000 --- a/examples/armnet/README.md +++ /dev/null @@ -1,24 +0,0 @@ - - -## ARM-Net: Adaptive Relation Modeling Network for Structured Data - -![version](https://img.shields.io/badge/version-v3.5-green) -![python](https://img.shields.io/badge/python-3.8.3-blue) -![singa](https://img.shields.io/badge/singa-3.1.0-orange) - -This folder contains our Singa implementation of [ARM-Net: Adaptive Relation Modeling Network for Structured Data](https://dl.acm.org/doi/10.1145/3448016.3457321). diff --git a/examples/mlp_postgresql/README.md b/examples/mlp_postgresql/README.md deleted file mode 100644 index e34f581972..0000000000 --- a/examples/mlp_postgresql/README.md +++ /dev/null @@ -1,23 +0,0 @@ - - -# Multi-layer Perceptron (MLP) on top of PostgreSQL - -Examples inside this folder show how to run MLP models -on top of PostgreSQL. \ No newline at end of file diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/README.md b/examples/model_selection/TRAILS-Database-Native-Model-Selection/README.md index 31cbf703da..59b76c0742 100644 --- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/README.md +++ b/examples/model_selection/TRAILS-Database-Native-Model-Selection/README.md @@ -19,7 +19,7 @@ # Database-Native Model Selection -​ -- based on Singa +​ -- based on SINGA @@ -33,6 +33,9 @@ cd singa/examples/model_selection/TRAILS-Database-Native-Model-Selection/ docker build -t trails-singa . 
```
+Inside the Docker image, PostgreSQL and its extensions are installed as described at https://github.com/pgcentralfoundation/pgrx
+
+
## Run Docker Image
Download exp_data.zip from https://www.dropbox.com/scl/fi/xz4teosklwmfc5j4x2ug6/exp_data.zip?rlkey=5fk2ttib0zt49suyppcjhsrn2&dl=0 and unzip the exp_data/ folder to a specific directory (path_to_exp_data_folder)
diff --git a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/sql/pg_extension--0.1.0.sql b/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/sql/pg_extension--0.1.0.sql
deleted file mode 100644
index 434082d9d5..0000000000
--- a/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/sql/pg_extension--0.1.0.sql
+++ /dev/null
@@ -1,160 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements. See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership. The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied. See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-/*
-This file is auto generated by pgrx.
-
-The ordering of items is not stable, it is driven by a dependency graph.
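-
-For illustration, each wrapper declared below is an ordinary SQL-callable
-function; assuming a config file path that is valid for your installation,
-an invocation from psql might look like:
-
-    SELECT filtering_phase('<mini_batch_json>', 10, 1, '/path/to/config.ini');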
-*/ - --- src/lib.rs:80 --- pg_extension::refinement_phase -CREATE FUNCTION "refinement_phase"( - "config_file" TEXT /* alloc::string::String */ -) RETURNS TEXT /* alloc::string::String */ - IMMUTABLE STRICT PARALLEL SAFE -LANGUAGE c /* Rust */ -AS 'MODULE_PATHNAME', 'refinement_phase_wrapper'; - --- src/lib.rs:31 --- pg_extension::profiling_refinement_phase -CREATE FUNCTION "profiling_refinement_phase"( - "mini_batch" TEXT, /* alloc::string::String */ - "config_file" TEXT /* alloc::string::String */ -) RETURNS TEXT /* alloc::string::String */ - IMMUTABLE STRICT PARALLEL SAFE -LANGUAGE c /* Rust */ -AS 'MODULE_PATHNAME', 'profiling_refinement_phase_wrapper'; - --- src/lib.rs:16 --- pg_extension::profiling_filtering_phase -CREATE FUNCTION "profiling_filtering_phase"( - "mini_batch" TEXT, /* alloc::string::String */ - "config_file" TEXT /* alloc::string::String */ -) RETURNS TEXT /* alloc::string::String */ - IMMUTABLE STRICT PARALLEL SAFE -LANGUAGE c /* Rust */ -AS 'MODULE_PATHNAME', 'profiling_filtering_phase_wrapper'; - --- src/lib.rs:66 --- pg_extension::filtering_phase -CREATE FUNCTION "filtering_phase"( - "mini_batch" TEXT, /* alloc::string::String */ - "n" INT, /* i32 */ - "k" INT, /* i32 */ - "config_file" TEXT /* alloc::string::String */ -) RETURNS TEXT /* alloc::string::String */ - IMMUTABLE STRICT PARALLEL SAFE -LANGUAGE c /* Rust */ -AS 'MODULE_PATHNAME', 'filtering_phase_wrapper'; - --- src/lib.rs:46 --- pg_extension::coordinator -CREATE FUNCTION "coordinator"( - "time_score" TEXT, /* alloc::string::String */ - "time_train" TEXT, /* alloc::string::String */ - "time_budget" TEXT, /* alloc::string::String */ - "only_phase1" bool, /* bool */ - "config_file" TEXT /* alloc::string::String */ -) RETURNS TEXT /* alloc::string::String */ - IMMUTABLE STRICT PARALLEL SAFE -LANGUAGE c /* Rust */ -AS 'MODULE_PATHNAME', 'coordinator_wrapper'; - - --- src/lib.rs:110 --- pg_extension::model_selection_workloads -CREATE FUNCTION "model_selection_workloads"( - "mini_batch" TEXT, /* alloc::string::String */ - "n" INT, /* i32 */ - "k" INT, /* i32 */ - "config_file" TEXT /* alloc::string::String */ -) RETURNS TEXT /* alloc::string::String */ - IMMUTABLE STRICT PARALLEL SAFE -LANGUAGE c /* Rust */ -AS 'MODULE_PATHNAME', 'model_selection_workloads_wrapper'; - --- src/lib.rs:138 --- pg_extension::model_selection_trails_workloads -CREATE FUNCTION "model_selection_trails_workloads"( - "mini_batch" TEXT, /* alloc::string::String */ - "n" INT, /* i32 */ - "k" INT, /* i32 */ - "config_file" TEXT /* alloc::string::String */ -) RETURNS TEXT /* alloc::string::String */ - IMMUTABLE STRICT PARALLEL SAFE -LANGUAGE c /* Rust */ -AS 'MODULE_PATHNAME', 'model_selection_trails_workloads_wrapper'; - --- src/lib.rs:125 --- pg_extension::model_selection_trails -CREATE FUNCTION "model_selection_trails"( - "mini_batch" TEXT, /* alloc::string::String */ - "time_budget" TEXT, /* alloc::string::String */ - "config_file" TEXT /* alloc::string::String */ -) RETURNS TEXT /* alloc::string::String */ - IMMUTABLE STRICT PARALLEL SAFE -LANGUAGE c /* Rust */ -AS 'MODULE_PATHNAME', 'model_selection_trails_wrapper'; - --- src/lib.rs:94 --- pg_extension::model_selection -CREATE FUNCTION "model_selection"( - "mini_batch" TEXT, /* alloc::string::String */ - "time_budget" TEXT, /* alloc::string::String */ - "config_file" TEXT /* alloc::string::String */ -) RETURNS TEXT /* alloc::string::String */ - IMMUTABLE STRICT PARALLEL SAFE -LANGUAGE c /* Rust */ -AS 'MODULE_PATHNAME', 'model_selection_wrapper'; - --- src/lib.rs:153 --- 
pg_extension::benchmark_filtering_phase_latency -CREATE FUNCTION "benchmark_filtering_phase_latency"( - "explore_models" INT, /* i32 */ - "config_file" TEXT /* alloc::string::String */ -) RETURNS TEXT /* alloc::string::String */ - IMMUTABLE STRICT PARALLEL SAFE -LANGUAGE c /* Rust */ -AS 'MODULE_PATHNAME', 'benchmark_filtering_phase_latency_wrapper'; - - --- src/lib.rs:152 --- pg_extension::benchmark_filtering_phase_latency -CREATE FUNCTION "benchmark_filtering_phase_latency"( - "explore_models" INT, /* i32 */ - "config_file" TEXT /* alloc::string::String */ -) RETURNS TEXT /* alloc::string::String */ - IMMUTABLE STRICT PARALLEL SAFE -LANGUAGE c /* Rust */ -AS 'MODULE_PATHNAME', 'benchmark_filtering_phase_latency_wrapper'; - --- src/lib.rs:163 --- pg_extension::benchmark_filtering_latency_in_db -CREATE FUNCTION "benchmark_filtering_latency_in_db"( - "explore_models" INT, /* i32 */ - "dataset" TEXT, /* alloc::string::String */ - "config_file" TEXT /* alloc::string::String */ -) RETURNS TEXT /* alloc::string::String */ - IMMUTABLE STRICT PARALLEL SAFE -LANGUAGE c /* Rust */ -AS 'MODULE_PATHNAME', 'benchmark_filtering_latency_in_db_wrapper'; - diff --git a/examples/model_selection_psql/README.md b/examples/model_selection_psql/README.md deleted file mode 100644 index c78fca5f18..0000000000 --- a/examples/model_selection_psql/README.md +++ /dev/null @@ -1,22 +0,0 @@ - - -# Two-Phase Model Selection on PostgreSQL - -Examples inside this folder show how to select a well-performing model using SInga inside RDBMS, such as PostgreSQL. \ No newline at end of file diff --git a/examples/model_selection_psql/ms_mlp/run.sh b/examples/model_selection_psql/ms_mlp/run.sh deleted file mode 100644 index 5e78f5f2d4..0000000000 --- a/examples/model_selection_psql/ms_mlp/run.sh +++ /dev/null @@ -1,26 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -#!/usr/bin/env python -W ignore::DeprecationWarning - -### Static Models -python train_mlp.py ms_model_mlp mnist - -### Dynamic Models -python train_ms_model.py ms_model_mlp mnist diff --git a/examples/model_selection_psql/ms_mlp/train_cnn.py b/examples/model_selection_psql/ms_mlp/train_cnn.py deleted file mode 100644 index 6eab096b9e..0000000000 --- a/examples/model_selection_psql/ms_mlp/train_cnn.py +++ /dev/null @@ -1,329 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -from singa import singa_wrap as singa -from singa import device -from singa import tensor -from singa import opt -import numpy as np -import time -import argparse -from PIL import Image - -np_dtype = {"float16": np.float16, "float32": np.float32} - -singa_dtype = {"float16": tensor.float16, "float32": tensor.float32} - - -# Data augmentation -def augmentation(x, batch_size): - xpad = np.pad(x, [[0, 0], [0, 0], [4, 4], [4, 4]], 'symmetric') - for data_num in range(0, batch_size): - offset = np.random.randint(8, size=2) - x[data_num, :, :, :] = xpad[data_num, :, - offset[0]:offset[0] + x.shape[2], - offset[1]:offset[1] + x.shape[2]] - if_flip = np.random.randint(2) - if (if_flip): - x[data_num, :, :, :] = x[data_num, :, :, ::-1] - return x - - -# Calculate accuracy -def accuracy(pred, target): - # y is network output to be compared with ground truth (int) - y = np.argmax(pred, axis=1) - a = y == target - correct = np.array(a, "int").sum() - return correct - - -# Data partition according to the rank -def partition(global_rank, world_size, train_x, train_y, val_x, val_y): - # Partition training data - data_per_rank = train_x.shape[0] // world_size - idx_start = global_rank * data_per_rank - idx_end = (global_rank + 1) * data_per_rank - train_x = train_x[idx_start:idx_end] - train_y = train_y[idx_start:idx_end] - - # Partition evaluation data - data_per_rank = val_x.shape[0] // world_size - idx_start = global_rank * data_per_rank - idx_end = (global_rank + 1) * data_per_rank - val_x = val_x[idx_start:idx_end] - val_y = val_y[idx_start:idx_end] - return train_x, train_y, val_x, val_y - - -# Function to all reduce NUMPY accuracy and loss from multiple devices -def reduce_variable(variable, dist_opt, reducer): - reducer.copy_from_numpy(variable) - dist_opt.all_reduce(reducer.data) - dist_opt.wait() - output = tensor.to_numpy(reducer) - return output - - -def resize_dataset(x, image_size): - num_data = x.shape[0] - dim = x.shape[1] - X = np.zeros(shape=(num_data, dim, image_size, image_size), - dtype=np.float32) - for n in range(0, num_data): - for d in range(0, dim): - X[n, d, :, :] = np.array(Image.fromarray(x[n, d, :, :]).resize( - (image_size, image_size), Image.BILINEAR), - dtype=np.float32) - return X - - -def run(global_rank, - world_size, - local_rank, - max_epoch, - batch_size, - model, - data, - sgd, - graph, - verbosity, - dist_option='plain', - spars=None, - precision='float32'): - dev = device.create_cuda_gpu_on(local_rank) # need to change to CPU device for CPU-only machines - dev.SetRandSeed(0) - np.random.seed(0) - - if data == 'cifar10': - from data import cifar10 - train_x, train_y, val_x, val_y = cifar10.load() - elif data == 'cifar100': - from data import cifar100 - train_x, train_y, val_x, val_y = cifar100.load() - elif data == 'mnist': - from data import mnist - train_x, train_y, val_x, val_y = mnist.load() - - - num_channels = train_x.shape[1] - image_size = train_x.shape[2] - data_size = np.prod(train_x.shape[1:train_x.ndim]).item() - num_classes = (np.max(train_y) + 1).item() - - if model == 'resnet': - from model import resnet - model = 
resnet.resnet50(num_channels=num_channels, - num_classes=num_classes) - elif model == 'xceptionnet': - from model import xceptionnet - model = xceptionnet.create_model(num_channels=num_channels, - num_classes=num_classes) - elif model == 'cnn': - from model import cnn - model = cnn.create_model(num_channels=num_channels, - num_classes=num_classes) - elif model == 'alexnet': - from model import alexnet - model = alexnet.create_model(num_channels=num_channels, - num_classes=num_classes) - elif model == 'mlp': - import os, sys, inspect - current = os.path.dirname( - os.path.abspath(inspect.getfile(inspect.currentframe()))) - parent = os.path.dirname(current) - sys.path.insert(0, parent) - from mlp import model - model = model.create_model(data_size=data_size, - num_classes=num_classes) - - # For distributed training, sequential has better performance - if hasattr(sgd, "communicator"): - DIST = True - sequential = True - else: - DIST = False - sequential = False - - if DIST: - train_x, train_y, val_x, val_y = partition(global_rank, world_size, - train_x, train_y, val_x, - val_y) - - if model.dimension == 4: - tx = tensor.Tensor( - (batch_size, num_channels, model.input_size, model.input_size), dev, - singa_dtype[precision]) - elif model.dimension == 2: - tx = tensor.Tensor((batch_size, data_size), dev, singa_dtype[precision]) - np.reshape(train_x, (train_x.shape[0], -1)) - np.reshape(val_x, (val_x.shape[0], -1)) - - ty = tensor.Tensor((batch_size,), dev, tensor.int32) - num_train_batch = train_x.shape[0] // batch_size - num_val_batch = val_x.shape[0] // batch_size - idx = np.arange(train_x.shape[0], dtype=np.int32) - - # Attach model to graph - model.set_optimizer(sgd) - model.compile([tx], is_train=True, use_graph=graph, sequential=sequential) - dev.SetVerbosity(verbosity) - - # Training and evaluation loop - for epoch in range(max_epoch): - start_time = time.time() - np.random.shuffle(idx) - - if global_rank == 0: - print('Starting Epoch %d:' % (epoch)) - - # Training phase - train_correct = np.zeros(shape=[1], dtype=np.float32) - test_correct = np.zeros(shape=[1], dtype=np.float32) - train_loss = np.zeros(shape=[1], dtype=np.float32) - - model.train() - for b in range(num_train_batch): - # if b % 100 == 0: - # print ("b: \n", b) - # Generate the patch data in this iteration - x = train_x[idx[b * batch_size:(b + 1) * batch_size]] - if model.dimension == 4: - x = augmentation(x, batch_size) - if (image_size != model.input_size): - x = resize_dataset(x, model.input_size) - x = x.astype(np_dtype[precision]) - y = train_y[idx[b * batch_size:(b + 1) * batch_size]] - - # Copy the patch data into input tensors - tx.copy_from_numpy(x) - ty.copy_from_numpy(y) - - # Train the model - out, loss = model(tx, ty, dist_option, spars) - train_correct += accuracy(tensor.to_numpy(out), y) - train_loss += tensor.to_numpy(loss)[0] - - if DIST: - # Reduce the evaluation accuracy and loss from multiple devices - reducer = tensor.Tensor((1,), dev, tensor.float32) - train_correct = reduce_variable(train_correct, sgd, reducer) - train_loss = reduce_variable(train_loss, sgd, reducer) - - if global_rank == 0: - print('Training loss = %f, training accuracy = %f' % - (train_loss, train_correct / - (num_train_batch * batch_size * world_size)), - flush=True) - - # Evaluation phase - model.eval() - for b in range(num_val_batch): - x = val_x[b * batch_size:(b + 1) * batch_size] - if model.dimension == 4: - if (image_size != model.input_size): - x = resize_dataset(x, model.input_size) - x = x.astype(np_dtype[precision]) - y 
= val_y[b * batch_size:(b + 1) * batch_size] - tx.copy_from_numpy(x) - ty.copy_from_numpy(y) - out_test = model(tx) - test_correct += accuracy(tensor.to_numpy(out_test), y) - - if DIST: - # Reduce the evaulation accuracy from multiple devices - test_correct = reduce_variable(test_correct, sgd, reducer) - - # Output the evaluation accuracy - if global_rank == 0: - print('Evaluation accuracy = %f, Elapsed Time = %fs' % - (test_correct / (num_val_batch * batch_size * world_size), - time.time() - start_time), - flush=True) - - dev.PrintTimeProfiling() - - -if __name__ == '__main__': - # Use argparse to get command config: max_epoch, model, data, etc., for single gpu training - parser = argparse.ArgumentParser( - description='Training using the autograd and graph.') - parser.add_argument( - 'model', - choices=['cnn', 'resnet', 'xceptionnet', 'mlp', 'alexnet'], - default='cnn') - parser.add_argument('data', - choices=['mnist', 'cifar10', 'cifar100'], - default='mnist') - parser.add_argument('-p', - choices=['float32', 'float16'], - default='float32', - dest='precision') - parser.add_argument('-m', - '--max-epoch', - default=100, - type=int, - help='maximum epochs', - dest='max_epoch') - parser.add_argument('-b', - '--batch-size', - default=64, - type=int, - help='batch size', - dest='batch_size') - parser.add_argument('-l', - '--learning-rate', - default=0.005, - type=float, - help='initial learning rate', - dest='lr') - # Determine which gpu to use - parser.add_argument('-i', - '--device-id', - default=0, - type=int, - help='which GPU to use', - dest='device_id') - parser.add_argument('-g', - '--disable-graph', - default='True', - action='store_false', - help='disable graph', - dest='graph') - parser.add_argument('-v', - '--log-verbosity', - default=0, - type=int, - help='logging verbosity', - dest='verbosity') - - args = parser.parse_args() - - sgd = opt.SGD(lr=args.lr, momentum=0.9, weight_decay=1e-5, dtype=singa_dtype[args.precision]) - run(0, - 1, - args.device_id, - args.max_epoch, - args.batch_size, - args.model, - args.data, - sgd, - args.graph, - args.verbosity, - precision=args.precision) diff --git a/examples/model_selection_psql/ms_mlp/train_mlp.py b/examples/model_selection_psql/ms_mlp/train_mlp.py deleted file mode 100644 index 3c084ab115..0000000000 --- a/examples/model_selection_psql/ms_mlp/train_mlp.py +++ /dev/null @@ -1,588 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -from singa import singa_wrap as singa -from singa import device -from singa import tensor -from singa import opt -from singa import autograd -from singa.opt import Optimizer -from singa.opt import DecayScheduler -from singa.opt import Constant -import numpy as np -import time -import argparse -from PIL import Image - -np_dtype = {"float16": np.float16, "float32": np.float32} - -singa_dtype = {"float16": tensor.float16, "float32": tensor.float32} - -### MSOptimizer -class MSOptimizer(Optimizer): - def __call__(self, loss): - pn_p_g_list = self.call_with_returns(loss) - self.step() - return pn_p_g_list - - def call_with_returns(self, loss): - # print ("call_with_returns loss.data: \n", loss.data) - pn_p_g_list = [] - for p, g in autograd.backward(loss): - if p.name is None: - p.name = id(p) - self.apply(p.name, p, g) - pn_p_g_list.append(p.name, p, g) - # print ("call with returns") - # print ("p.name: \n", p.name) - # print ("p.data: \n", p.data) - # print ("g.data: \n", g.data) - return pn_p_g_list - -class MSSGD(MSOptimizer): - """Implements stochastic gradient descent (optionally with momentum). - - Nesterov momentum is based on the formula from `On the importance of initialization and momentum in deep learning`__. - - Args: - lr(float): learning rate - momentum(float, optional): momentum factor(default: 0) - weight_decay(float, optional): weight decay(L2 penalty)(default: 0) - dampening(float, optional): dampening for momentum(default: 0) - nesterov(bool, optional): enables Nesterov momentum(default: False) - - Typical usage example: - >> > from singa import opt - >> > optimizer = opt.SGD(lr=0.1, momentum=0.9) - >> > optimizer.update() - - __ http: // www.cs.toronto.edu / %7Ehinton / absps / momentum.pdf - - .. note:: - The implementation of SGD with Momentum / Nesterov subtly differs from - Sutskever et. al. and implementations in some other frameworks. - - Considering the specific case of Momentum, the update can be written as - - .. math:: - v = \rho * v + g \\ - p = p - lr * v - - where p, g, v and: math: `\rho` denote the parameters, gradient, - velocity, and momentum respectively. - - This is in contrast to Sutskever et. al. and - other frameworks which employ an update of the form - - .. math:: - v = \rho * v + lr * g \\ - p = p - v - - The Nesterov version is analogously modified. 
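-
-    As a concrete one-step illustration, take lr = 0.1, momentum = 0.9,
-    dampening = 0, an existing velocity v = 1.0 and a gradient g = 0.5.
-    This form updates v = 0.9 * 1.0 + 0.5 = 1.4 and steps by
-    lr * v = 0.14, while the Sutskever form would update
-    v = 0.9 * 1.0 + 0.1 * 0.5 = 0.95 and step by v = 0.95.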
- """ - - def __init__(self, - lr=0.1, - momentum=0, - dampening=0, - weight_decay=0, - nesterov=False, - dtype=tensor.float32): - super(MSSGD, self).__init__(lr, dtype) - - # init momentum - if type(momentum) == float or type(momentum) == int: - if momentum < 0.0: - raise ValueError("Invalid momentum value: {}".format(momentum)) - self.momentum = Constant(momentum) - elif isinstance(momentum, DecayScheduler): - self.momentum = momentum - momentum = momentum.init_value - else: - raise TypeError("Wrong momentum type") - self.mom_value = self.momentum(self.step_counter).as_type(self.dtype) - - # init dampening - if type(dampening) == float or type(dampening) == int: - self.dampening = Constant(dampening) - elif isinstance(dampening, DecayScheduler): - self.dampening = dampening - dampening = dampening.init_value - else: - raise TypeError("Wrong dampening type") - self.dam_value = self.dampening(self.step_counter).as_type(self.dtype) - - # init weight_decay - if type(weight_decay) == float or type(weight_decay) == int: - if weight_decay < 0.0: - raise ValueError( - "Invalid weight_decay value: {}".format(weight_decay)) - self.weight_decay = Constant(weight_decay) - elif isinstance(weight_decay, DecayScheduler): - self.weight_decay = weight_decay - else: - raise TypeError("Wrong weight_decay type") - self.decay_value = self.weight_decay(self.step_counter).as_type( - self.dtype) - - # init other params - self.nesterov = nesterov - self.moments = dict() - - # check value - if nesterov and (momentum <= 0 or dampening != 0): - raise ValueError( - "Nesterov momentum requires a momentum and zero dampening") - - def apply(self, param_name, param_value, param_grad): - """Performs a single optimization step. - - Args: - param_name(String): the name of the param - param_value(Tensor): param values to be update in-place - grad(Tensor): param gradients; the values may be updated - in this function; cannot use it anymore - """ - assert param_value.shape == param_grad.shape, ("shape mismatch", - param_value.shape, - param_grad.shape) - self.device_check(param_value, self.step_counter, self.lr_value, - self.mom_value, self.dam_value, self.decay_value) - - # derive dtype from input - assert param_value.dtype == self.dtype - - # TODO add branch operator - # if self.decay_value != 0: - if self.weight_decay.init_value != 0: - singa.Axpy(self.decay_value.data, param_value.data, param_grad.data) - - if self.momentum.init_value != 0: - if param_name not in self.moments: - flag = param_value.device.graph_enabled() - param_value.device.EnableGraph(False) - self.moments[param_name] = tensor.zeros_like(param_value) - param_value.device.EnableGraph(flag) - - buf = self.moments[param_name] - buf *= self.mom_value - alpha = 1.0 - self.dam_value - singa.Axpy(alpha.data, param_grad.data, buf.data) - - if self.nesterov: - singa.Axpy(self.mom_value.data, buf.data, param_grad.data) - else: - param_grad = buf - - minus_lr = 0.0 - self.lr_value - singa.Axpy(minus_lr.data, param_grad.data, param_value.data) - - def step(self): - # increment step counter, lr and moment - super().step() - mom_value = self.momentum(self.step_counter).as_type(self.dtype) - dam_value = self.dampening(self.step_counter).as_type(self.dtype) - decay_value = self.weight_decay(self.step_counter).as_type(self.dtype) - self.mom_value.copy_from(mom_value) - self.dam_value.copy_from(dam_value) - self.decay_value.copy_from(decay_value) - - def get_states(self): - states = super().get_states() - if self.mom_value > 0: - states[ - 'moments'] = self.moments # a 
dict for 1st order moments tensors - return states - - def set_states(self, states): - super().set_states(states) - if 'moments' in states: - self.moments = states['moments'] - self.mom_value = self.momentum(self.step_counter) - - -# Data augmentation -def augmentation(x, batch_size): - xpad = np.pad(x, [[0, 0], [0, 0], [4, 4], [4, 4]], 'symmetric') - for data_num in range(0, batch_size): - offset = np.random.randint(8, size=2) - x[data_num, :, :, :] = xpad[data_num, :, - offset[0]:offset[0] + x.shape[2], - offset[1]:offset[1] + x.shape[2]] - if_flip = np.random.randint(2) - if (if_flip): - x[data_num, :, :, :] = x[data_num, :, :, ::-1] - return x - - -# Calculate accuracy -def accuracy(pred, target): - # y is network output to be compared with ground truth (int) - y = np.argmax(pred, axis=1) - a = y == target - correct = np.array(a, "int").sum() - return correct - - -# Data partition according to the rank -def partition(global_rank, world_size, train_x, train_y, val_x, val_y): - # Partition training data - data_per_rank = train_x.shape[0] // world_size - idx_start = global_rank * data_per_rank - idx_end = (global_rank + 1) * data_per_rank - train_x = train_x[idx_start:idx_end] - train_y = train_y[idx_start:idx_end] - - # Partition evaluation data - data_per_rank = val_x.shape[0] // world_size - idx_start = global_rank * data_per_rank - idx_end = (global_rank + 1) * data_per_rank - val_x = val_x[idx_start:idx_end] - val_y = val_y[idx_start:idx_end] - return train_x, train_y, val_x, val_y - - -# Function to all reduce NUMPY accuracy and loss from multiple devices -def reduce_variable(variable, dist_opt, reducer): - reducer.copy_from_numpy(variable) - dist_opt.all_reduce(reducer.data) - dist_opt.wait() - output = tensor.to_numpy(reducer) - return output - - -def resize_dataset(x, image_size): - num_data = x.shape[0] - dim = x.shape[1] - X = np.zeros(shape=(num_data, dim, image_size, image_size), - dtype=np.float32) - for n in range(0, num_data): - for d in range(0, dim): - X[n, d, :, :] = np.array(Image.fromarray(x[n, d, :, :]).resize( - (image_size, image_size), Image.BILINEAR), - dtype=np.float32) - return X - - -def run(global_rank, - world_size, - local_rank, - max_epoch, - batch_size, - model, - data, - mssgd, - graph, - verbosity, - dist_option='plain', - spars=None, - precision='float32'): - # dev = device.create_cuda_gpu_on(local_rank) # need to change to CPU device for CPU-only machines - dev = device.get_default_device() - dev.SetRandSeed(0) - np.random.seed(0) - - if data == 'cifar10': - from data import cifar10 - train_x, train_y, val_x, val_y = cifar10.load() - elif data == 'cifar100': - from data import cifar100 - train_x, train_y, val_x, val_y = cifar100.load() - elif data == 'mnist': - from data import mnist - train_x, train_y, val_x, val_y = mnist.load() - - - num_channels = train_x.shape[1] - image_size = train_x.shape[2] - data_size = np.prod(train_x.shape[1:train_x.ndim]).item() - num_classes = (np.max(train_y) + 1).item() - - if model == 'resnet': - from model import resnet - model = resnet.resnet50(num_channels=num_channels, - num_classes=num_classes) - elif model == 'xceptionnet': - from model import xceptionnet - model = xceptionnet.create_model(num_channels=num_channels, - num_classes=num_classes) - elif model == 'cnn': - from model import cnn - model = cnn.create_model(num_channels=num_channels, - num_classes=num_classes) - elif model == 'alexnet': - from model import alexnet - model = alexnet.create_model(num_channels=num_channels, - num_classes=num_classes) - elif 
model == 'mlp': - import os, sys, inspect - current = os.path.dirname( - os.path.abspath(inspect.getfile(inspect.currentframe()))) - parent = os.path.dirname(current) - sys.path.insert(0, parent) - from mlp import model - model = model.create_model(data_size=data_size, - num_classes=num_classes) - - elif model == 'msmlp': - import os, sys, inspect - current = os.path.dirname( - os.path.abspath(inspect.getfile(inspect.currentframe()))) - parent = os.path.dirname(current) - sys.path.insert(0, parent) - from msmlp import model - model = model.create_model(data_size=data_size, - num_classes=num_classes) - - elif model == 'ms_model_mlp': - import os, sys, inspect - current = os.path.dirname( - os.path.abspath(inspect.getfile(inspect.currentframe()))) - parent = os.path.dirname(current) - sys.path.insert(0, parent) - from ms_model_mlp import model - model = model.create_model(data_size=data_size, - num_classes=num_classes, - layer_hidden_list=layer_hidden_list) - # print ("model: \n", model) - - - # For distributed training, sequential has better performance - if hasattr(mssgd, "communicator"): - DIST = True - sequential = True - else: - DIST = False - sequential = False - - if DIST: - train_x, train_y, val_x, val_y = partition(global_rank, world_size, - train_x, train_y, val_x, - val_y) - - if model.dimension == 4: - tx = tensor.Tensor( - (batch_size, num_channels, model.input_size, model.input_size), dev, - singa_dtype[precision]) - elif model.dimension == 2: - tx = tensor.Tensor((batch_size, data_size), dev, singa_dtype[precision]) - np.reshape(train_x, (train_x.shape[0], -1)) - np.reshape(val_x, (val_x.shape[0], -1)) - - ty = tensor.Tensor((batch_size,), dev, tensor.int32) - num_train_batch = train_x.shape[0] // batch_size - num_val_batch = val_x.shape[0] // batch_size - idx = np.arange(train_x.shape[0], dtype=np.int32) - - # Attach model to graph - model.set_optimizer(mssgd) - model.compile([tx], is_train=True, use_graph=graph, sequential=sequential) - dev.SetVerbosity(verbosity) - - # Training and evaluation loop - for epoch in range(max_epoch): - start_time = time.time() - np.random.shuffle(idx) - - if global_rank == 0: - print('Starting Epoch %d:' % (epoch)) - - # Training phase - train_correct = np.zeros(shape=[1], dtype=np.float32) - test_correct = np.zeros(shape=[1], dtype=np.float32) - train_loss = np.zeros(shape=[1], dtype=np.float32) - - model.train() - print ("num_train_batch: \n", num_train_batch) - print () - for b in range(num_train_batch): - # if b % 200 == 0: - # print ("b: \n", b) - # Generate the patch data in this iteration - x = train_x[idx[b * batch_size:(b + 1) * batch_size]] - if model.dimension == 4: - x = augmentation(x, batch_size) - if (image_size != model.input_size): - x = resize_dataset(x, model.input_size) - x = x.astype(np_dtype[precision]) - y = train_y[idx[b * batch_size:(b + 1) * batch_size]] - - - synflow_flag = False - # Train the model - if epoch == (max_epoch - 1) and b == (num_train_batch - 1): ### synflow calcuation for the last batch - print ("last epoch calculate synflow") - synflow_flag = True - ### step 1: all one input - # Copy the patch data into input tensors - tx.copy_from_numpy(np.ones(x.shape, dtype=np.float32)) - ty.copy_from_numpy(y) - ### step 2: all weights turned to positive (done) - ### step 3: new loss (done) - ### print ("before model forward ...") - pn_p_g_list, out, loss = model(tx, ty, dist_option, spars, synflow_flag) - ### step 4: calculate the multiplication of weights - synflow_score = 0.0 - for pn_p_g_item in pn_p_g_list: - 
print ("calculate weight param * grad parameter name: \n", pn_p_g_item[0]) - if len(pn_p_g_item[1].shape) == 2: # param_value.data is "weight" - print ("pn_p_g_item[1].shape: \n", pn_p_g_item[1].shape) - synflow_score += np.sum(np.absolute(tensor.to_numpy(pn_p_g_item[1]) * tensor.to_numpy(pn_p_g_item[2]))) - print ("layer_hidden_list: \n", layer_hidden_list) - print ("synflow_score: \n", synflow_score) - elif epoch == (max_epoch - 1) and b == (num_train_batch - 2): # all weights turned to positive - # Copy the patch data into input tensors - tx.copy_from_numpy(x) - ty.copy_from_numpy(y) - # print ("before model forward ...") - pn_p_g_list, out, loss = model(tx, ty, dist_option, spars, synflow_flag) - train_correct += accuracy(tensor.to_numpy(out), y) - train_loss += tensor.to_numpy(loss)[0] - # all params turned to positive - for pn_p_g_item in pn_p_g_list: - print ("absolute value parameter name: \n", pn_p_g_item[0]) - pn_p_g_item[1] = tensor.abs(pn_p_g_item[1]) # return tensor already - else: # normal train steps - # Copy the patch data into input tensors - tx.copy_from_numpy(x) - ty.copy_from_numpy(y) - # print ("normal before model(tx, ty, synflow_flag, dist_option, spars)") - # print ("train_cnn tx: \n", tx) - # print ("train_cnn ty: \n", ty) - # print ("before model forward ...") - pn_p_g_list, out, loss = model(tx, ty, dist_option, spars, synflow_flag) - # print ("normal after model(tx, ty, synflow_flag, dist_option, spars)") - train_correct += accuracy(tensor.to_numpy(out), y) - train_loss += tensor.to_numpy(loss)[0] - - if DIST: - # Reduce the evaluation accuracy and loss from multiple devices - reducer = tensor.Tensor((1,), dev, tensor.float32) - train_correct = reduce_variable(train_correct, mssgd, reducer) - train_loss = reduce_variable(train_loss, mssgd, reducer) - - if global_rank == 0: - print('Training loss = %f, training accuracy = %f' % - (train_loss, train_correct / - (num_train_batch * batch_size * world_size)), - flush=True) - - # Evaluation phase - model.eval() - for b in range(num_val_batch): - x = val_x[b * batch_size:(b + 1) * batch_size] - if model.dimension == 4: - if (image_size != model.input_size): - x = resize_dataset(x, model.input_size) - x = x.astype(np_dtype[precision]) - y = val_y[b * batch_size:(b + 1) * batch_size] - tx.copy_from_numpy(x) - ty.copy_from_numpy(y) - out_test = model(tx) - test_correct += accuracy(tensor.to_numpy(out_test), y) - - if DIST: - # Reduce the evaulation accuracy from multiple devices - test_correct = reduce_variable(test_correct, mssgd, reducer) - - # Output the evaluation accuracy - if global_rank == 0: - print('Evaluation accuracy = %f, Elapsed Time = %fs' % - (test_correct / (num_val_batch * batch_size * world_size), - time.time() - start_time), - flush=True) - - dev.PrintTimeProfiling() - - -if __name__ == '__main__': - # Use argparse to get command config: max_epoch, model, data, etc., for single gpu training - parser = argparse.ArgumentParser( - description='Training using the autograd and graph.') - parser.add_argument( - 'model', - choices=['cnn', 'resnet', 'xceptionnet', 'mlp', 'msmlp', 'alexnet', 'ms_model_mlp'], - default='cnn') - parser.add_argument('data', - choices=['mnist', 'cifar10', 'cifar100'], - default='mnist') - parser.add_argument('-p', - choices=['float32', 'float16'], - default='float32', - dest='precision') - parser.add_argument('-m', - '--max-epoch', - default=3, - type=int, - help='maximum epochs', - dest='max_epoch') - parser.add_argument('-b', - '--batch-size', - default=64, - type=int, - 
help='batch size', - dest='batch_size') - parser.add_argument('-l', - '--learning-rate', - default=0.005, - type=float, - help='initial learning rate', - dest='lr') - # Determine which gpu to use - parser.add_argument('-i', - '--device-id', - default=0, - type=int, - help='which GPU to use', - dest='device_id') - parser.add_argument('-g', - '--disable-graph', - default='True', - action='store_false', - help='disable graph', - dest='graph') - parser.add_argument('-v', - '--log-verbosity', - default=0, - type=int, - help='logging verbosity', - dest='verbosity') - - args = parser.parse_args() - - DEFAULT_LAYER_CHOICES_4 = [8, 16, 24, 32] - for layer1 in DEFAULT_LAYER_CHOICES_4: - for layer2 in DEFAULT_LAYER_CHOICES_4: - for layer3 in DEFAULT_LAYER_CHOICES_4: - for layer4 in DEFAULT_LAYER_CHOICES_4: - layer_hidden_list = [layer1, layer2+1, layer3+2, layer4+3] - # print ("layer_hidden_list: \n", layer_hidden_list) - mssgd = MSSGD(lr=args.lr, momentum=0.9, weight_decay=1e-5, dtype=singa_dtype[args.precision]) - run(0, - 1, - args.device_id, - layer_hidden_list, - args.max_epoch, - args.batch_size, - args.model, - args.data, - mssgd, - args.graph, - args.verbosity, - precision=args.precision) diff --git a/examples/model_selection_psql/ms_mlp/train_mpi.py b/examples/model_selection_psql/ms_mlp/train_mpi.py deleted file mode 100644 index 563d4b2c51..0000000000 --- a/examples/model_selection_psql/ms_mlp/train_mpi.py +++ /dev/null @@ -1,91 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - - -from singa import singa_wrap as singa -from singa import opt -from singa import tensor -import argparse -import train_cnn - -singa_dtype = {"float16": tensor.float16, "float32": tensor.float32} - -if __name__ == '__main__': - # Use argparse to get command config: max_epoch, model, data, etc., for single gpu training - parser = argparse.ArgumentParser( - description='Training using the autograd and graph.') - parser.add_argument('model', - choices=['cnn', 'resnet', 'xceptionnet', 'mlp'], - default='cnn') - parser.add_argument('data', choices=['mnist', 'cifar10', 'cifar100'], default='mnist') - parser.add_argument('-p', - choices=['float32', 'float16'], - default='float32', - dest='precision') - parser.add_argument('-m', - '--max-epoch', - default=10, - type=int, - help='maximum epochs', - dest='max_epoch') - parser.add_argument('-b', - '--batch-size', - default=64, - type=int, - help='batch size', - dest='batch_size') - parser.add_argument('-l', - '--learning-rate', - default=0.005, - type=float, - help='initial learning rate', - dest='lr') - parser.add_argument('-d', - '--dist-option', - default='plain', - choices=['plain','half','partialUpdate','sparseTopK','sparseThreshold'], - help='distibuted training options', - dest='dist_option') # currently partialUpdate support graph=False only - parser.add_argument('-s', - '--sparsification', - default='0.05', - type=float, - help='the sparsity parameter used for sparsification, between 0 to 1', - dest='spars') - parser.add_argument('-g', - '--disable-graph', - default='True', - action='store_false', - help='disable graph', - dest='graph') - parser.add_argument('-v', - '--log-verbosity', - default=0, - type=int, - help='logging verbosity', - dest='verbosity') - - args = parser.parse_args() - - sgd = opt.SGD(lr=args.lr, momentum=0.9, weight_decay=1e-5, dtype=singa_dtype[args.precision]) - sgd = opt.DistOpt(sgd) - - train_cnn.run(sgd.global_rank, sgd.world_size, sgd.local_rank, args.max_epoch, - args.batch_size, args.model, args.data, sgd, args.graph, - args.verbosity, args.dist_option, args.spars, args.precision) diff --git a/examples/model_selection_psql/ms_mlp/train_ms_model.py b/examples/model_selection_psql/ms_mlp/train_ms_model.py deleted file mode 100644 index 3da53b257d..0000000000 --- a/examples/model_selection_psql/ms_mlp/train_ms_model.py +++ /dev/null @@ -1,584 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -from singa import singa_wrap as singa -from singa import device -from singa import tensor -from singa import opt -from singa import autograd -from singa.opt import Optimizer -from singa.opt import DecayScheduler -from singa.opt import Constant -import numpy as np -import time -import argparse -from PIL import Image - -np_dtype = {"float16": np.float16, "float32": np.float32} - -singa_dtype = {"float32": tensor.float32} -# singa_dtype = {"float16": tensor.float16, "float32": tensor.float32} - -### MSOptimizer -class MSOptimizer(Optimizer): - def __call__(self, loss): - pn_p_g_list = self.call_with_returns(loss) - # print ("optimizer1 before self.step()") - # print ("optimizer1 before print len(pn_p_g_list): \n", len(pn_p_g_list)) - self.step() - # print ("optimizer1 after print len(pn_p_g_list): \n", len(pn_p_g_list)) - # print ("optimizer1 after self.step()") - return pn_p_g_list - - def call_with_returns(self, loss): - # print ("call_with_returns before apply loss.data: \n", loss.data) - pn_p_g_list = [] - for p, g in autograd.backward(loss): - if p.name is None: - p.name = id(p) - self.apply(p.name, p, g) - # print ("call with returns") - # print ("p.name: \n", p.name) - # print ("p.data: \n", p.data) - # print ("g.data: \n", g.data) - pn_p_g_list.append([p.name, p, g]) # need iterables - # print ("call_with_returns after apply loss.data: \n", loss.data) - return pn_p_g_list - -# MSSGD -- sub class of MSOptimizer -class MSSGD(MSOptimizer): - """Implements stochastic gradient descent (optionally with momentum). - - Nesterov momentum is based on the formula from `On the importance of initialization and momentum in deep learning`__. - - Args: - lr(float): learning rate - momentum(float, optional): momentum factor(default: 0) - weight_decay(float, optional): weight decay(L2 penalty)(default: 0) - dampening(float, optional): dampening for momentum(default: 0) - nesterov(bool, optional): enables Nesterov momentum(default: False) - - Typical usage example: - >> > from singa import opt - >> > optimizer = opt.SGD(lr=0.1, momentum=0.9) - >> > optimizer.update() - - __ http: // www.cs.toronto.edu / %7Ehinton / absps / momentum.pdf - - .. note:: - The implementation of SGD with Momentum / Nesterov subtly differs from - Sutskever et. al. and implementations in some other frameworks. - - Considering the specific case of Momentum, the update can be written as - - .. math:: - v = \rho * v + g \\ - p = p - lr * v - - where p, g, v and: math: `\rho` denote the parameters, gradient, - velocity, and momentum respectively. - - This is in contrast to Sutskever et. al. and - other frameworks which employ an update of the form - - .. math:: - v = \rho * v + lr * g \\ - p = p - v - - The Nesterov version is analogously modified. 
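-
-    As a one-step illustration of the Nesterov variant, take lr = 0.1,
-    momentum = 0.9, an existing velocity v = 1.0 and a gradient g = 0.5:
-    the buffer becomes v = 0.9 * 1.0 + 0.5 = 1.4, the applied gradient
-    becomes g + momentum * v = 0.5 + 0.9 * 1.4 = 1.76, and the step is
-    0.1 * 1.76 = 0.176.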
- """ - - def __init__(self, - lr=0.1, - momentum=0, - dampening=0, - weight_decay=0, - nesterov=False, - dtype=tensor.float32): - super(MSSGD, self).__init__(lr) - - # init momentum - if type(momentum) == float or type(momentum) == int: - if momentum < 0.0: - raise ValueError("Invalid momentum value: {}".format(momentum)) - self.momentum = Constant(momentum) - elif isinstance(momentum, DecayScheduler): - self.momentum = momentum - momentum = momentum.init_value - else: - raise TypeError("Wrong momentum type") - # self.dtype = dtype - # self.mom_value = self.momentum(self.step_counter).as_type(self.dtype) - self.mom_value = self.momentum(self.step_counter) - - # init dampening - if type(dampening) == float or type(dampening) == int: - self.dampening = Constant(dampening) - elif isinstance(dampening, DecayScheduler): - self.dampening = dampening - dampening = dampening.init_value - else: - raise TypeError("Wrong dampening type") - # self.dam_value = self.dampening(self.step_counter).as_type(self.dtype) - self.dam_value = self.dampening(self.step_counter) - - # init weight_decay - if type(weight_decay) == float or type(weight_decay) == int: - if weight_decay < 0.0: - raise ValueError( - "Invalid weight_decay value: {}".format(weight_decay)) - self.weight_decay = Constant(weight_decay) - elif isinstance(weight_decay, DecayScheduler): - self.weight_decay = weight_decay - else: - raise TypeError("Wrong weight_decay type") - # self.decay_value = self.weight_decay(self.step_counter).as_type(self.dtype) - self.decay_value = self.weight_decay(self.step_counter) - - # init other params - self.nesterov = nesterov - self.moments = dict() - - # check value - if nesterov and (momentum <= 0 or dampening != 0): - raise ValueError( - "Nesterov momentum requires a momentum and zero dampening") - - def apply(self, param_name, param_value, param_grad): - """Performs a single optimization step. 
- - Args: - param_name(String): the name of the param - param_value(Tensor): param values to be update in-place - grad(Tensor): param gradients; the values may be updated - in this function; cannot use it anymore - """ - assert param_value.shape == param_grad.shape, ("shape mismatch", - param_value.shape, - param_grad.shape) - self.device_check(param_value, self.step_counter, self.lr_value, - self.mom_value, self.dam_value, self.decay_value) - - # derive dtype from input - # assert param_value.dtype == self.dtype - - # TODO add branch operator - # if self.decay_value != 0: - if self.weight_decay.init_value != 0: - singa.Axpy(self.decay_value.data, param_value.data, param_grad.data) - - if self.momentum.init_value != 0: - if param_name not in self.moments: - flag = param_value.device.graph_enabled() - param_value.device.EnableGraph(False) - self.moments[param_name] = tensor.zeros_like(param_value) - param_value.device.EnableGraph(flag) - - buf = self.moments[param_name] - buf *= self.mom_value - alpha = 1.0 - self.dam_value - singa.Axpy(alpha.data, param_grad.data, buf.data) - - if self.nesterov: - singa.Axpy(self.mom_value.data, buf.data, param_grad.data) - else: - param_grad = buf - - minus_lr = 0.0 - self.lr_value - singa.Axpy(minus_lr.data, param_grad.data, param_value.data) - -# Data augmentation -def augmentation(x, batch_size): - xpad = np.pad(x, [[0, 0], [0, 0], [4, 4], [4, 4]], 'symmetric') - for data_num in range(0, batch_size): - offset = np.random.randint(8, size=2) - x[data_num, :, :, :] = xpad[data_num, :, - offset[0]:offset[0] + x.shape[2], - offset[1]:offset[1] + x.shape[2]] - if_flip = np.random.randint(2) - if (if_flip): - x[data_num, :, :, :] = x[data_num, :, :, ::-1] - return x - - -# Calculate accuracy -def accuracy(pred, target): - # y is network output to be compared with ground truth (int) - y = np.argmax(pred, axis=1) - a = y == target - correct = np.array(a, "int").sum() - return correct - - -# Data partition according to the rank -def partition(global_rank, world_size, train_x, train_y, val_x, val_y): - # Partition training data - data_per_rank = train_x.shape[0] // world_size - idx_start = global_rank * data_per_rank - idx_end = (global_rank + 1) * data_per_rank - train_x = train_x[idx_start:idx_end] - train_y = train_y[idx_start:idx_end] - - # Partition evaluation data - data_per_rank = val_x.shape[0] // world_size - idx_start = global_rank * data_per_rank - idx_end = (global_rank + 1) * data_per_rank - val_x = val_x[idx_start:idx_end] - val_y = val_y[idx_start:idx_end] - return train_x, train_y, val_x, val_y - - -# Function to all reduce NUMPY accuracy and loss from multiple devices -def reduce_variable(variable, dist_opt, reducer): - reducer.copy_from_numpy(variable) - dist_opt.all_reduce(reducer.data) - dist_opt.wait() - output = tensor.to_numpy(reducer) - return output - - -def resize_dataset(x, image_size): - num_data = x.shape[0] - dim = x.shape[1] - X = np.zeros(shape=(num_data, dim, image_size, image_size), - dtype=np.float32) - for n in range(0, num_data): - for d in range(0, dim): - X[n, d, :, :] = np.array(Image.fromarray(x[n, d, :, :]).resize( - (image_size, image_size), Image.BILINEAR), - dtype=np.float32) - return X - -def run(global_rank, - world_size, - local_rank, - layer_hidden_list, - max_epoch, - batch_size, - model, - data, - mssgd, - graph, - verbosity, - dist_option='plain', - spars=None, - precision='float32'): - # dev = device.create_cuda_gpu_on(local_rank) # need to change to CPU device for CPU-only machines - dev = 
device.get_default_device() - dev.SetRandSeed(0) - np.random.seed(0) - - if data == 'cifar10': - from data import cifar10 - train_x, train_y, val_x, val_y = cifar10.load() - elif data == 'cifar100': - from data import cifar100 - train_x, train_y, val_x, val_y = cifar100.load() - elif data == 'mnist': - from data import mnist - train_x, train_y, val_x, val_y = mnist.load() - - - num_channels = train_x.shape[1] - image_size = train_x.shape[2] - data_size = np.prod(train_x.shape[1:train_x.ndim]).item() - num_classes = (np.max(train_y) + 1).item() - - if model == 'resnet': - from model import resnet - model = resnet.resnet50(num_channels=num_channels, - num_classes=num_classes) - elif model == 'xceptionnet': - from model import xceptionnet - model = xceptionnet.create_model(num_channels=num_channels, - num_classes=num_classes) - elif model == 'cnn': - from model import cnn - model = cnn.create_model(num_channels=num_channels, - num_classes=num_classes) - elif model == 'alexnet': - from model import alexnet - model = alexnet.create_model(num_channels=num_channels, - num_classes=num_classes) - elif model == 'mlp': - import os, sys, inspect - current = os.path.dirname( - os.path.abspath(inspect.getfile(inspect.currentframe()))) - parent = os.path.dirname(current) - sys.path.insert(0, parent) - from mlp import model - model = model.create_model(data_size=data_size, - num_classes=num_classes) - - elif model == 'msmlp': - import os, sys, inspect - current = os.path.dirname( - os.path.abspath(inspect.getfile(inspect.currentframe()))) - parent = os.path.dirname(current) - sys.path.insert(0, parent) - from msmlp import model - model = model.create_model(data_size=data_size, - num_classes=num_classes) - - elif model == 'ms_model_mlp': - import os, sys, inspect - current = os.path.dirname( - os.path.abspath(inspect.getfile(inspect.currentframe()))) - parent = os.path.dirname(current) - sys.path.insert(0, parent) - from ms_model_mlp import model - model = model.create_model(data_size=data_size, - num_classes=num_classes, - layer_hidden_list=layer_hidden_list) - # print ("model: \n", model) - - # For distributed training, sequential has better performance - if hasattr(mssgd, "communicator"): - DIST = True - sequential = True - else: - DIST = False - sequential = False - - if DIST: - train_x, train_y, val_x, val_y = partition(global_rank, world_size, - train_x, train_y, val_x, - val_y) - - if model.dimension == 4: - tx = tensor.Tensor( - (batch_size, num_channels, model.input_size, model.input_size), dev, - singa_dtype[precision]) - elif model.dimension == 2: - tx = tensor.Tensor((batch_size, data_size), dev, singa_dtype[precision]) - np.reshape(train_x, (train_x.shape[0], -1)) - np.reshape(val_x, (val_x.shape[0], -1)) - - ty = tensor.Tensor((batch_size,), dev, tensor.int32) - num_train_batch = train_x.shape[0] // batch_size - num_val_batch = val_x.shape[0] // batch_size - idx = np.arange(train_x.shape[0], dtype=np.int32) - - # Attach model to graph - model.set_optimizer(mssgd) - model.compile([tx], is_train=True, use_graph=graph, sequential=sequential) - dev.SetVerbosity(verbosity) - - # Training and evaluation loop - for epoch in range(max_epoch): - start_time = time.time() - np.random.shuffle(idx) - - if global_rank == 0: - print('Starting Epoch %d:' % (epoch)) - - # Training phase - train_correct = np.zeros(shape=[1], dtype=np.float32) - test_correct = np.zeros(shape=[1], dtype=np.float32) - train_loss = np.zeros(shape=[1], dtype=np.float32) - - model.train() - print ("num_train_batch: \n", 
-
-    # Training and evaluation loop
-    for epoch in range(max_epoch):
-        start_time = time.time()
-        np.random.shuffle(idx)
-
-        if global_rank == 0:
-            print('Starting Epoch %d:' % (epoch))
-
-        # Training phase
-        train_correct = np.zeros(shape=[1], dtype=np.float32)
-        test_correct = np.zeros(shape=[1], dtype=np.float32)
-        train_loss = np.zeros(shape=[1], dtype=np.float32)
-
-        model.train()
-        print("num_train_batch:", num_train_batch)
-        for b in range(num_train_batch):
-            if b % 100 == 0:
-                print("batch:", b)
-            # Generate the batch data for this iteration
-            x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
-            if model.dimension == 4:
-                x = augmentation(x, batch_size)
-                if (image_size != model.input_size):
-                    x = resize_dataset(x, model.input_size)
-            x = x.astype(np_dtype[precision])
-            y = train_y[idx[b * batch_size:(b + 1) * batch_size]]
-
-            synflow_flag = False
-            if epoch == (max_epoch - 1) and b == (num_train_batch - 1):
-                # SynFlow calculation on the very last batch
-                synflow_flag = True
-                # step 1: feed an all-ones input
-                tx.copy_from_numpy(np.ones(x.shape, dtype=np.float32))
-                ty.copy_from_numpy(y)
-                # step 2: all weights were turned positive on the previous batch
-                # step 3: synflow_flag selects the sum-of-outputs loss
-                pn_p_g_list, out, loss = model(tx, ty, dist_option, spars, synflow_flag)
-                # step 4: accumulate |weight * grad| over the 2-D weight matrices
-                synflow_score = 0.0
-                for pn_p_g_item in pn_p_g_list:
-                    if len(pn_p_g_item[1].shape) == 2:  # 2-D params are weights
-                        synflow_score += np.sum(
-                            np.absolute(tensor.to_numpy(pn_p_g_item[1]) *
-                                        tensor.to_numpy(pn_p_g_item[2])))
-            elif epoch == (max_epoch - 1) and b == (num_train_batch - 2):
-                # Turn all weights positive in preparation for the SynFlow pass
-                tx.copy_from_numpy(x)
-                ty.copy_from_numpy(y)
-                pn_p_g_list, out, loss = model(tx, ty, dist_option, spars, synflow_flag)
-                train_correct += accuracy(tensor.to_numpy(out), y)
-                train_loss += tensor.to_numpy(loss)[0]
-                for pn_p_g_item in pn_p_g_list:
-                    pn_p_g_item[1] = tensor.abs(pn_p_g_item[1])
-            else:  # normal training steps
-                tx.copy_from_numpy(x.astype(np.float32))
-                ty.copy_from_numpy(y)
-                pn_p_g_list, out, loss = model(tx, ty, dist_option, spars, synflow_flag)
-                train_correct += accuracy(tensor.to_numpy(out), y)
-                train_loss += tensor.to_numpy(loss)[0]
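
The two special branches above implement a SynFlow-style proxy score: all parameters are first replaced by their absolute values, then a forward/backward pass is run on an all-ones input with a sum-of-outputs loss, and the score accumulates |weight * gradient| over the 2-D weight matrices. Because the weights are positive and the input is all ones, every ReLU acts as the identity, so the same quantity can be reproduced with a purely linear chain. A hand-derived NumPy sketch follows (synflow_score is a hypothetical helper, not code from the deleted file; biases are omitted for brevity):

    import numpy as np

    def synflow_score(weights):
        # weights: 2-D matrices of a linear chain f(x) = x @ W1 @ W2 ...
        ws = [np.abs(w) for w in weights]      # step 2: positive weights
        acts = [np.ones((1, ws[0].shape[0]))]  # step 1: all-ones input
        for w in ws:
            acts.append(acts[-1] @ w)          # forward pass
        grad = np.ones_like(acts[-1])          # step 3: d(sum(out))/d(out) = 1
        score = 0.0
        for w, a in zip(reversed(ws), reversed(acts[:-1])):
            g_w = a.T @ grad                   # dR/dW for this layer
            score += np.abs(w * g_w).sum()     # step 4: sum |W * dR/dW|
            grad = grad @ w.T                  # backprop to previous layer
        return score

    print(synflow_score([np.random.randn(4, 8), np.random.randn(8, 3)]))

The score is intended to favour candidate architectures that preserve gradient flow, which is why the deleted script computes it once, on the final batch, for each candidate layer_hidden_list in the grid search below.
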
-
-        if DIST:
-            # Reduce the training accuracy and loss from multiple devices
-            reducer = tensor.Tensor((1,), dev, tensor.float32)
-            train_correct = reduce_variable(train_correct, mssgd, reducer)
-            train_loss = reduce_variable(train_loss, mssgd, reducer)
-
-        if global_rank == 0:
-            print('Training loss = %f, training accuracy = %f' %
-                  (train_loss, train_correct /
-                   (num_train_batch * batch_size * world_size)),
-                  flush=True)
-
-        # Evaluation phase
-        model.eval()
-        for b in range(num_val_batch):
-            x = val_x[b * batch_size:(b + 1) * batch_size]
-            if model.dimension == 4:
-                if (image_size != model.input_size):
-                    x = resize_dataset(x, model.input_size)
-            x = x.astype(np_dtype[precision])
-            y = val_y[b * batch_size:(b + 1) * batch_size]
-            tx.copy_from_numpy(x)
-            ty.copy_from_numpy(y)
-            out_test = model(tx)
-            test_correct += accuracy(tensor.to_numpy(out_test), y)
-
-        if DIST:
-            # Reduce the evaluation accuracy from multiple devices
-            test_correct = reduce_variable(test_correct, mssgd, reducer)
-
-        # Output the evaluation accuracy
-        if global_rank == 0:
-            print('Evaluation accuracy = %f, Elapsed Time = %fs' %
-                  (test_correct / (num_val_batch * batch_size * world_size),
-                   time.time() - start_time),
-                  flush=True)
-
-    dev.PrintTimeProfiling()
-
-
-if __name__ == '__main__':
-    # Use argparse to get the command config (max_epoch, model, data, etc.) for single-GPU training
-    parser = argparse.ArgumentParser(
-        description='Training using the autograd and graph.')
-    parser.add_argument(
-        'model',
-        choices=['cnn', 'resnet', 'xceptionnet', 'mlp', 'msmlp', 'alexnet', 'ms_model_mlp'],
-        default='cnn')
-    parser.add_argument('data',
-                        choices=['mnist', 'cifar10', 'cifar100'],
-                        default='mnist')
-    parser.add_argument('-p',
-                        choices=['float32', 'float16'],
-                        default='float32',
-                        dest='precision')
-    parser.add_argument('-m',
-                        '--max-epoch',
-                        default=2,
-                        type=int,
-                        help='maximum epochs',
-                        dest='max_epoch')
-    parser.add_argument('-b',
-                        '--batch-size',
-                        default=64,
-                        type=int,
-                        help='batch size',
-                        dest='batch_size')
-    parser.add_argument('-l',
-                        '--learning-rate',
-                        default=0.005,
-                        type=float,
-                        help='initial learning rate',
-                        dest='lr')
-    # Determine which GPU to use
-    parser.add_argument('-i',
-                        '--device-id',
-                        default=0,
-                        type=int,
-                        help='which GPU to use',
-                        dest='device_id')
-    parser.add_argument('-g',
-                        '--disable-graph',
-                        default='True',
-                        action='store_false',
-                        help='disable graph',
-                        dest='graph')
-    parser.add_argument('-v',
-                        '--log-verbosity',
-                        default=0,
-                        type=int,
-                        help='logging verbosity',
-                        dest='verbosity')
-
-    args = parser.parse_args()
-
-    # mssgd = MSSGD(lr=args.lr, momentum=0.9, weight_decay=1e-5, dtype=singa_dtype[args.precision])
-
-    DEFAULT_LAYER_CHOICES_4 = [8, 16, 24, 32]
-    for layer1 in DEFAULT_LAYER_CHOICES_4:
-        for layer2 in 
DEFAULT_LAYER_CHOICES_4: - for layer3 in DEFAULT_LAYER_CHOICES_4: - for layer4 in DEFAULT_LAYER_CHOICES_4: - layer_hidden_list = [layer1, layer2+1, layer3+2, layer4+3] - mssgd = MSSGD(lr=args.lr, momentum=0.9, weight_decay=1e-5, dtype=singa_dtype[args.precision]) - run(0, - 1, - args.device_id, - layer_hidden_list, - args.max_epoch, - args.batch_size, - args.model, - args.data, - mssgd, - args.graph, - args.verbosity, - precision=args.precision) - diff --git a/examples/model_selection_psql/ms_mlp/train_multiprocess.py b/examples/model_selection_psql/ms_mlp/train_multiprocess.py deleted file mode 100644 index 182dd35eed..0000000000 --- a/examples/model_selection_psql/ms_mlp/train_multiprocess.py +++ /dev/null @@ -1,111 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - - -from singa import singa_wrap as singa -from singa import opt -from singa import tensor -import argparse -import train_cnn -import multiprocessing - -singa_dtype = {"float16": tensor.float16, "float32": tensor.float32} - -def run(args, local_rank, world_size, nccl_id): - sgd = opt.SGD(lr=args.lr, momentum=0.9, weight_decay=1e-5, dtype=singa_dtype[args.precision]) - sgd = opt.DistOpt(sgd, nccl_id=nccl_id, local_rank=local_rank, world_size=world_size) - train_cnn.run(sgd.global_rank, sgd.world_size, sgd.local_rank, args.max_epoch, - args.batch_size, args.model, args.data, sgd, args.graph, - args.verbosity, args.dist_option, args.spars, args.precision) - - -if __name__ == '__main__': - # Use argparse to get command config: max_epoch, model, data, etc., for single gpu training - parser = argparse.ArgumentParser( - description='Training using the autograd and graph.') - parser.add_argument('model', - choices=['resnet', 'xceptionnet', 'cnn', 'mlp'], - default='cnn') - parser.add_argument('data', choices=['cifar10', 'cifar100', 'mnist'], default='mnist') - parser.add_argument('-p', - choices=['float32', 'float16'], - default='float32', - dest='precision') - parser.add_argument('-m', - '--max-epoch', - default=10, - type=int, - help='maximum epochs', - dest='max_epoch') - parser.add_argument('-b', - '--batch-size', - default=64, - type=int, - help='batch size', - dest='batch_size') - parser.add_argument('-l', - '--learning-rate', - default=0.005, - type=float, - help='initial learning rate', - dest='lr') - parser.add_argument('-w', - '--world-size', - default=2, - type=int, - help='number of gpus to be used', - dest='world_size') - parser.add_argument('-d', - '--dist-option', - default='plain', - choices=['plain','half','partialUpdate','sparseTopK','sparseThreshold'], - help='distibuted training options', - dest='dist_option') # currently partialUpdate support graph=False only - parser.add_argument('-s', - '--sparsification', - default='0.05', - type=float, - help='the sparsity 
parameter used for sparsification, between 0 to 1', - dest='spars') - parser.add_argument('-g', - '--disable-graph', - default='True', - action='store_false', - help='disable graph', - dest='graph') - parser.add_argument('-v', - '--log-verbosity', - default=0, - type=int, - help='logging verbosity', - dest='verbosity') - - args = parser.parse_args() - - # Generate a NCCL ID to be used for collective communication - nccl_id = singa.NcclIdHolder() - - process = [] - for local_rank in range(0, args.world_size): - process.append( - multiprocessing.Process(target=run, - args=(args, local_rank, args.world_size, nccl_id))) - - for p in process: - p.start() diff --git a/examples/model_selection_psql/ms_model_mlp/model.py b/examples/model_selection_psql/ms_model_mlp/model.py deleted file mode 100644 index 70d1a17487..0000000000 --- a/examples/model_selection_psql/ms_model_mlp/model.py +++ /dev/null @@ -1,224 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -from singa import layer -from singa import model -from singa import tensor -from singa import opt -from singa import device -from singa.autograd import Operator -from singa.layer import Layer -from singa import singa_wrap as singa -import argparse -import numpy as np - -np_dtype = {"float16": np.float16, "float32": np.float32} - -singa_dtype = {"float16": tensor.float16, "float32": tensor.float32} - -#### self-defined loss begin - -### from autograd.py -class SumError(Operator): - - def __init__(self): - super(SumError, self).__init__() - # self.t = t.data - - def forward(self, x): - # self.err = singa.__sub__(x, self.t) - self.data_x = x - # sqr = singa.Square(self.err) - # loss = singa.SumAll(sqr) - loss = singa.SumAll(x) - # self.n = 1 - # for s in x.shape(): - # self.n *= s - # loss /= self.n - return loss - - def backward(self, dy=1.0): - # dx = self.err - dev = device.get_default_device() - dx = tensor.Tensor(self.data_x.shape, dev, singa_dtype['float32']) - dx.copy_from_numpy(np.ones(self.data_x.shape)) - # dx *= float(2 / self.n) - dx *= dy - return dx - -def se_loss(x): - # assert x.shape == t.shape, "input and target shape different: %s, %s" % ( - # x.shape, t.shape) - return SumError()(x)[0] - -### from layer.py -class SumErrorLayer(Layer): - """ - Generate a MeanSquareError operator - """ - - def __init__(self): - super(SumErrorLayer, self).__init__() - - def forward(self, x): - return se_loss(x) - -class MSMLP(model.Model): - - def __init__(self, data_size=10, perceptron_size=100, num_classes=10, layer_hidden_list=[10,10,10,10]): - super(MSMLP, self).__init__() - self.num_classes = num_classes - self.dimension = 2 - - self.relu = layer.ReLU() - self.linear1 = layer.Linear(layer_hidden_list[0]) - self.linear2 = layer.Linear(layer_hidden_list[1]) - self.linear3 = 
layer.Linear(layer_hidden_list[2]) - self.linear4 = layer.Linear(layer_hidden_list[3]) - self.linear5 = layer.Linear(num_classes) - self.softmax_cross_entropy = layer.SoftMaxCrossEntropy() - self.sum_error = SumErrorLayer() - - def forward(self, inputs): - y = self.linear1(inputs) - y = self.relu(y) - y = self.linear2(y) - y = self.relu(y) - y = self.linear3(y) - y = self.relu(y) - y = self.linear4(y) - y = self.relu(y) - y = self.linear5(y) - return y - - def train_one_batch(self, x, y, dist_option, spars, synflow_flag): - # print ("in train_one_batch") - out = self.forward(x) - # print ("train_one_batch x.data: \n", x.data) - # print ("train_one_batch y.data: \n", y.data) - # print ("train_one_batch out.data: \n", out.data) - if synflow_flag: - # print ("sum_error") - loss = self.sum_error(out) - else: # normal training - # print ("softmax_cross_entropy") - loss = self.softmax_cross_entropy(out, y) - # print ("train_one_batch loss.data: \n", loss.data) - - if dist_option == 'plain': - # print ("before pn_p_g_list = self.optimizer(loss)") - pn_p_g_list = self.optimizer(loss) - # print ("after pn_p_g_list = self.optimizer(loss)") - elif dist_option == 'half': - self.optimizer.backward_and_update_half(loss) - elif dist_option == 'partialUpdate': - self.optimizer.backward_and_partial_update(loss) - elif dist_option == 'sparseTopK': - self.optimizer.backward_and_sparse_update(loss, - topK=True, - spars=spars) - elif dist_option == 'sparseThreshold': - self.optimizer.backward_and_sparse_update(loss, - topK=False, - spars=spars) - # print ("len(pn_p_g_list): \n", len(pn_p_g_list)) - # print ("len(pn_p_g_list[0]): \n", len(pn_p_g_list[0])) - # print ("pn_p_g_list[0][0]: \n", pn_p_g_list[0][0]) - # print ("pn_p_g_list[0][1].data: \n", pn_p_g_list[0][1].data) - # print ("pn_p_g_list[0][2].data: \n", pn_p_g_list[0][2].data) - return pn_p_g_list, out, loss - # return pn_p_g_list[0], pn_p_g_list[1], pn_p_g_list[2], out, loss - - def set_optimizer(self, optimizer): - self.optimizer = optimizer - - -def create_model(pretrained=False, **kwargs): - """Constructs a CNN model. - - Args: - pretrained (bool): If True, returns a pre-trained model. - - Returns: - The created CNN model. 
- """ - model = MSMLP(**kwargs) - - return model - - -__all__ = ['MLP', 'create_model'] - -if __name__ == "__main__": - np.random.seed(0) - - parser = argparse.ArgumentParser() - parser.add_argument('-p', - choices=['float32', 'float16'], - default='float32', - dest='precision') - parser.add_argument('-g', - '--disable-graph', - default='True', - action='store_false', - help='disable graph', - dest='graph') - parser.add_argument('-m', - '--max-epoch', - default=1001, - type=int, - help='maximum epochs', - dest='max_epoch') - args = parser.parse_args() - - # generate the boundary - f = lambda x: (5 * x + 1) - bd_x = np.linspace(-1.0, 1, 200) - bd_y = f(bd_x) - - # generate the training data - x = np.random.uniform(-1, 1, 400) - y = f(x) + 2 * np.random.randn(len(x)) - - # choose one precision - precision = singa_dtype[args.precision] - np_precision = np_dtype[args.precision] - - # convert training data to 2d space - label = np.asarray([5 * a + 1 > b for (a, b) in zip(x, y)]).astype(np.int32) - data = np.array([[a, b] for (a, b) in zip(x, y)], dtype=np_precision) - - dev = device.create_cuda_gpu_on(0) - sgd = opt.SGD(0.1, 0.9, 1e-5, dtype=singa_dtype[args.precision]) - tx = tensor.Tensor((400, 2), dev, precision) - ty = tensor.Tensor((400,), dev, tensor.int32) - model = MLP(data_size=2, perceptron_size=3, num_classes=2) - - # attach model to graph - model.set_optimizer(sgd) - model.compile([tx], is_train=True, use_graph=args.graph, sequential=True) - model.train() - - for i in range(args.max_epoch): - tx.copy_from_numpy(data) - ty.copy_from_numpy(label) - out, loss = model(tx, ty, 'fp32', spars=None) - - if i % 100 == 0: - print("training loss = ", tensor.to_numpy(loss)[0]) \ No newline at end of file diff --git a/examples/model_selection_psql/ms_model_mlp/native.py b/examples/model_selection_psql/ms_model_mlp/native.py deleted file mode 100644 index a82ec3b24c..0000000000 --- a/examples/model_selection_psql/ms_model_mlp/native.py +++ /dev/null @@ -1,137 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -from singa import tensor -from singa.tensor import Tensor -from singa import autograd -from singa import opt -import numpy as np -from singa import device -import argparse - -np_dtype = {"float16": np.float16, "float32": np.float32} - -singa_dtype = {"float16": tensor.float16, "float32": tensor.float32} - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument('-p', - choices=['float32', 'float16'], - default='float32', - dest='precision') - parser.add_argument('-m', - '--max-epoch', - default=1001, - type=int, - help='maximum epochs', - dest='max_epoch') - args = parser.parse_args() - - np.random.seed(0) - - autograd.training = True - - # prepare training data in numpy array - - # generate the boundary - f = lambda x: (5 * x + 1) - bd_x = np.linspace(-1.0, 1, 200) - bd_y = f(bd_x) - - # generate the training data - x = np.random.uniform(-1, 1, 400) - y = f(x) + 2 * np.random.randn(len(x)) - - # convert training data to 2d space - label = np.asarray([5 * a + 1 > b for (a, b) in zip(x, y)]) - data = np.array([[a, b] for (a, b) in zip(x, y)], dtype=np.float32) - - def to_categorical(y, num_classes): - """ - Converts a class vector (integers) to binary class matrix. - - Args: - y: class vector to be converted into a matrix - (integers from 0 to num_classes). - num_classes: total number of classes. - - Returns: - A binary matrix representation of the input. - """ - y = np.array(y, dtype="int") - n = y.shape[0] - categorical = np.zeros((n, num_classes)) - categorical[np.arange(n), y] = 1 - return categorical - - label = to_categorical(label, 2).astype(np.float32) - print("train_data_shape:", data.shape) - print("train_label_shape:", label.shape) - - precision = singa_dtype[args.precision] - np_precision = np_dtype[args.precision] - - dev = device.create_cuda_gpu() - - inputs = Tensor(data=data, device=dev) - target = Tensor(data=label, device=dev) - - inputs = inputs.as_type(precision) - target = target.as_type(tensor.int32) - - w0_np = np.random.normal(0, 0.1, (2, 3)).astype(np_precision) - w0 = Tensor(data=w0_np, - device=dev, - dtype=precision, - requires_grad=True, - stores_grad=True) - b0 = Tensor(shape=(3,), - device=dev, - dtype=precision, - requires_grad=True, - stores_grad=True) - b0.set_value(0.0) - - w1_np = np.random.normal(0, 0.1, (3, 2)).astype(np_precision) - w1 = Tensor(data=w1_np, - device=dev, - dtype=precision, - requires_grad=True, - stores_grad=True) - b1 = Tensor(shape=(2,), - device=dev, - dtype=precision, - requires_grad=True, - stores_grad=True) - b1.set_value(0.0) - - sgd = opt.SGD(0.05, 0.8) - - # training process - for i in range(args.max_epoch): - x = autograd.matmul(inputs, w0) - x = autograd.add_bias(x, b0) - x = autograd.relu(x) - x = autograd.matmul(x, w1) - x = autograd.add_bias(x, b1) - loss = autograd.softmax_cross_entropy(x, target) - sgd(loss) - - if i % 100 == 0: - print("%d, training loss = " % i, tensor.to_numpy(loss)[0]) diff --git a/examples/model_selection_psql/msmlp/model.py b/examples/model_selection_psql/msmlp/model.py deleted file mode 100644 index 70bc2341d4..0000000000 --- a/examples/model_selection_psql/msmlp/model.py +++ /dev/null @@ -1,209 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -from singa import layer -from singa import model -from singa import tensor -from singa import opt -from singa import device -from singa.autograd import Operator -from singa.layer import Layer -from singa import singa_wrap as singa -import argparse -import numpy as np - -np_dtype = {"float16": np.float16, "float32": np.float32} - -singa_dtype = {"float16": tensor.float16, "float32": tensor.float32} - -### refer to autograd.py -class SumError(Operator): - - def __init__(self): - super(SumError, self).__init__() - # self.t = t.data - - def forward(self, x): - # self.err = singa.__sub__(x, self.t) - self.data_x = x - # sqr = singa.Square(self.err) - # loss = singa.SumAll(sqr) - loss = singa.SumAll(x) - # self.n = 1 - # for s in x.shape(): - # self.n *= s - # loss /= self.n - return loss - - def backward(self, dy=1.0): - # dx = self.err - dev = device.get_default_device() - dx = tensor.Tensor(self.data_x.shape, dev, singa_dtype['float32']) - dx.copy_from_numpy(np.ones(self.data_x.shape)) - # dx *= float(2 / self.n) - dx *= dy - return dx - -def se_loss(x): - return SumError()(x)[0] - -### refer to layer.py -class SumErrorLayer(Layer): - """ - Generate a SumError Layer - """ - - def __init__(self): - super(SumErrorLayer, self).__init__() - - def forward(self, x): - return se_loss(x) - -class MSMLP(model.Model): - - def __init__(self, data_size=10, perceptron_size=100, num_classes=10): - super(MSMLP, self).__init__() - self.num_classes = num_classes - self.dimension = 2 - - self.relu = layer.ReLU() - self.linear1 = layer.Linear(perceptron_size) - self.linear2 = layer.Linear(num_classes) - self.softmax_cross_entropy = layer.SoftMaxCrossEntropy() - self.sum_error = SumErrorLayer() # for synflow backward - - def forward(self, inputs): - y = self.linear1(inputs) - y = self.relu(y) - y = self.linear2(y) - return y - - def train_one_batch(self, x, y, synflow_flag, dist_option, spars): - # print ("in train_one_batch") - out = self.forward(x) - # print ("train_one_batch x.data: \n", x.data) - # print ("train_one_batch y.data: \n", y.data) - # print ("train_one_batch out.data: \n", out.data) - if synflow_flag: - loss = self.sum_error(out) - # print ("sum_error") - else: # normal training - loss = self.softmax_cross_entropy(out, y) - - if dist_option == 'plain': - # print ("before pn_p_g_list = self.optimizer(loss)") - pn_p_g_list = self.optimizer(loss) - # print ("after pn_p_g_list = self.optimizer(loss)") - elif dist_option == 'half': - self.optimizer.backward_and_update_half(loss) - elif dist_option == 'partialUpdate': - self.optimizer.backward_and_partial_update(loss) - elif dist_option == 'sparseTopK': - self.optimizer.backward_and_sparse_update(loss, - topK=True, - spars=spars) - elif dist_option == 'sparseThreshold': - self.optimizer.backward_and_sparse_update(loss, - topK=False, - spars=spars) - # print ("len(pn_p_g_list): \n", len(pn_p_g_list)) - # print ("len(pn_p_g_list[0]): \n", len(pn_p_g_list[0])) - # print 
("pn_p_g_list[0][0]: \n", pn_p_g_list[0][0]) - # print ("pn_p_g_list[0][1].data: \n", pn_p_g_list[0][1].data) - # print ("pn_p_g_list[0][2].data: \n", pn_p_g_list[0][2].data) - return pn_p_g_list, out, loss - # return pn_p_g_list[0], pn_p_g_list[1], pn_p_g_list[2], out, loss - - def set_optimizer(self, optimizer): - self.optimizer = optimizer - - -def create_model(pretrained=False, **kwargs): - """Constructs a CNN model. - - Args: - pretrained (bool): If True, returns a pre-trained model. - - Returns: - The created CNN model. - """ - model = MSMLP(**kwargs) - - return model - - -__all__ = ['MLP', 'create_model'] - -if __name__ == "__main__": - np.random.seed(0) - - parser = argparse.ArgumentParser() - parser.add_argument('-p', - choices=['float32', 'float16'], - default='float32', - dest='precision') - parser.add_argument('-g', - '--disable-graph', - default='True', - action='store_false', - help='disable graph', - dest='graph') - parser.add_argument('-m', - '--max-epoch', - default=1001, - type=int, - help='maximum epochs', - dest='max_epoch') - args = parser.parse_args() - - # generate the boundary - f = lambda x: (5 * x + 1) - bd_x = np.linspace(-1.0, 1, 200) - bd_y = f(bd_x) - - # generate the training data - x = np.random.uniform(-1, 1, 400) - y = f(x) + 2 * np.random.randn(len(x)) - - # choose one precision - precision = singa_dtype[args.precision] - np_precision = np_dtype[args.precision] - - # convert training data to 2d space - label = np.asarray([5 * a + 1 > b for (a, b) in zip(x, y)]).astype(np.int32) - data = np.array([[a, b] for (a, b) in zip(x, y)], dtype=np_precision) - - dev = device.create_cuda_gpu_on(0) - sgd = opt.SGD(0.1, 0.9, 1e-5, dtype=singa_dtype[args.precision]) - tx = tensor.Tensor((400, 2), dev, precision) - ty = tensor.Tensor((400,), dev, tensor.int32) - model = MLP(data_size=2, perceptron_size=3, num_classes=2) - - # attach model to graph - model.set_optimizer(sgd) - model.compile([tx], is_train=True, use_graph=args.graph, sequential=True) - model.train() - - for i in range(args.max_epoch): - tx.copy_from_numpy(data) - ty.copy_from_numpy(label) - out, loss = model(tx, ty, 'fp32', spars=None) - - if i % 100 == 0: - print("training loss = ", tensor.to_numpy(loss)[0]) diff --git a/examples/model_selection_psql/msmlp/native.py b/examples/model_selection_psql/msmlp/native.py deleted file mode 100644 index a82ec3b24c..0000000000 --- a/examples/model_selection_psql/msmlp/native.py +++ /dev/null @@ -1,137 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -from singa import tensor -from singa.tensor import Tensor -from singa import autograd -from singa import opt -import numpy as np -from singa import device -import argparse - -np_dtype = {"float16": np.float16, "float32": np.float32} - -singa_dtype = {"float16": tensor.float16, "float32": tensor.float32} - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument('-p', - choices=['float32', 'float16'], - default='float32', - dest='precision') - parser.add_argument('-m', - '--max-epoch', - default=1001, - type=int, - help='maximum epochs', - dest='max_epoch') - args = parser.parse_args() - - np.random.seed(0) - - autograd.training = True - - # prepare training data in numpy array - - # generate the boundary - f = lambda x: (5 * x + 1) - bd_x = np.linspace(-1.0, 1, 200) - bd_y = f(bd_x) - - # generate the training data - x = np.random.uniform(-1, 1, 400) - y = f(x) + 2 * np.random.randn(len(x)) - - # convert training data to 2d space - label = np.asarray([5 * a + 1 > b for (a, b) in zip(x, y)]) - data = np.array([[a, b] for (a, b) in zip(x, y)], dtype=np.float32) - - def to_categorical(y, num_classes): - """ - Converts a class vector (integers) to binary class matrix. - - Args: - y: class vector to be converted into a matrix - (integers from 0 to num_classes). - num_classes: total number of classes. - - Returns: - A binary matrix representation of the input. - """ - y = np.array(y, dtype="int") - n = y.shape[0] - categorical = np.zeros((n, num_classes)) - categorical[np.arange(n), y] = 1 - return categorical - - label = to_categorical(label, 2).astype(np.float32) - print("train_data_shape:", data.shape) - print("train_label_shape:", label.shape) - - precision = singa_dtype[args.precision] - np_precision = np_dtype[args.precision] - - dev = device.create_cuda_gpu() - - inputs = Tensor(data=data, device=dev) - target = Tensor(data=label, device=dev) - - inputs = inputs.as_type(precision) - target = target.as_type(tensor.int32) - - w0_np = np.random.normal(0, 0.1, (2, 3)).astype(np_precision) - w0 = Tensor(data=w0_np, - device=dev, - dtype=precision, - requires_grad=True, - stores_grad=True) - b0 = Tensor(shape=(3,), - device=dev, - dtype=precision, - requires_grad=True, - stores_grad=True) - b0.set_value(0.0) - - w1_np = np.random.normal(0, 0.1, (3, 2)).astype(np_precision) - w1 = Tensor(data=w1_np, - device=dev, - dtype=precision, - requires_grad=True, - stores_grad=True) - b1 = Tensor(shape=(2,), - device=dev, - dtype=precision, - requires_grad=True, - stores_grad=True) - b1.set_value(0.0) - - sgd = opt.SGD(0.05, 0.8) - - # training process - for i in range(args.max_epoch): - x = autograd.matmul(inputs, w0) - x = autograd.add_bias(x, b0) - x = autograd.relu(x) - x = autograd.matmul(x, w1) - x = autograd.add_bias(x, b1) - loss = autograd.softmax_cross_entropy(x, target) - sgd(loss) - - if i % 100 == 0: - print("%d, training loss = " % i, tensor.to_numpy(loss)[0]) diff --git a/examples/model_selection_psql/pkg_model_code/model.py b/examples/model_selection_psql/pkg_model_code/model.py deleted file mode 100644 index 5e84e47ca4..0000000000 --- a/examples/model_selection_psql/pkg_model_code/model.py +++ /dev/null @@ -1,366 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# ============================================================================= -''' -This script includes Model class for python users -to use Computational Graph in their model. -''' - -import os -import gc -import time -import json -import zipfile -import numpy as np -from functools import wraps -# from collections import Iterable -try: - from collections.abc import Iterable -except ImportError: - from collections import Iterable - -from singa import tensor -from singa import autograd -from singa import layer -from .tensor import Tensor -from . import singa_wrap as singa - -class ModelMeta(layer.LayerMeta): - - def buffer_operation(func): - - def remove_creator(tensors): - if not tensors: - return - - # if isinstance(tensors, Iterable): - # if isinstance(tensors, str): - # return - # else: - # for item in tensors: - # if isinstance(item, Iterable): - # remove_creator(item) - # elif isinstance(item, tensor.Tensor): - # item.creator = None - if isinstance(tensors, Iterable): - for item in tensors: - if isinstance(item, Iterable): - remove_creator(item) - elif isinstance(item, tensor.Tensor): - item.creator = None - elif isinstance(tensors, tensor.Tensor): - tensors.creator = None - - @wraps(func) - def wrapper(self, *args, **kwargs): - if self.graph_mode and self.training: - if len(args) == 0: - raise ValueError('expect at least one input tensor') - - if isinstance(args[0], list): - assert isinstance( - args[0][0], - Tensor), ('function expects PlaceHolders or Tensors') - dev = args[0][0].device - else: - assert isinstance( - args[0], - Tensor), ('function expects PlaceHolders or Tensors') - dev = args[0].device - - if not self._buffered: - # buffer operations - dev.EnableGraph(True) - self._results = func(self, *args, **kwargs) - dev.Sync() - dev.EnableGraph(False) - self._buffered = True - - # deconstruct Operations before running the entire graph - remove_creator(self._results) - - # make sure all Operations are deallocated - gc.collect() - - # run graph - dev.RunGraph(self.sequential) - return self._results - else: - return func(self, *args, **kwargs) - - return wrapper - - def __new__(cls, name, bases, attr): - if 'train_one_batch' in attr: - attr['train_one_batch'] = ModelMeta.buffer_operation( - attr['train_one_batch']) - - return super(ModelMeta, cls).__new__(cls, name, bases, attr) - - -class Model(layer.Layer, metaclass=ModelMeta): - """ Base class for your neural network models. 
- - Example usage:: - - import numpy as np - from singa import opt - from singa import tensor - from singa import device - from singa import autograd - from singa import layer - from singa import model - - class MyModel(model.Model): - def __init__(self): - super(MyModel, self).__init__() - - self.softmax_cross_entropy = layer.SoftMaxCrossEntropy() - self.conv1 = layer.Conv2d(1, 20, 5, padding=0) - self.conv2 = layer.Conv2d(20, 50, 5, padding=0) - self.sgd = opt.SGD(lr=0.01) - - def forward(self, x): - y = self.conv1(x) - y = self.conv2(y) - return y - - def train_one_batch(self, x, y): - out = self.forward(x) - loss = self.softmax_cross_entropy(out, y) - self.sgd(loss) - return out, loss - - """ - - # save load states constant - TENSOR_DICT_FILENAME = '/tensor_dict.npz' - STATES_ATTR_FILENAME = '/states_attr.json' - MODEL_STATE_TYPE = 0 - AUX_STATE_TYPE = 1 - - def __init__(self): - """ - Initializes internal Model state - """ - super(Model, self).__init__() - - self.training = True - self.graph_mode = True - self.sequential = False - self._buffered = False - self._results = None - - def compile(self, inputs, is_train=True, use_graph=False, sequential=False): - """ Compile and initialize the model - - This function will automatically derive the shape of parameters - in each sublayer based on the shape of input placeholders. It will - also do some settings. - - Args: - inputs(list): the list of input tensors(placeholders) - is_train(bool): when is_trainis True, this model will enter - training mode, otherwise it will enter the evaluation mode - use_graph(bool): when use_graph is True, computational graph - will be used to train this model - sequential(bool): when sequential is True, model will execute ops - in the graph follow the order of joining the graph - """ - assert len(inputs) > 0 and isinstance(inputs[0], Tensor), ( - 'compile function expects PlaceHolders or Tensors') - - dev = inputs[0].device - dev.EnableGraph(True) - self.forward(*inputs) - dev.EnableGraph(False) - dev.ResetGraph() - - autograd.training = is_train - self.training = is_train - self.graph_mode = use_graph - self.sequential = sequential - - def forward(self, *input): - """Defines the computation performed in every forward propagation. - - Should be overridden by all subclasses. - - Args: - *input: the input training data for the model - - Returns: - out: the outputs of the forward propagation. - """ - raise NotImplementedError - - def train_one_batch(self, *input, **kwargs): - """Defines the computation performed in every training iteration - - Should be overridden by all subclasses. - - Args: - *input: the arguments of train_one_batch - **kwargs: the keyword arguments of train_one_batch - """ - raise NotImplementedError - - def train(self, mode=True): - """Set the model in evaluation mode. - - Args: - mode(bool): when mode is True, this model will enter training mode - """ - self.training = mode - autograd.training = mode - - def eval(self): - """Sets the model in evaluation mode. - """ - self.train(mode=False) - - def graph(self, mode=True, sequential=False): - """ Turn on the computational graph. Specify execution mode. 
- - Args: - mode(bool): when mode is True, model will use computational graph - sequential(bool): when sequential is True, model will execute ops - in the graph follow the order of joining the graph - """ - self.graph_mode = mode - self.sequential = sequential - - def __get_name__(self): - return self.__class__.__name__ - - def __call__(self, *input, **kwargs): - if self.training: - return self.train_one_batch(*input, **kwargs) - else: - return self.forward(*input, **kwargs) - - def save_states(self, fpath, aux_states={}): - """Save states. - - Args: - fpath: output file path (without the extension) - aux_states(dict): values are standard data types or Tensor, - e.g., epoch ID, learning rate, optimizer states - """ - assert not os.path.isfile(fpath), ( - "Failed to save states, %s is already existed." % fpath) - - states = self.get_states() - - # save states data and attr - tensor_dict = {} - states_attr = {} - for k, v in states.items(): - assert isinstance(v, tensor.Tensor), "Only tensor state is allowed" - tensor_dict[k] = tensor.to_numpy(v) - states_attr[k] = { - 'state_type': self.MODEL_STATE_TYPE, - 'shape': v.shape, - 'dtype': v.dtype - } - - for k, v in aux_states.items(): - assert isinstance(v, - tensor.Tensor), "Only tensor aux state is allowed" - tensor_dict[k] = tensor.to_numpy(v) - states_attr[k] = { - 'state_type': self.AUX_STATE_TYPE, - 'shape': v.shape, - 'dtype': v.dtype - } - - # save to files - timestamp = time.time() - tmp_dir = '/tmp/singa_save_states_%s' % timestamp - os.mkdir(tmp_dir) - tensor_dict_fp = tmp_dir + self.TENSOR_DICT_FILENAME - states_attr_fp = tmp_dir + self.STATES_ATTR_FILENAME - - np.savez(tensor_dict_fp, **tensor_dict) - - with open(states_attr_fp, 'w') as fp: - json.dump(states_attr, fp) - - compression = zipfile.ZIP_DEFLATED - with zipfile.ZipFile(fpath, mode="w") as zf: - zf.write(tensor_dict_fp, - os.path.basename(tensor_dict_fp), - compress_type=compression) - zf.write(states_attr_fp, - os.path.basename(states_attr_fp), - compress_type=compression) - - # clean up tmp files - os.remove(tensor_dict_fp) - os.remove(states_attr_fp) - os.rmdir(tmp_dir) - - def load_states(self, fpath): - """Load the model states and auxiliary states from disk. - - Usage: - m = MyModel() - m.compile(...) - aux_states = m.load_states('mymodel.zip') - - Args: - path: input file path (without the extension) - Returns: - dict - """ - - assert os.path.isfile(fpath), ( - "Failed to load states, %s is not exist." 
% fpath) - - timestamp = time.time() - tmp_dir = '/tmp/singa_load_states_%s' % timestamp - os.mkdir(tmp_dir) - - with zipfile.ZipFile(fpath, 'r') as zf: - zf.extractall(tmp_dir) - - tensor_dict_fp = tmp_dir + self.TENSOR_DICT_FILENAME - states_attr_fp = tmp_dir + self.STATES_ATTR_FILENAME - - with open(states_attr_fp) as f: - states_attr = json.load(f) - - tensor_dict = np.load(tensor_dict_fp) - - # restore singa tensor from numpy - model_states = dict() - aux_states = dict() - - for k in tensor_dict.files: - if states_attr[k]['state_type'] == self.MODEL_STATE_TYPE: - model_states[k] = tensor.from_numpy(tensor_dict[k]) - elif states_attr[k]['state_type'] == self.AUX_STATE_TYPE: - aux_states[k] = tensor.from_numpy(tensor_dict[k]) - - # restore model_states - self.set_states(model_states) - - # clean up tmp files - os.remove(tensor_dict_fp) - os.remove(states_attr_fp) - os.rmdir(tmp_dir) - return aux_states diff --git a/java/pom.xml b/java/pom.xml index 1f68adbc66..6ff9568537 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -87,6 +87,18 @@ java/target/* miniconda.sh **/*.json + doc/_static/*.png + doc/_static/*.gif + doc/_static/*.ai + doc/_static/images/*.png + examples/model_selection/TRAILS-Database-Native-Model-Selection/requirement.txt + examples/model_selection/TRAILS-Database-Native-Model-Selection/documents/*.png + examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/init_env + examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/requirement.txt + examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/Cargo.toml + examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/pg_extension.control + examples/singa_easy/examples/data/SampleQuestion.json + examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/documents/imgs/*.png True