From e0e8f3d45abd449d38be6e4c031426a8df97b50d Mon Sep 17 00:00:00 2001 From: Yukuo Cen Date: Wed, 27 Oct 2021 17:33:24 +0800 Subject: [PATCH] [Docs] Update docs & examples (#292) * Fix docs * Update docs --- README.md | 2 +- README_CN.md | 2 +- cogdl/__init__.py | 2 +- cogdl/datasets/__init__.py | 2 + cogdl/datasets/customized_data.py | 11 +- cogdl/experiments.py | 5 +- cogdl/models/README.md | 144 ------------------- cogdl/models/__init__.py | 2 + cogdl/options.py | 9 ++ cogdl/trainer/embed_trainer.py | 24 ++-- cogdl/trainer/trainer.py | 17 ++- docs/source/tutorial/custom_dataset.rst | 67 ++++----- docs/source/tutorial/custom_gnn.rst | 2 - docs/source/tutorial/graph.rst | 2 +- docs/source/tutorial/node_classification.rst | 50 +++---- examples/custom_dataset.py | 16 ++- gnn_papers.md | 2 +- 17 files changed, 106 insertions(+), 253 deletions(-) diff --git a/README.md b/README.md index 23c421b9..e0f88cbc 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ We summarize the contributions of CogDL as follows: ## ❗ News -- The new **v0.5.0b1 pre-release** designs and implements a unified training loop for GNN. It introduces `DataWrapper` to help prepare the training/validation/test data and `ModelWrapper` to define the training/validation/test steps. +- The new **v0.5.0-alpha0 pre-release** designs and implements a unified training loop for GNN. It introduces `DataWrapper` to help prepare the training/validation/test data and `ModelWrapper` to define the training/validation/test steps. - The new **v0.4.1 release** adds the implementation of Deep GNNs and the recommendation task. It also supports new pipelines for generating embeddings and recommendation. Welcome to join our tutorial on KDD 2021 at 10:30 am - 12:00 am, Aug. 14th (Singapore Time). More details can be found in https://kdd2021graph.github.io/. 🎉 diff --git a/README_CN.md b/README_CN.md index 8b2262cf..4cd3e0a7 100644 --- a/README_CN.md +++ b/README_CN.md @@ -21,7 +21,7 @@ CogDL的特性包括: ## ❗ 最新 -- 最新的 **v0.5.0b1 pre-release** 为图神经网络的训练设计了一套统一的流程. 这个版本去除了原先的`Task`类,引入了`DataWrapper`来准备training/validation/test过程中所需的数据,引入了`ModelWrapper`来定义模型training/validation/test的步骤. +- 最新的 **v0.5.0-alpha0 pre-release** 为图神经网络的训练设计了一套统一的流程. 这个版本去除了原先的`Task`类,引入了`DataWrapper`来准备training/validation/test过程中所需的数据,引入了`ModelWrapper`来定义模型training/validation/test的步骤. - 最新的 **v0.4.1 release** 增加了深层GNN的实现和推荐任务。这个版本同时提供了新的一些pipeline用于直接获取图表示和搭建推荐应用。欢迎大家参加我们在KDD 2021上的tutorial,时间是8月14号上午10:30 - 12:00(北京时间)。 更多的内容可以查看 https://kdd2021graph.github.io/. 
🎉 diff --git a/cogdl/__init__.py b/cogdl/__init__.py index 14cb1f51..5fbebbca 100644 --- a/cogdl/__init__.py +++ b/cogdl/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.5.0b1" +__version__ = "0.5.0-alpha0" from .experiments import experiment from .oag import oagbert diff --git a/cogdl/datasets/__init__.py b/cogdl/datasets/__init__.py index 0d851455..1020ce21 100644 --- a/cogdl/datasets/__init__.py +++ b/cogdl/datasets/__init__.py @@ -38,6 +38,8 @@ def try_adding_dataset_args(dataset, parser): def build_dataset_from_name(dataset): + if isinstance(dataset, list): + dataset = dataset[0] if dataset in SUPPORTED_DATASETS: path = ".".join(SUPPORTED_DATASETS[dataset].split(".")[:-1]) module = importlib.import_module(path) diff --git a/cogdl/datasets/customized_data.py b/cogdl/datasets/customized_data.py index 242349d4..89292809 100644 --- a/cogdl/datasets/customized_data.py +++ b/cogdl/datasets/customized_data.py @@ -64,11 +64,12 @@ def __init__(self, path="data.pt", data=None, scale_feat=True, metric="auto"): self.path = path self.data = data super(NodeDataset, self).__init__(root=path) - try: - self.data = torch.load(path) - except Exception as e: - print(e) - exit(1) + if self.data is None: + try: + self.data = torch.load(path) + except Exception as e: + print(e) + exit(1) if scale_feat: self.data = scale_feats(self.data) self.metric = metric diff --git a/cogdl/experiments.py b/cogdl/experiments.py index 5f3b26cd..4e64e777 100644 --- a/cogdl/experiments.py +++ b/cogdl/experiments.py @@ -190,7 +190,6 @@ def train(args): # noqa: C901 else: model_wrapper = mw_class(model, optimizer_cfg, **model_wrapper_args) - save_embedding_path = args.emb_path if hasattr(args, "emb_path") else None os.makedirs("./checkpoints", exist_ok=True) # setup controller @@ -198,12 +197,14 @@ def train(args): # noqa: C901 max_epoch=args.max_epoch, device_ids=args.devices, cpu=args.cpu, - save_embedding_path=save_embedding_path, + save_emb_path=args.save_emb_path, + load_emb_path=args.load_emb_path, cpu_inference=args.cpu_inference, # monitor=args.monitor, progress_bar=args.progress_bar, distributed_training=args.distributed, checkpoint_path=args.checkpoint_path, + resume_training=args.resume_training, patience=args.patience, logger=args.logger, log_path=args.log_path, diff --git a/cogdl/models/README.md b/cogdl/models/README.md index 0ef9ea3b..7597e184 100644 --- a/cogdl/models/README.md +++ b/cogdl/models/README.md @@ -9,8 +9,6 @@ CogDL now supports the following models for different tasks: - heterogeneous node classification (异构结点分类): GTN [(Yun et al, NeurIPS'19)](https://arxiv.org/abs/1911.06455), HAN [(Xiao et al, WWW'19)](https://arxiv.org/abs/1903.07293), PTE [(Tang et al, KDD'15)](https://arxiv.org/abs/1508.00200), Metapath2vec [(Dong et al, KDD'17)](https://ericdongyx.github.io/papers/KDD17-dong-chawla-swami-metapath2vec.pdf), Hin2vec [(Fu et al, CIKM'17)](https://dl.acm.org/doi/10.1145/3132847.3132953). 
-- link prediction (链接预测): ProNE [(Zhang et al, IJCAI'19)](https://www.ijcai.org/Proceedings/2019/0594.pdf), NetMF [(Qiu et al, WSDM'18)](http://arxiv.org/abs/1710.02971), Node2vec [(Grover et al, KDD'16)](http://dl.acm.org/citation.cfm?doid=2939672.2939754), DeepWalk [(Perozzi et al, KDD'14)](http://arxiv.org/abs/1403.6652), LINE [(Tang et al, WWW'15)](http://arxiv.org/abs/1503.03578), Hope [(Ou et al, KDD'16)](http://dl.acm.org/citation.cfm?doid=2939672.2939751), NetSMF [(Qiu et at, WWW'19)](https://arxiv.org/abs/1906.11156), SDNE [(Wang et al, KDD'16)](https://www.kdd.org/kdd2016/papers/files/rfp0191-wangAemb.pdf). - - multiplex link prediction (多重边链接预测): GATNE [(Cen et al, KDD'19)](https://arxiv.org/abs/1905.01669), NetMF [(Qiu et al, WSDM'18)](http://arxiv.org/abs/1710.02971), ProNE [(Zhang et al, IJCAI'19)](https://www.ijcai.org/Proceedings/2019/0594.pdf), Node2vec [(Grover et al, KDD'16)](http://dl.acm.org/citation.cfm?doid=2939672.2939754), DeepWalk [(Perozzi et al, KDD'14)](http://arxiv.org/abs/1403.6652), LINE [(Tang et al, WWW'15)](http://arxiv.org/abs/1503.03578), Hope [(Ou et al, KDD'16)](http://dl.acm.org/citation.cfm?doid=2939672.2939751), GraRep [(Cao et al, CIKM'15)](http://dl.acm.org/citation.cfm?doid=2806416.2806512). - unsupervised graph classification (无监督图分类): Infograph [(Sun et al, ICLR'20)](https://openreview.net/forum?id=r1lfF2NYvH), Graph2Vec [(Narayanan et al, CoRR'17)](https://arxiv.org/abs/1707.05005), DGK [(Yanardag et al, KDD'15)](https://dl.acm.org/doi/10.1145/2783258.2783417). @@ -24,145 +22,3 @@ CogDL now supports the following models for different tasks: 4) make install 5) export METIS_DLL=~/.local/lib/libmetis.so 6) pip install metis - -## Specific parameters - -for DeepWalk and node2vec: - -- --walk-num, the number of random walks to start at each node; the default is 10; -- --walk-length, Length of walk start at each node. Default is 50; -- --worker, Number of parallel workers. Default is 10; -- --window-size, Window size of skip-gram model. Default is 10; -- --iteration, Number of iterations. Default is 10; -- --q, Parameter in node2vec. Default is 1.0; -- --p, Parameter in node2vec. Default is 1.0; - -for LINE: - -- --order, Order of proximity in LINE. Default is 3 for 1+2; -- --alpha, Initial earning rate of SGD. Default is 0.025; -- --batch-size, Batch size in SGD training process. Default is 100; -- --negative, Number of negative nodes in sampling. Default is 5; - -for SGC-PN: - -- --dropout, Dropout rate in SGC-PN. Default is 0.01; -- --num-layers, Number of layers in SGC-PN. Default is 40; -- --norm-mode, Mode for PairNorm in SGC-PN. Default is "PN"; -- --norm-scale, Row-normalization scale in SGC-PN. Default is 10; - -for HOPE: - -- --beta, Parameter of katz for HOPE. Default is 0.01; - -for Grarep: - -- --step, Number of matrix step in GraRep and ProNE. Default is 5; - -for NetMF: - -- --window-size, Window size of deepwalk matrix. Default is 10; -- --is-large, Large or small for NetMF; -- --negative, Number of negative nodes in sampling. Default is 5; -- --rank, Number of Eigenpairs in NetMF, default is 256; - -for NetSMF: - -- --window-size, Window size of approximate matrix. Default is 10; -- --negative, Number of negative nodes in sampling. Default is 5; -- --round, Number of round in NetSMF. Default is 100; -- --worker, Number of parallel workers. Default is 10; - -for ProNE: - -- --step, Number of items in the chebyshev expansion. Default is 5; -- --theta, Parameter of ProNE. Default is 0.5; -- --mu, Parameter of ProNE. 
Default is 0.2; - -for GCN and DR-GCN: - -- --hidden-size, The size of hidden layer. Default=16; -- --num-layers, The number of GCN layer. Default=2; -- --dropout, The dropout probability. Default=0.5; - -for GAT and DR-GAT: - -- --hidden-size, The size of hidden layer. Default=8; -- --num-heads, The number of heads in attention mechanism. Default=8; -- --dropout, The dropout probability. Default=0.6; - -for Graphsage: - -- --hidden-size, The size of hidden layer. Default=8; -- --num-layers, The number of Graphsage. Default=2; -- --sample-size, The List of number of neighbor samples for each node in Graphsage. Default=10, 10; -- --dropout, The dropout probability. Default=0.5; - - -## References -[1] Zhao, Lingxiao, and Leman Akoglu. "Pairnorm: Tackling oversmoothing in gnns." arXiv preprint arXiv:1909.12223 (2019). - -[2] Sun, Fan-Yun, Jordan Hoffmann, and Jian Tang. "InfoGraph: Unsupervised and Semi-supervised Graph-Level Representation Learning via Mutual Information Maximization." arXiv preprint arXiv:1908.01000 (2019). - -[3] Qiu, Jiezhong, Yuxiao Dong, Hao Ma, Jian Li, Chi Wang, Kuansan Wang, and Jie Tang. "Netsmf: Large-scale network embedding as sparse matrix factorization." In The World Wide Web Conference, pp. 1509-1520. 2019. - -[4] Cen, Yukuo, Xu Zou, Jianwei Zhang, Hongxia Yang, Jingren Zhou, and Jie Tang. "Representation learning for attributed multiplex heterogeneous network." In Proceedings of the 25th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, pp. 1358-1368. 2019. - -[5] Zhang, Jie, Yuxiao Dong, Yan Wang, Jie Tang, and Ming Ding. "ProNE: fast and scalable network representation learning." In Proc. 28th Int. Joint Conf. Artif. Intell., IJCAI, pp. 4278-4284. 2019. - -[6] Zou, Xu, Qiuye Jia, Jianwei Zhang, Chang Zhou, Hongxia Yang, and Jie Tang. "Dimensional Reweighting Graph Convolutional Networks." arXiv preprint arXiv:1907.02237 (2019). - -[7] Gao, Hongyang, and Shuiwang Ji. "Graph u-nets." arXiv preprint arXiv:1905.05178 (2019). - -[8] Abu-El-Haija, Sami, Bryan Perozzi, Amol Kapoor, Nazanin Alipourfard, Kristina Lerman, Hrayr Harutyunyan, Greg Ver Steeg, and Aram Galstyan. "Mixhop: Higher-order graph convolutional architectures via sparsified neighborhood mixing." arXiv preprint arXiv:1905.00067 (2019). - -[9] Veličković, Petar, William Fedus, William L. Hamilton, Pietro Liò, Yoshua Bengio, and R. Devon Hjelm. "Deep graph infomax." arXiv preprint arXiv:1809.10341 (2018). - -[10] Yun, Seongjun, Minbyul Jeong, Raehyun Kim, Jaewoo Kang, and Hyunwoo J. Kim. "Graph Transformer Networks." In Advances in Neural Information Processing Systems, pp. 11960-11970. 2019. - -[11] Wang, Xiao, Houye Ji, Chuan Shi, Bai Wang, Yanfang Ye, Peng Cui, and Philip S. Yu. "Heterogeneous graph attention network." In The World Wide Web Conference, pp. 2022-2032. 2019. - -[12] Xu, Keyulu, Weihua Hu, Jure Leskovec, and Stefanie Jegelka. "How powerful are graph neural networks?." arXiv preprint arXiv:1810.00826 (2018). - -[13] Qiu, Jiezhong, Yuxiao Dong, Hao Ma, Jian Li, Kuansan Wang, and Jie Tang. "Network embedding as matrix factorization: Unifying deepwalk, line, pte, and node2vec." In Proceedings of the Eleventh ACM International Conference on Web Search and Data Mining, pp. 459-467. 2018. - -[14] Veličković, Petar, Guillem Cucurull, Arantxa Casanova, Adriana Romero, Pietro Lio, and Yoshua Bengio. "Graph attention networks." arXiv preprint arXiv:1710.10903 (2017). - -[15] Ying, Zhitao, Jiaxuan You, Christopher Morris, Xiang Ren, Will Hamilton, and Jure Leskovec. 
"Hierarchical graph representation learning with differentiable pooling." In Advances in neural information processing systems, pp. 4800-4810. 2018. - -[16] Zhang, Muhan, Zhicheng Cui, Marion Neumann, and Yixin Chen. "An end-to-end deep learning architecture for graph classification." In Thirty-Second AAAI Conference on Artificial Intelligence. 2018. - -[17] Kipf, Thomas N., and Max Welling. "Semi-supervised classification with graph convolutional networks." arXiv preprint arXiv:1609.02907 (2016). - -[18] Hamilton, Will, Zhitao Ying, and Jure Leskovec. "Inductive representation learning on large graphs." In Advances in neural information processing systems, pp. 1024-1034. 2017. - -[19] Dong, Yuxiao, Nitesh V. Chawla, and Ananthram Swami. "metapath2vec: Scalable representation learning for heterogeneous networks." In Proceedings of the 23rd ACM SIGKDD international conference on knowledge discovery and data mining, pp. 135-144. 2017. - -[20] Fu, Tao-yang, Wang-Chien Lee, and Zhen Lei. "Hin2vec: Explore meta-paths in heterogeneous information networks for representation learning." In Proceedings of the 2017 ACM on Conference on Information and Knowledge Management, pp. 1797-1806. 2017. - -[21] Narayanan, Annamalai, Mahinthan Chandramohan, Rajasekar Venkatesan, Lihui Chen, Yang Liu, and Shantanu Jaiswal. "graph2vec: Learning distributed representations of graphs." arXiv preprint arXiv:1707.05005 (2017). - -[22] Wang, Yue, Yongbin Sun, Ziwei Liu, Sanjay E. Sarma, Michael M. Bronstein, and Justin M. Solomon. "Dynamic graph cnn for learning on point clouds." ACM Transactions on Graphics (TOG) 38, no. 5 (2019): 1-12. - -[23] Grover, Aditya, and Jure Leskovec. "node2vec: Scalable feature learning for networks." In Proceedings of the 22nd ACM SIGKDD international conference on Knowledge discovery and data mining, pp. 855-864. 2016. - -[24] Ou, Mingdong, Peng Cui, Jian Pei, Ziwei Zhang, and Wenwu Zhu. "Asymmetric transitivity preserving graph embedding." In Proceedings of the 22nd ACM SIGKDD international conference on Knowledge discovery and data mining, pp. 1105-1114. 2016. - -[25] Wang, Daixin, Peng Cui, and Wenwu Zhu. "Structural deep network embedding." In Proceedings of the 22nd ACM SIGKDD international conference on Knowledge discovery and data mining, pp. 1225-1234. 2016. - -[26] Cao, Shaosheng, Wei Lu, and Qiongkai Xu. "Deep neural networks for learning graph representations." In Thirtieth AAAI conference on artificial intelligence. 2016. - -[27] Defferrard, Michaël, Xavier Bresson, and Pierre Vandergheynst. "Convolutional neural networks on graphs with fast localized spectral filtering." In Advances in neural information processing systems, pp. 3844-3852. 2016. - -[28] Niepert, Mathias, Mohamed Ahmed, and Konstantin Kutzkov. "Learning convolutional neural networks for graphs." In International conference on machine learning, pp. 2014-2023. 2016. - -[29] Tang, Jian, Meng Qu, Mingzhe Wang, Ming Zhang, Jun Yan, and Qiaozhu Mei. "Line: Large-scale information network embedding." In Proceedings of the 24th international conference on world wide web, pp. 1067-1077. 2015. - -[30] Cao, Shaosheng, Wei Lu, and Qiongkai Xu. "Grarep: Learning graph representations with global structural information." In Proceedings of the 24th ACM international on conference on information and knowledge management, pp. 891-900. 2015. - -[31] Tang, Jian, Meng Qu, and Qiaozhu Mei. "Pte: Predictive text embedding through large-scale heterogeneous text networks." 
In Proceedings of the 21th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp. 1165-1174. 2015. - -[32] Yanardag, Pinar, and S. V. N. Vishwanathan. "Deep graph kernels." In Proceedings of the 21th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp. 1365-1374. 2015. - -[33] Perozzi, Bryan, Rami Al-Rfou, and Steven Skiena. "Deepwalk: Online learning of social representations." In Proceedings of the 20th ACM SIGKDD international conference on Knowledge discovery and data mining, pp. 701-710. 2014. - diff --git a/cogdl/models/__init__.py b/cogdl/models/__init__.py index 85d7d127..454e71c9 100644 --- a/cogdl/models/__init__.py +++ b/cogdl/models/__init__.py @@ -36,6 +36,8 @@ def try_adding_model_args(model, parser): def build_model(args): model = args.model + if isinstance(model, list): + model = model[0] if model in SUPPORTED_MODELS: path = ".".join(SUPPORTED_MODELS[model].split(".")[:-1]) module = importlib.import_module(path) diff --git a/cogdl/options.py b/cogdl/options.py index 5115f395..49ecfab1 100644 --- a/cogdl/options.py +++ b/cogdl/options.py @@ -38,6 +38,9 @@ def get_parser(): parser.add_argument("--n-warmup-steps", type=int, default=0) parser.add_argument("--checkpoint-path", type=str, default="./checkpoints/model.pt", help="path to save model") + parser.add_argument("--save-emb-path", type=str, default=None, help="path to save embeddings") + parser.add_argument("--load-emb-path", type=str, default=None, help="path to load embeddings") + parser.add_argument("--resume-training", action="store_true") parser.add_argument("--logger", type=str, default=None) parser.add_argument("--log-path", type=str, default=".", help="path to save logs") parser.add_argument("--project", type=str, default="cogdl-exp", help="project name for wandb") @@ -170,11 +173,17 @@ def parse_args_and_arch(parser, args): if args.dw is not None: dw = args.dw + if dw is None: + warnings.warn("Using default data wrapper ('node_classification_dw') for training!") + dw = "node_classification_dw" if hasattr(fetch_data_wrapper(dw), "add_args"): fetch_data_wrapper(dw).add_args(parser) if args.mw is not None: mw = args.mw + if mw is None: + warnings.warn("Using default model wrapper ('node_classification_mw') for training!") + mw = "node_classification_mw" if hasattr(fetch_model_wrapper(mw), "add_args"): fetch_model_wrapper(mw).add_args(parser) diff --git a/cogdl/trainer/embed_trainer.py b/cogdl/trainer/embed_trainer.py index 96a31f51..38cdf1d0 100644 --- a/cogdl/trainer/embed_trainer.py +++ b/cogdl/trainer/embed_trainer.py @@ -8,24 +8,25 @@ class EmbeddingTrainer(object): def __init__( self, - save_embedding_path: Optional[str] = None, - load_embedding_path: Optional[str] = None, + save_emb_path: Optional[str] = None, + load_emb_path: Optional[str] = None, ): - self.save_embedding_path = save_embedding_path - self.default_embedding_dir = "./embeddings" - self.load_embedding_path = load_embedding_path + self.save_emb_path = save_emb_path + self.load_emb_path = load_emb_path + self.default_emb_dir = "./embeddings" def run(self, model_w, dataset_w): self.prepare_data_wrapper(dataset_w) - if self.load_embedding_path is not None: - embedding = np.load(self.load_embedding_path) + if self.load_emb_path is not None: + print(f"Loading embeddings from {self.load_emb_path} ...") + embedding = np.load(self.load_emb_path) return self.test(model_w, dataset_w, embedding) - if self.save_embedding_path is None: + if self.save_emb_path is None: cur_time = 
time.strftime("%m-%d_%H.%M.%S", time.localtime()) name = f"{model_w.wrapped_model.__class__.__name__}_{cur_time}.emb" - self.save_embedding_path = os.path.join(self.default_embedding_dir, name) - os.makedirs(self.default_embedding_dir, exist_ok=True) + self.save_emb_path = os.path.join(self.default_emb_dir, name) + os.makedirs(self.default_emb_dir, exist_ok=True) embeddings = self.train(model_w, dataset_w) self.save_embedding(embeddings) return self.test(model_w, dataset_w, embeddings) @@ -42,7 +43,6 @@ def train(self, model_w, dataset_w): embeddings = [] for batch in train_data: embeddings.append(model_w.train_step(batch)) - # embeddings = model_w.train_step(train_data) assert len(embeddings) == 1 embeddings = embeddings[0] return embeddings @@ -55,4 +55,4 @@ def test(self, model_w, dataset_w, embeddings): return result def save_embedding(self, embeddings): - np.save(self.save_embedding_path, embeddings) + np.save(self.save_emb_path, embeddings) diff --git a/cogdl/trainer/trainer.py b/cogdl/trainer/trainer.py index f2f09f99..6e748497 100644 --- a/cogdl/trainer/trainer.py +++ b/cogdl/trainer/trainer.py @@ -49,6 +49,7 @@ def __init__( nstage: int = 1, cpu: bool = False, checkpoint_path: str = "./checkpoints/model.pt", + resume_training: str = False, device_ids: Optional[list] = None, distributed_training: bool = False, distributed_inference: bool = False, @@ -58,7 +59,8 @@ def __init__( early_stopping: bool = True, patience: int = 100, eval_step: int = 1, - save_embedding_path: Optional[str] = None, + save_emb_path: Optional[str] = None, + load_emb_path: Optional[str] = None, cpu_inference: bool = False, progress_bar: str = "epoch", clip_grad_norm: float = 5.0, @@ -81,6 +83,7 @@ def __init__( self.cpu = cpu self.devices, self.world_size = self.set_device(device_ids) self.checkpoint_path = checkpoint_path + self.resume_training = resume_training self.distributed_training = distributed_training self.distributed_inference = distributed_inference @@ -98,7 +101,8 @@ def __init__( self.on_eval_batch_transform = None self.clip_grad_norm = clip_grad_norm - self.save_embedding_path = save_embedding_path + self.save_emb_path = save_emb_path + self.load_emb_path = load_emb_path self.data_controller = DataController(world_size=self.world_size, distributed=self.distributed_training) @@ -155,7 +159,7 @@ def set_device(self, device_ids: Optional[list]): def run(self, model_w: ModelWrapper, dataset_w: DataWrapper): # for network/graph embedding models if isinstance(model_w, EmbeddingModelWrapper): - return EmbeddingTrainer(self.save_embedding_path).run(model_w, dataset_w) + return EmbeddingTrainer(self.save_emb_path, self.load_emb_path).run(model_w, dataset_w) # for deep learning models # set default loss_fn and evaluator for model_wrapper @@ -165,6 +169,9 @@ def run(self, model_w: ModelWrapper, dataset_w: DataWrapper): model_w.default_evaluator = dataset_w.get_default_evaluator() model_w.set_evaluation_metric() + if self.resume_training: + model_w = load_model(model_w, self.checkpoint_path).to(self.devices[0]) + if self.distributed_training: # and self.world_size > 1: torch.multiprocessing.set_sharing_strategy("file_system") self.dist_train(model_w, dataset_w) @@ -179,10 +186,6 @@ def run(self, model_w: ModelWrapper, dataset_w: DataWrapper): return final_test def evaluate(self, model_w: ModelWrapper, dataset_w: DataWrapper, cpu=False): - # for network/graph embedding models - if isinstance(model_w, EmbeddingModelWrapper): - return EmbeddingTrainer(self.save_embedding_path).run(model_w, dataset_w) - if 
cpu: self.devices = [torch.device("cpu")] diff --git a/docs/source/tutorial/custom_dataset.rst b/docs/source/tutorial/custom_dataset.rst index 63a84880..c747cd5e 100644 --- a/docs/source/tutorial/custom_dataset.rst +++ b/docs/source/tutorial/custom_dataset.rst @@ -8,8 +8,7 @@ We provide ``NodeDataset`` and ``GraphDataset`` as abstract classes and implemen Dataset for node_classification --------------------------------- -To create a dataset for node_classification, you need to inherit ``NodeDataset``. ``NodeDataset`` is for tasks like `node_classification` -or `unsupervised_node_classification`, which focus on node-level prediction. Then you need to implement ``process`` method. +To create a dataset for node_classification, you need to inherit ``NodeDataset``. ``NodeDataset`` is for node-level prediction. Then you need to implement ``process`` method. In this method, you are expected to read in your data and preprocess raw data to the format available to CogDL with ``Graph``. Afterwards, we suggest you to save the processed data (we will also help you do it as you return the data) to avoid doing the preprocessing again. Next time you run the code, CogDL will directly load it. @@ -33,15 +32,14 @@ If ``scale_feat`` is set to be `True`, CogDL will normalize node features with m z = (x - u) / s -Here is an example: +Here is an `example `_: .. code-block:: python from cogdl.data import Graph - from cogdl.datasets import NodeDataset, register_dataset + from cogdl.datasets import NodeDataset, generate_random_graph - @register_dataset("node_dataset") class MyNodeDataset(NodeDataset): def __init__(self, path="data.pt"): self.path = path @@ -49,16 +47,32 @@ Here is an example: def process(self): """You need to load your dataset and transform to `Graph`""" - # Load and preprocess data - edge_index = torch.tensor([[0, 1], [0, 2], [1, 2], [1, 3]).t() - x = torch.randn(4, 10) - mask = torch.bool(4) - # Provide attributes as you need and save the data into `Graph` - data = Graph(x=x, edge_index=edge_index) - torch.save(data, self.path) + num_nodes, num_edges, feat_dim = 100, 300, 30 + + # load or generate your dataset + edge_index = torch.randint(0, num_nodes, (2, num_edges)) + x = torch.randn(num_nodes, feat_dim) + y = torch.randint(0, 2, (num_nodes,)) + + # set train/val/test mask in node_classification task + train_mask = torch.zeros(num_nodes).bool() + train_mask[0 : int(0.3 * num_nodes)] = True + val_mask = torch.zeros(num_nodes).bool() + val_mask[int(0.3 * num_nodes) : int(0.7 * num_nodes)] = True + test_mask = torch.zeros(num_nodes).bool() + test_mask[int(0.7 * num_nodes) :] = True + data = Graph(x=x, edge_index=edge_index, y=y, train_mask=train_mask, val_mask=val_mask, test_mask=test_mask) return data - dataset = MyNodeDataset("data.pt") + if __name__ == "__main__": + # Train customized dataset via defining a new class + dataset = MyNodeDataset() + experiment(dataset=dataset, model="gcn") + + # Train customized dataset via feeding the graph data to NodeDataset + data = generate_random_graph(num_nodes=100, num_edges=300, num_feats=30) + dataset = NodeDataset(data=data) + experiment(dataset=dataset, model="gcn") @@ -71,9 +85,9 @@ An example is shown as follows: .. 
code-block:: python + from cogdl.data import Graph from cogdl.datasets import GraphDataset - @register_dataset("graph_dataset") class MyGraphDataset(GraphDataset): def __init__(self, path="data.pt"): self.path = path @@ -87,27 +101,8 @@ An example is shown as follows: edges = torch.randint(0, 20, (2, 30)) label = torch.randint(0, 7, (1,)) graphs.append(Graph(edge_index=edges, y=label)) - torch.save(graphs, self.path) return graphs - - - -Use custom dataset with CogDL ---------------------------------- -Now that you have set up your dataset, you can use models/task in CogDL immediately to get results. - -.. code-block:: python - - # Use the GCN model with the dataset we define above - dataset = MyNodeDataset("data.pt") - args.model = "gcn" - task = build_task(args, dataset=dataset) - task.train() - - # Or you may simple run the command after `register_dataset` - experiment(model="gcn", task="node_classification", dataset="node_dataset") - - # That's the same for other tasks - experiment(model="gin", task="graph_classification", dataset="graph_dataset") - + if __name__ == "__main__": + dataset = MyGraphDataset() + experiment(model="gin", dataset=dataset) diff --git a/docs/source/tutorial/custom_gnn.rst b/docs/source/tutorial/custom_gnn.rst index 92f57701..95e990fe 100644 --- a/docs/source/tutorial/custom_gnn.rst +++ b/docs/source/tutorial/custom_gnn.rst @@ -32,7 +32,6 @@ JKNet collects the output of all layers and concatenate them together to get the def __init__(self, in_feats, out_feats, hidden_size, num_layers): super(JKNet, self).__init__() shapes = [in_feats] + [hidden_size] * num_layers - # self.layers = nn.ModuleList([ GCNLayer(shape[i], shape[i+1]) for i in range(num_layers) @@ -40,7 +39,6 @@ JKNet collects the output of all layers and concatenate them together to get the self.fc = nn.Linear(hidden_size * num_layers, out_feats) def forward(self, graph): - graph.add_remaining_self_loops() graph.sym_norm() h = graph.x out = [] diff --git a/docs/source/tutorial/graph.rst b/docs/source/tutorial/graph.rst index 4dbaa575..107b5733 100644 --- a/docs/source/tutorial/graph.rst +++ b/docs/source/tutorial/graph.rst @@ -103,7 +103,7 @@ We also implement commonly used operations in ``Graph``: # inference_step model.eval() - data.eval() + graph.eval() diff --git a/docs/source/tutorial/node_classification.rst b/docs/source/tutorial/node_classification.rst index 7295185e..42599008 100644 --- a/docs/source/tutorial/node_classification.rst +++ b/docs/source/tutorial/node_classification.rst @@ -14,26 +14,23 @@ results in different datasets. .. code-block:: python from cogdl import experiment - experiment(model="gcn", dataset="cora", task="node_classification") + experiment(model="gcn", dataset="cora") - -Or you can create each component separately and manually run the process using ``build_dataset``, ``build_model``, ``build_task`` -in CogDL. +Or you can create each component separately and manually run the process using ``build_dataset``, ``build_model`` in CogDL. .. 
code-block:: python

+    from cogdl import experiment
     from cogdl.datasets import build_dataset
     from cogdl.models import build_model
-    from cogdl.tasks import build_task
+    from cogdl.options import get_default_args

-    args = build_args_from_dict(dict(task="node_classification", model="gcn", dataset="cora"))
+    args = get_default_args(model="gcn", dataset="cora")
     dataset = build_dataset(args)
     model = build_model(args)
-    task = build_task(args, dataset=dataset, model=model)
-    task.train()
-
+    experiment(model=model, dataset=dataset)


As shown above, model/dataset/task are 3 key components in establishing a training process. In fact, CogDL also supports
@@ -44,13 +41,11 @@ of each component.

Save trained model
-------------------
-CogDL supports saving the trained model with ``save_model`` in command line or notebook. For example:
+CogDL supports saving the trained model with ``checkpoint_path`` in command line or API usage. For example:

.. code-block:: python

-    experiment(model="gcn", task="node_classification", dataset="cora", save_model="gcn_cora.pt")
-
-
+    experiment(model="gcn", dataset="cora", checkpoint_path="gcn_cora.pt")

When the training stops, the model will be saved in `gcn_cora.pt`. If you want to continue the training from the previous checkpoint
@@ -60,32 +55,20 @@ and do it as follows:

.. code-block:: python

-    experiment(model="gcn", task="node_classification", dataset="cora", checkpoint="gcn_cora.pt")
-
-
-
-Or you may just want to do the inference to get prediction results without training. The prediction results will be automatically
-saved in `gcn_cora.pred`.
-
-
-.. code-block:: python
-
-    experiment(model="gcn", task="node_classification", dataset="cora", checkpoint="gcn_cora.pt", inference=True)
-
+    experiment(model="gcn", dataset="cora", checkpoint_path="gcn_cora.pt", resume_training=True)

+In command line usage, the same results can be achieved with ``--checkpoint-path {path}`` and ``--resume-training``.


-In command line usage, the same results can be achieved with ``--save-model {path}``, ``--checkpoint {path}`` and ``--inference`` set.

Save embeddings
----------------
Graph representation learning (network embedding and unsupervised GNNs) aims to get node representation. The embeddings
-can be used in various downstream applications. CogDL will save node embeddings in directory `./embedding`. As shown below,
-the embeddings will be save in `./embedding/prone_blogcatalog.npy`.
+can be used in various downstream applications. CogDL will save node embeddings to the path specified by ``--save-emb-path {path}``.

.. code-block:: python

-    experiment(model="prone", dataset="blogcatalog", task="unsupervised_node_classification")
+    experiment(model="prone", dataset="blogcatalog", save_emb_path="./embeddings/prone_blog.npy")

Evaluation on node classification will run at the end of training. We follow the same experimental settings used in
DeepWalk, Node2Vec and ProNE.
@@ -101,21 +84,20 @@ code snippet evaluates the embedding we get above:

     experiment(
         model="prone",
         dataset="blogcatalog",
-        task="unsupervised_node_classification",
-        load_emb_path="./embedding/prone_blogcatalog.npy",
+        load_emb_path="./embeddings/prone_blog.npy",
         num_shuffle=5,
         training_percents=[0.1, 0.5, 0.9]
     )

-You can also use command line to achieve the same quickly
+You can also use the command line to achieve the same results.

.. 
code-block:: bash # Get embedding - python script/train.py --model prone --task unsupervised_node_classification --dataset blogcatalog + python script/train.py --model prone --dataset blogcatalog # Evaluate only - python script/train.py --model prone --task unsupervised_node_classification --dataset blogcatalog --load-emb-path ./embedding/prone_blogcatalog.npy --num-shuffle 5 --training-percents 0.1 0.5 0.9 + python script/train.py --model prone --dataset blogcatalog --load-emb-path ./embeddings/prone_blog.npy --num-shuffle 5 --training-percents 0.1 0.5 0.9 diff --git a/examples/custom_dataset.py b/examples/custom_dataset.py index 2e3d4ab7..03838b9a 100644 --- a/examples/custom_dataset.py +++ b/examples/custom_dataset.py @@ -2,17 +2,15 @@ from cogdl.experiments import experiment from cogdl.data import Graph -from cogdl.datasets import NodeDataset +from cogdl.datasets import NodeDataset, generate_random_graph class MyNodeClassificationDataset(NodeDataset): - def __init__(self, path="mydata.pt"): + def __init__(self, path="data.pt"): super(MyNodeClassificationDataset, self).__init__(path) def process(self): - num_nodes = 100 - num_edges = 300 - feat_dim = 30 + num_nodes, num_edges, feat_dim = 100, 300, 30 # load or generate your dataset edge_index = torch.randint(0, num_nodes, (2, num_edges)) @@ -31,5 +29,11 @@ def process(self): if __name__ == "__main__": + # Train customized dataset via defining a new class dataset = MyNodeClassificationDataset() - experiment(dw="node_classification_dw", mw="node_classification_mw", dataset=dataset, model="gcn") + experiment(dataset=dataset, model="gcn") + + # Train customized dataset via feeding the graph data to NodeDataset + data = generate_random_graph(num_nodes=100, num_edges=300, num_feats=30) + dataset = NodeDataset(data=data) + experiment(dataset=dataset, model="gcn") diff --git a/gnn_papers.md b/gnn_papers.md index a4af09ee..4e678f45 100644 --- a/gnn_papers.md +++ b/gnn_papers.md @@ -171,7 +171,7 @@ We select the 100 most influential GNN papers and 100 recent SOTA GNN papers wit 3. **Explainability methods for graph convolutional neural networks**. *Pope Phillip E, Kolouri Soheil, Rostami Mohammad, Martin Charles E, Hoffmann Heiko*. CVPR 2019.[paper](https://openaccess.thecvf.com/content_CVPR_2019/papers/Pope_Explainability_Methods_for_Graph_Convolutional_Neural_Networks_CVPR_2019_paper.pdf) 4. **Parameterized Explainer for Graph Neural Network**. *Luo Dongsheng, Cheng Wei, Xu Dongkuan, Yu Wenchao, Zong Bo, Chen Haifeng, Zhang Xiang*. NeurIPS 2020. [paper](https://arxiv.org/abs/2011.04573) [code](https://github.com/flyingdoog/PGExplainer) 5. **Xgnn: Towards model-level explanations of graph neural networks**. *Yuan Hao, Tang Jiliang, Hu Xia, Ji Shuiwang*. KDD 2020. [paper](https://dl.acm.org/doi/pdf/10.1145/3394486.3403085). -6. **Attribution for Graph Neural Networks**. *Sanchez-Lengeling Benjamin, Wei Jennifer, Lee Brian, Reif Emily, Wang Peter, Qian Wesley, McCloskey Kevin, Colwell Lucy, Wiltschko Alexander*. NeurIPS 2020.[paper](https://proceedings.neurips.cc/paper/2020/file/417fbbf2e9d5a28a855a11894b2e795a-Paper.pdf) +6. **Evaluating Attribution for Graph Neural Networks**. *Sanchez-Lengeling Benjamin, Wei Jennifer, Lee Brian, Reif Emily, Wang Peter, Qian Wesley, McCloskey Kevin, Colwell Lucy, Wiltschko Alexander*. NeurIPS 2020.[paper](https://proceedings.neurips.cc/paper/2020/file/417fbbf2e9d5a28a855a11894b2e795a-Paper.pdf) 7. **PGM-Explainer: Probabilistic Graphical Model Explanations for Graph Neural Networks**. *Vu Minh, Thai My T.*. 
NeurIPS 2020.[paper](https://arxiv.org/pdf/2010.05788.pdf) 8. **Explanation-based Weakly-supervised Learning of Visual Relations with Graph Networks**. *Federico Baldassarre and Kevin Smith and Josephine Sullivan and Hossein Azizpour*. ECCV 2020.[paper](https://arxiv.org/pdf/2010.05788.pdf) 9. **GCAN: Graph-aware Co-Attention Networks for Explainable Fake News Detection on Social Media**. *Lu, Yi-Ju and Li, Cheng-Te*. ACL 2020.[paper](https://arxiv.org/pdf/2004.11648.pdf)
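
Usage note on the renamed embedding and checkpoint options above (``save_embedding_path`` → ``save_emb_path``, plus the new ``load_emb_path`` and ``resume_training``): the minimal sketch below combines the ``experiment`` calls shown in the updated docs. Only keyword arguments that appear in this patch are used; the checkpoint and embedding paths are illustrative:

    from cogdl import experiment

    # Train a GCN and save the model to the given checkpoint path,
    # then continue training from that checkpoint in a second run.
    experiment(model="gcn", dataset="cora", checkpoint_path="./checkpoints/gcn_cora.pt")
    experiment(model="gcn", dataset="cora", checkpoint_path="./checkpoints/gcn_cora.pt", resume_training=True)

    # Train an embedding method and write the node embeddings to an explicit path.
    experiment(model="prone", dataset="blogcatalog", save_emb_path="./embeddings/prone_blog.npy")

    # Re-evaluate the saved embeddings later without retraining.
    experiment(
        model="prone",
        dataset="blogcatalog",
        load_emb_path="./embeddings/prone_blog.npy",
        num_shuffle=5,
        training_percents=[0.1, 0.5, 0.9],
    )

The equivalent command-line flags added in ``cogdl/options.py`` are ``--save-emb-path``, ``--load-emb-path``, and ``--resume-training``.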
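The ``scale_feat`` flag of ``NodeDataset`` (see ``cogdl/datasets/customized_data.py`` above) standardizes node features as described in ``custom_dataset.rst``, i.e. z = (x - u) / s per feature dimension. The body of ``scale_feats`` is not part of this patch, so the snippet below is only a sketch of the assumed behavior, with an epsilon added to guard zero-variance features:

    import torch

    def scale_feats_sketch(x: torch.Tensor, eps: float = 1e-8) -> torch.Tensor:
        # Column-wise standardization: subtract the feature mean and divide
        # by the feature standard deviation (z = (x - u) / s).
        u = x.mean(dim=0, keepdim=True)
        s = x.std(dim=0, keepdim=True)
        return (x - u) / (s + eps)

    x = torch.randn(100, 30)    # node features: 100 nodes, 30 dimensions
    z = scale_feats_sketch(x)   # roughly what NodeDataset(scale_feat=True) applies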