Merge remote-tracking branch 'origin/master'
yangxudong committed Mar 4, 2022
2 parents ec56cb7 + dfa375b commit bed6d9e
Showing 67 changed files with 6,393 additions and 203 deletions.
2 changes: 2 additions & 0 deletions .gitattributes
@@ -5,6 +5,7 @@ data/test/inference/fg_export_multi/variables/variables.index filter=lfs diff=lfs merge=lfs -text
data/test/inference/tb_multitower_export/assets/pipeline.config filter=lfs diff=lfs merge=lfs -text
data/test/latest_ckpt_test/model.ckpt-500.meta filter=lfs diff=lfs merge=lfs -text
data/test/tb_data/taobao_test_data filter=lfs diff=lfs merge=lfs -text
data/test/tb_data/taobao_multi_seq_test_data filter=lfs diff=lfs merge=lfs -text
data/test/test.csv filter=lfs diff=lfs merge=lfs -text
data/test/inference/tb_multitower_placeholder_rename_export/assets/pipeline.config filter=lfs diff=lfs merge=lfs -text
data/test/inference/tb_multitower_export/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
@@ -15,6 +16,7 @@ data/test/criteo_sample.tfrecord filter=lfs diff=lfs merge=lfs -text
data/test/rtp/taobao_valid.csv filter=lfs diff=lfs merge=lfs -text
data/test/rtp/taobao_train_feature.txt filter=lfs diff=lfs merge=lfs -text
data/test/tb_data/taobao_train_data filter=lfs diff=lfs merge=lfs -text
data/test/tb_data/taobao_multi_seq_train_data filter=lfs diff=lfs merge=lfs -text
data/test/inference/fg_export_single/variables/variables.index filter=lfs diff=lfs merge=lfs -text
data/test/inference/lookup_data_test80.csv filter=lfs diff=lfs merge=lfs -text
data/test/inference/tb_multitower_export/variables/variables.index filter=lfs diff=lfs merge=lfs -text
71 changes: 36 additions & 35 deletions .github/workflows/ci.yml
@@ -21,11 +21,11 @@ jobs:
TEST_DEVICES: ""
run: |
source activate /home/admin/tf12_py2/
if [ ! -e "/tmp/easyrec_data_20220113.tar.gz" ]
then
wget https://easyrec.oss-cn-beijing.aliyuncs.com/data/easyrec_data_20220113.tar.gz -O /tmp/easyrec_data_20220113.tar.gz
fi
tar -zvxf /tmp/easyrec_data_20220113.tar.gz
source scripts/ci_test.sh
- name: LabelAndComment
env:
@@ -53,51 +53,52 @@ jobs:
if (labels != null) {
pass_label = labels.find(label=>label.name=='ci_test_passed');
}
var fail_label = null;
if (labels != null) {
fail_label = labels.find(label=>label.name=='ci_test_failed');
}
if (pass_label) {
github.rest.issues.removeLabel({
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
name: 'ci_test_passed'
})
}
if (fail_label) {
github.rest.issues.removeLabel({
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
name: 'ci_test_failed'
})
}
if (CI_TEST_PASSED == 1) {
github.rest.issues.addLabels({
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
labels: ['ci_test_passed']
})
github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: "CI Test Passed"
})
} else {
github.rest.issues.addLabels({
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
labels: ['ci_test_failed']
})
github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
5 changes: 5 additions & 0 deletions .gitignore
@@ -22,7 +22,12 @@ log
# pai job
pai_jobs/easy_rec
pai_jobs/easy_rec.tar.gz
pai_jobs/easy_rec*.tar.gz


# idea files
.idea

# unit test
/data
/UNIT_TEST_CASE_LIST
2 changes: 1 addition & 1 deletion README.md
@@ -8,7 +8,7 @@

### EasyRec is an easy to use framework for Recommendation

EasyRec implements state-of-the-art deep learning models used in common recommendation tasks: candidate generation (matching), scoring (ranking), and multi-task learning. It improves the efficiency of building high-performance models through simple configuration and hyperparameter optimization (HPO).

 

77 changes: 77 additions & 0 deletions docs/source/benchmark.md
@@ -0,0 +1,77 @@
# Benchmark

To validate algorithm accuracy and help users make better use of EasyRec, we ran extensive benchmark tests. We also provide public datasets and EasyRec config files so users can better understand and use EasyRec.

# Single-objective datasets

## Taobao dataset

- A Taobao display-advertising CTR prediction dataset containing user features, ad features, and behavior logs. [Tianchi competition link](https://tianchi.aliyun.com/dataset/dataDetail?dataId=56)
- Training table: pai_online_project.easyrec_demo_taobao_train_data
- Test table: pai_online_project.easyrec_demo_taobao_test_data

## Avazu CTR dataset

- The dataset used in the mobile-ad CTR prediction challenge that the DSP company Avazu hosted on Kaggle. [Click-Through Rate Prediction competition link](https://www.kaggle.com/c/avazu-ctr-prediction)
- Training table: pai_online_project.dwd_avazu_ctr_deepmodel_train
- Test table: pai_online_project.dwd_avazu_ctr_deepmodel_test

# Multi-objective datasets

## AliCCP dataset

- Collected from the recommender-system logs of the Taobao mobile app; contains clicks and the conversions associated with them. [Tianchi competition link](https://tianchi.aliyun.com/dataset/dataDetail?dataId=408)
- Training table: pai_rec_dev.AliCCP_sample_train_data_processed
- Test table: pai_rec_dev.AliCCP_sample_test_data_processeds

## CENSUS

- CENSUS contains 48,842 samples, each with 14 attributes such as age, occupation, education, and income. The label is the income level, e.g. >50K or \<=50K. [Census Income dataset link](https://archive.ics.uci.edu/ml/datasets/census+income)
- Training table: pai_rec_dev.census_income_train
- Test table: pai_rec_dev.census_income_test

# Single-objective model results on the Taobao dataset

- The tests on PAI used 2 parameter servers and 9 workers, one of which runs evaluation:
```json
{
  "ps": {"count": 2, "cpu": 1000, "memory": 40000},
  "worker": {"count": 9, "cpu": 1000, "memory": 40000}
}
```

## Single-objective test results

| model | global_step | best_auc | config |
| ---------- | ----------- | -------- | ------------------------------------------------------------------------------------------------------------- |
| MultiTower | 1800 | 0.614680 | [taobao_mutiltower.config](http://easyrec.oss-cn-beijing.aliyuncs.com/benchmark/ctr/taobao_mutiltower.config) |
| DIN | 1600 | 0.617049 | [din.config](http://easyrec.oss-cn-beijing.aliyuncs.com/benchmark/ctr/taobao_din.config) |
| DeepFM | 1600 | 0.580521 | [deepfm.config](http://easyrec.oss-cn-beijing.aliyuncs.com/benchmark/ctr/taobao_deepfm.config) |
| DCN | 1500 | 0.596816 | [dcn.config](http://easyrec.oss-cn-beijing.aliyuncs.com/benchmark/ctr/taobao_dcn.config) |
| BST | 3500 | 0.566251 | [bst.config](http://easyrec.oss-cn-beijing.aliyuncs.com/benchmark/ctr/taobao_bst.config) |
| AutoInt | 700 | 0.605982 | [autoint.config](http://easyrec.oss-cn-beijing.aliyuncs.com/benchmark/ctr/taobao_autoint.config) |

# Multi-objective model results on the Ali-CCP dataset

- The tests on PAI used 2 parameter servers and 9 workers, one of which runs evaluation:
```json
{
  "ps": {"count": 2, "cpu": 1000, "memory": 40000},
  "worker": {"count": 9, "cpu": 1000, "memory": 40000}
}
```

## Multi-objective test results

| model           | global_step | ctr auc   | masked cvr auc | ctcvr auc | train time | config                                                                                                               |
| --------------- | ----------- | --------- | -------------- | --------- | ---------- | -------------------------------------------------------------------------------------------------------------------- |
| SimpleMultiTask | 4100        | 0.592606  |                | 0.6306802 | 1h         | [simple_multi_task.config](http://easyrec.oss-cn-beijing.aliyuncs.com/benchmark/multi_task/simple_multi_task.config) |
| MMoE            | 3100        | 0.5869702 |                | 0.6330008 | 1h         | [mmoe.config](http://easyrec.oss-cn-beijing.aliyuncs.com/benchmark/multi_task/mmoe.config)                           |
| ESMM            | 800         | 0.5974812 | 0.6841141      | 0.6362526 | 3h         | [esmm.config](http://easyrec.oss-cn-beijing.aliyuncs.com/benchmark/multi_task/esmm.config)                           |
| PLE             | 3200        | 0.5874    |                | 0.6159    | 2h         | [ple.config](http://easyrec.oss-cn-beijing.aliyuncs.com/benchmark/multi_task/ple.config)                             |
4 changes: 2 additions & 2 deletions docs/source/develop.md
@@ -55,8 +55,8 @@ TEMPDIR=/tmp python -m easy_rec.python.test.odps_run --oss_config ~/.ossutilconf
Download the test data

```bash
wget https://easyrec.oss-cn-beijing.aliyuncs.com/data/easyrec_data_20220113.tar.gz
tar -xvzf easyrec_data_20220113.tar.gz
```

If you add new data, run the following before `git commit` so that the data is committed to git-lfs:
2 changes: 1 addition & 1 deletion docs/source/index.rst
@@ -88,7 +88,7 @@ Welcome to easy_rec's documentation!
faq
tf_on_yarn
get_role_arn

benchmark


Indices and tables
117 changes: 117 additions & 0 deletions docs/source/models/dlrm.md
@@ -0,0 +1,117 @@
# DLRM

### Introduction

DLRM (Deep Learning Recommendation Model for Personalization and Recommendation Systems, Facebook) is a DNN model that supports both continuous features (price/age/...) and ID features (user_id/item_id/...), and models the interactions between features via inner products.

```
output:
probability of a click
model: |
_________________>DNN(top)<___________
/ | \
/_________________>INTERACTION <_________\
// \\
DNN(bot) ____________\\_________
| | |
| _____|_______ _____|______
| |_Emb_|____|__| ... |_Emb_|__|___|
input:
[ dense features ] [sparse indices] , ..., [sparse indices]
```

### Configuration

```protobuf
model_config {
model_class: 'DLRM'
feature_groups {
group_name: 'dense'
feature_names: 'age_level'
feature_names: 'pvalue_level'
feature_names: 'shopping_level'
feature_names: 'new_user_class_level'
feature_names: 'price'
wide_deep: DEEP
}
feature_groups {
group_name: 'sparse'
feature_names: 'user_id'
feature_names: 'cms_segid'
feature_names: 'cms_group_id'
feature_names: 'occupation'
feature_names: 'adgroup_id'
feature_names: 'cate_id'
feature_names: 'campaign_id'
feature_names: 'customer'
feature_names: 'brand'
feature_names: 'pid'
feature_names: 'tag_category_list'
feature_names: 'tag_brand_list'
wide_deep: DEEP
}
dlrm {
bot_dnn {
hidden_units: [64, 32, 16]
}
top_dnn {
hidden_units: [128, 64]
}
l2_regularization: 1e-5
}
embedding_regularization: 1e-5
}
```

- model_class: 'DLRM'; no need to change

- feature_groups: feature groups

  - two feature_groups are required, the dense group and the sparse group; **the group names must not be changed**

  - wide_deep: DLRM uses only deep features, so set this to DEEP in both groups

- dlrm: DLRM-specific parameters

  - bot_dnn: parameters of the MLP applied to the dense features

    - hidden_units: number of channels (i.e. neurons) in each DNN layer

  - top_dnn: the MLP before the output (logits); its input is the dense features, sparse features, and interaction features

    - hidden_units: number of channels (i.e. neurons) in each DNN layer

  - arch_interaction_op: cat or dot

    - cat: concatenate the dense features and sparse features, then feed them into top_dnn
    - dot: take inner-product interactions between the dense features and sparse features, concatenate the interaction results with the sparse features, then feed them into top_dnn

  - arch_interaction_itself:

    - effective only when arch_interaction_op = 'dot'; whether each feature also takes an inner product with itself

  - arch_with_dense_feature:

    - effective only when arch_interaction_op = 'dot'
    - if true, the dense features are also concatenated with the sparse features and interaction features before entering top_dnn
    - default is false, i.e. only the sparse features and interaction features are concatenated and fed into top_dnn

  - l2_regularization: regularization on the DNN weights, to reduce overfitting

- embedding_regularization: regularization on the embedding parameters, to reduce overfitting
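
The 'dot' interaction can be sketched in a few lines of numpy. This is a minimal illustration under stated assumptions, not EasyRec's actual implementation; the function name and shapes here are hypothetical:

```python
import numpy as np

def dlrm_interact(dense_out, sparse_embs, interact_itself=False):
    """dense_out: (d,) output of bot_dnn; sparse_embs: list of (d,) embeddings."""
    feats = np.stack([dense_out] + list(sparse_embs))  # (n, d)
    sims = feats @ feats.T                             # (n, n) pairwise inner products
    # keep each pair once (lower triangle); include the diagonal only if
    # the feature should also interact with itself (arch_interaction_itself)
    offset = 0 if interact_itself else -1
    interactions = sims[np.tril_indices(feats.shape[0], k=offset)]
    # concatenate sparse embeddings with interaction terms -> top_dnn input
    return np.concatenate([np.concatenate(list(sparse_embs)), interactions])

d = 16
dense = np.random.randn(d)                      # pretend bot_dnn output
sparse = [np.random.randn(d) for _ in range(3)]  # pretend embedding lookups
out = dlrm_interact(dense, sparse)
# 4 feature vectors -> 6 unique pairs, so 3*16 + 6 = 54 dims
assert out.shape == (54,)
```

With interact_itself=True the 4 diagonal (self-product) terms are added as well, giving 3\*16 + 10 = 58 dims.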

### Example Config

[DLRM_demo.config](https://easyrec.oss-cn-beijing.aliyuncs.com/config/dlrm_on_taobao.config)

### Reference Paper

[DLRM](https://arxiv.org/abs/1906.00091)
1 change: 1 addition & 0 deletions docs/source/models/rank.rst
@@ -8,6 +8,7 @@
deepfm
fm
wide_and_deep
dlrm
dcn
autoint
din
2 changes: 1 addition & 1 deletion docs/source/quick_start/local_tutorial.md
@@ -5,7 +5,7 @@
```bash
git clone https://github.com/alibaba/EasyRec.git
cd EasyRec
wget https://easyrec.oss-cn-beijing.aliyuncs.com/data/easyrec_data_20220113.tar.gz
bash scripts/gen_proto.sh # generate the config-parsing .py files from the proto files
python setup.py install
```
6 changes: 3 additions & 3 deletions docs/source/vector_retrieve.md
@@ -43,7 +43,7 @@ pai -name easy_rec_ext -project algo_public
create table doc_table(pk BIGINT,vector string) partitioned by (pt string);

INSERT OVERWRITE TABLE query_table PARTITION(pt='20190410')
VALUES
(1, '0.1,0.2,-0.4,0.5'),
(2, '-0.1,0.8,0.4,0.5'),
(3, '0.59,0.2,0.4,0.15'),
@@ -59,7 +59,7 @@ VALUES
create table query_table(pk BIGINT,vector string) partitioned by (pt string);

INSERT OVERWRITE TABLE doc_table PARTITION(pt='20190410')
VALUES
(1, '0.1,0.2,0.4,0.5'),
(2, '-0.1,0.2,0.4,0.5'),
(3, '0.5,0.2,0.4,0.5'),
@@ -113,4 +113,4 @@ SELECT * from knn_result_table where pt='20190410';
-- 20 2 0.3800000250339508
-- 30 3 0.5370000004768372
-- 30 30 0.4973999857902527
```
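
As a rough mental model of what the retrieval job above computes, here is a brute-force sketch in Python. It is an illustration only: it assumes plain inner-product scores and uses in-memory dicts in place of the actual MaxCompute tables, and the real job's distance metric and index may differ:

```python
import numpy as np

# Toy stand-ins for query_table / doc_table (pk -> vector), following the SQL example.
query_table = {1: [0.1, 0.2, -0.4, 0.5], 2: [-0.1, 0.8, 0.4, 0.5]}
doc_table = {1: [0.1, 0.2, 0.4, 0.5], 2: [-0.1, 0.2, 0.4, 0.5], 3: [0.5, 0.2, 0.4, 0.5]}

def knn(query_table, doc_table, top_k=2):
    """Return {query_pk: [(doc_pk, score), ...]} sorted by score, best first."""
    docs = {pk: np.asarray(v) for pk, v in doc_table.items()}
    result = {}
    for qpk, qvec in query_table.items():
        scores = sorted(
            ((dpk, float(np.dot(qvec, dvec))) for dpk, dvec in docs.items()),
            key=lambda t: t[1], reverse=True)
        result[qpk] = scores[:top_k]
    return result

for qpk, hits in knn(query_table, doc_table).items():
    print(qpk, hits)
```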