From 313216060c7df3c70cd3b17831e486eefd897811 Mon Sep 17 00:00:00 2001
From: chengmengli06 <31561586+chengmengli06@users.noreply.github.com>
Date: Mon, 21 Feb 2022 16:52:58 +0800
Subject: [PATCH 01/10] [bug] update ci.yml (#95)

update ci.yml to remove all labels before ci_test actually runs, and add
the labels back after ci_test finishes.
---
 .github/workflows/ci.yml | 73 ++++++++++++++++++++--------------------
 1 file changed, 37 insertions(+), 36 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index a2ac94e19..8dc01a6e9 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -52,29 +52,39 @@ jobs:
           var pass_label = null;
           if (labels != null) {
             pass_label = labels.find(label=>label.name=='ci_test_passed');
-          }
+          }
+
           var fail_label = null;
           if (labels != null) {
             fail_label = labels.find(label=>label.name=='ci_test_failed');
-          }
-
-          if (CI_TEST_PASSED == 1) {
-            if (! pass_label) {
-              github.rest.issues.addLabels({
-                issue_number: context.issue.number,
-                owner: context.repo.owner,
-                repo: context.repo.repo,
-                labels: ['ci_test_passed']
-              })
-            }
-            if (fail_label) {
-              github.rest.issues.removeLabel({
-                issue_number: context.issue.number,
-                owner: context.repo.owner,
-                repo: context.repo.repo,
-                name: 'ci_test_failed'
-              })
-            }
+          }
+
+          if (pass_label) {
+            github.rest.issues.removeLabel({
+              issue_number: context.issue.number,
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              name: 'ci_test_passed'
+            })
+          }
+
+          if (fail_label) {
+            github.rest.issues.removeLabel({
+              issue_number: context.issue.number,
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              name: 'ci_test_failed'
+            })
+          }
+
+          if (CI_TEST_PASSED == 1) {
+            github.rest.issues.addLabels({
+              issue_number: context.issue.number,
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              labels: ['ci_test_passed']
+            })
+
           github.rest.issues.createComment({
             owner: context.repo.owner,
             repo: context.repo.repo,
@@ -82,22 +92,13 @@ jobs:
             body: "CI Test Passed"
           })
         } else {
-          if (!fail_label) {
-            github.rest.issues.addLabels({
-              issue_number: context.issue.number,
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              labels: ['ci_test_failed']
-            })
-          }
-          if (pass_label) {
-            github.rest.issues.removeLabel({
-              issue_number: context.issue.number,
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              name: 'ci_test_passed'
-            })
-          }
+          github.rest.issues.addLabels({
+            issue_number: context.issue.number,
+            owner: context.repo.owner,
+            repo: context.repo.repo,
+            labels: ['ci_test_failed']
+          })
+
           github.rest.issues.createComment({
             owner: context.repo.owner,
             repo: context.repo.repo,

From cad5d50ebaddab2f2cf747cb2127474df44b4e10 Mon Sep 17 00:00:00 2001
From: chengmengli06 <31561586+chengmengli06@users.noreply.github.com>
Date: Mon, 21 Feb 2022 20:00:21 +0800
Subject: [PATCH 02/10] [feat] add facebook dlrm model (#118)

* add facebook dlrm model and document
---
 docs/source/models/dlrm.md                  | 117 +++++++++
 docs/source/models/rank.rst                 |   1 +
 easy_rec/python/input/input.py              |   7 +-
 easy_rec/python/model/dlrm.py               |  73 ++++++
 easy_rec/python/protos/dlrm.proto           |  21 ++
 easy_rec/python/protos/easy_rec_model.proto |   2 +
 easy_rec/python/test/train_eval_test.py     |   4 +
 samples/model_config/dlrm_on_taobao.config  | 268 ++++++++++++++++++++
 8 files changed, 488 insertions(+), 5 deletions(-)
 create mode 100644 docs/source/models/dlrm.md
 create mode 100755 easy_rec/python/model/dlrm.py
 create mode 100644 easy_rec/python/protos/dlrm.proto
 create mode 100644 samples/model_config/dlrm_on_taobao.config

diff --git a/docs/source/models/dlrm.md b/docs/source/models/dlrm.md
new file mode 100644
index 000000000..a9d9a203f
--- /dev/null
+++ b/docs/source/models/dlrm.md
@@ -0,0 +1,117 @@
# DLRM

### Introduction

DLRM (Deep Learning Recommendation Model for Personalization and Recommendation Systems [Facebook]) is a DNN model that supports both continuous features (price/age/...) and ID features (user_id/item_id/...), and explicitly models the interactions between features via inner products.

```
output:
                probability of a click
model:                   |
       _________________>DNN(top)<___________
      /                  |                   \
     /_________________>INTERACTION <_________\
    //                                        \\
   DNN(bot)              ____________\\_________
    |                   |                       |
    |              _____|_______           _____|______
    |             |_Emb_|____|__|    ...  |_Emb_|__|___|
input:
[ dense features ]   [sparse indices] , ..., [sparse indices]
```

### Configuration

```protobuf
model_config {
  model_class: 'DLRM'

  feature_groups {
    group_name: 'dense'
    feature_names: 'age_level'
    feature_names: 'pvalue_level'
    feature_names: 'shopping_level'
    feature_names: 'new_user_class_level'
    feature_names: 'price'

    wide_deep: DEEP
  }

  feature_groups {
    group_name: 'sparse'
    feature_names: 'user_id'
    feature_names: 'cms_segid'
    feature_names: 'cms_group_id'
    feature_names: 'occupation'
    feature_names: 'adgroup_id'
    feature_names: 'cate_id'
    feature_names: 'campaign_id'
    feature_names: 'customer'
    feature_names: 'brand'
    feature_names: 'pid'
    feature_names: 'tag_category_list'
    feature_names: 'tag_brand_list'

    wide_deep: DEEP
  }

  dlrm {
    bot_dnn {
      hidden_units: [64, 32, 16]
    }

    top_dnn {
      hidden_units: [128, 64]
    }

    l2_regularization: 1e-5
  }

  embedding_regularization: 1e-5
}
```

- model_class: 'DLRM', do not modify

- feature_groups: feature groups

  - Two feature groups are required, the dense group and the sparse group; **the group names must not be changed**

  - wide_deep: DLRM only uses deep features, so set it to DEEP in both groups

- dlrm: parameters of the DLRM model

- bot_dnn: configuration of the bottom MLP over the dense features

  - hidden_units: the number of units (neurons) in each layer of the DNN

- top_dnn: the MLP before the output (logits); its input is the concatenation of the sparse features, the interaction features, and optionally the dense features

  - hidden_units: the number of units (neurons) in each layer of the DNN

- arch_interaction_op: cat or dot

  - cat: concatenate the bot_dnn output with the sparse features and feed the result into top_dnn
  - dot: compute the pairwise inner products (interactions) between the bot_dnn output and the sparse features, concatenate the interaction results with the sparse features, and feed them into top_dnn (see the sketch at the end of this page)

- arch_interaction_itself:

  - only effective when arch_interaction_op = 'dot'; whether each feature also interacts with itself

- arch_with_dense_feature:

  - only effective when arch_interaction_op = 'dot';
  - if true, the dense features are also concatenated with the sparse features and the interaction features before entering top_dnn;
  - defaults to false, i.e. only the sparse features and the interaction features are concatenated and fed into top_dnn.

- l2_regularization: L2 regularization on the DNN parameters, to reduce overfitting

- embedding_regularization: regularization on the embedding parameters, to reduce overfitting

### Example Config

[DLRM_demo.config](https://easyrec.oss-cn-beijing.aliyuncs.com/config/dlrm_on_taobao.config)

### Reference

[DLRM](https://arxiv.org/abs/1906.00091)
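
### Interaction sketch

For intuition, here is a minimal NumPy sketch of the 'dot' interaction, mirroring the tf.einsum('bne,bme->bnm', ...) call in easy_rec/python/model/dlrm.py; the sizes below are made-up examples:

```python
import numpy as np

# Hypothetical sizes: a batch of 2 samples, 4 feature vectors per sample
# (1 bot_dnn output + 3 sparse embeddings), embedding_dim 8.
batch, num_fea, dim = 2, 4, 8
feas = np.random.randn(batch, num_fea, dim).astype(np.float32)

# All pairwise inner products: [batch, num_fea, num_fea].
interaction = np.einsum('bne,bme->bnm', feas, feas)

# Keep the upper triangle; offset=1 drops self-interactions
# (arch_interaction_itself=false), offset=0 keeps them.
offset = 1
upper_tri = [interaction[:, i, i + offset:] for i in range(num_fea)]
flat = np.concatenate(upper_tri, axis=1)
print(flat.shape)  # (2, 6) == (batch, num_fea * (num_fea - 1) // 2)
```

The flattened interactions are then concatenated with the sparse features (and, if arch_with_dense_feature is true, the dense features) and fed into top_dnn.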
diff --git a/docs/source/models/rank.rst b/docs/source/models/rank.rst
index 2bb14a7ad..355b34765 100644
--- a/docs/source/models/rank.rst
+++ b/docs/source/models/rank.rst
@@ -8,6 +8,7 @@
    deepfm
    fm
    wide_and_deep
+   dlrm
    dcn
    autoint
    din
diff --git a/easy_rec/python/input/input.py b/easy_rec/python/input/input.py
index f80194eed..500c6ed95 100644
--- a/easy_rec/python/input/input.py
+++ b/easy_rec/python/input/input.py
@@ -1,21 +1,17 @@
 # -*- encoding:utf-8 -*-
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import json
 import logging
-import os
 from abc import abstractmethod
 from collections import OrderedDict

 import six
 import tensorflow as tf

-import easy_rec
 from easy_rec.python.core import sampler as sampler_lib
 from easy_rec.python.protos.dataset_pb2 import DatasetConfig
 from easy_rec.python.utils import config_util
 from easy_rec.python.utils import constant
 from easy_rec.python.utils.input_utils import get_type_defaults
-from easy_rec.python.utils.input_utils import string_to_number
 from easy_rec.python.utils.load_class import get_register_class_meta

 if tf.__version__ >= '2.0':
@@ -212,7 +208,8 @@ def create_placeholders(self, export_config):
               name='input_str_to_%s' % tf_type.name)
       else:
         if ftype not in [DatasetConfig.STRING]:
-          logging.warning('unexpected field type: ftype=%s tf_type=%s' % (ftype, tf_type))
+          logging.warning('unexpected field type: ftype=%s tf_type=%s' %
+                          (ftype, tf_type))
         features[input_name] = input_vals[:, tmp_id]
     features = self._preprocess(features)
     return {'features': inputs_placeholder}, features
diff --git a/easy_rec/python/model/dlrm.py b/easy_rec/python/model/dlrm.py
new file mode 100755
index 000000000..1b542ac58
--- /dev/null
+++ b/easy_rec/python/model/dlrm.py
@@ -0,0 +1,73 @@
# -*- encoding:utf-8 -*-
# Copyright (c) Alibaba, Inc. and its affiliates.
import logging

import tensorflow as tf

from easy_rec.python.layers import dnn
from easy_rec.python.model.rank_model import RankModel

from easy_rec.python.protos.dlrm_pb2 import DLRM as DLRMConfig  # NOQA

if tf.__version__ >= '2.0':
  tf = tf.compat.v1


class DLRM(RankModel):
  """Implements Deep Learning Recommendation Model for Personalization and Recommendation Systems (Facebook)."""

  def __init__(self,
               model_config,
               feature_configs,
               features,
               labels=None,
               is_training=False):
    super(DLRM, self).__init__(model_config, feature_configs, features, labels,
                               is_training)
    assert model_config.WhichOneof('model') == 'dlrm', \
        'invalid model config: %s' % model_config.WhichOneof('model')
    self._model_config = model_config.dlrm
    assert isinstance(self._model_config, DLRMConfig)
    assert self._input_layer.has_group(
        'sparse'), 'sparse group is not specified'
    _, self._sparse_features = self._input_layer(self._feature_dict, 'sparse')
    assert self._input_layer.has_group('dense'), 'dense group is not specified'
    self._dense_feature, _ = self._input_layer(self._feature_dict, 'dense')

  def build_predict_graph(self):
    # bottom MLP over the dense features
    bot_dnn = dnn.DNN(self._model_config.bot_dnn, self._l2_reg, 'bot_dnn',
                      self._is_training)
    dense_fea = bot_dnn(self._dense_feature)
    logging.info('arch_interaction_op = %s' %
                 self._model_config.arch_interaction_op)
    if self._model_config.arch_interaction_op == 'cat':
      all_fea = tf.concat([dense_fea] + self._sparse_features, axis=1)
    elif self._model_config.arch_interaction_op == 'dot':
      assert dense_fea.get_shape()[1] == self._sparse_features[0].get_shape()[1], \
          'bot_dnn last hidden[%d] != sparse feature embedding_dim[%d]' % (
          dense_fea.get_shape()[1], self._sparse_features[0].get_shape()[1])

      # stack all feature vectors: [batch_size, num_fea, embedding_dim]
      all_feas = [dense_fea] + self._sparse_features
      all_feas = [x[:, None, :] for x in all_feas]
      all_feas = tf.concat(all_feas, axis=1)
      num_fea = all_feas.get_shape()[1]
      # pairwise inner products between all feature vectors:
      # [batch_size, num_fea, num_fea]
      interaction = tf.einsum('bne,bme->bnm', all_feas, all_feas)
      # keep only the upper triangle; offset=1 drops self-interactions
      offset = 0 if self._model_config.arch_interaction_itself else 1
      upper_tri = []
      for i in range(num_fea):
        upper_tri.append(interaction[:, i, (i + offset):num_fea])
      upper_tri = tf.concat(upper_tri, axis=1)
      concat_feas = [upper_tri] + self._sparse_features
      if self._model_config.arch_with_dense_feature:
        concat_feas.append(dense_fea)
      all_fea = tf.concat(concat_feas, axis=1)

    # top MLP over the concatenated features, followed by the output layer
    top_dnn = dnn.DNN(self._model_config.top_dnn, self._l2_reg, 'top_dnn',
                      self._is_training)
    all_fea = top_dnn(all_fea)
    logits = tf.layers.dense(
        all_fea, 1, kernel_regularizer=self._l2_reg, name='output')

    self._add_to_prediction_dict(logits)

    return self._prediction_dict
diff --git a/easy_rec/python/protos/dlrm.proto b/easy_rec/python/protos/dlrm.proto
new file mode 100644
index 000000000..2d9a38540
--- /dev/null
+++ b/easy_rec/python/protos/dlrm.proto
@@ -0,0 +1,21 @@
syntax="proto2";
package protos;

import "easy_rec/python/protos/dnn.proto";

message DLRM {
  required DNN top_dnn = 1;

  required DNN bot_dnn = 2;

  // options are: dot and cat
  optional string arch_interaction_op = 3 [default='dot'];

  // whether a feature will interact with itself
  optional bool arch_interaction_itself = 4 [default=false];

  // whether to include dense features after interaction
  optional bool arch_with_dense_feature = 5 [default=false];

  optional float l2_regularization = 10 [default=1e-5];
}
diff --git a/easy_rec/python/protos/easy_rec_model.proto b/easy_rec/python/protos/easy_rec_model.proto
index ebd8c4e71..6f8ca590d 100644
--- a/easy_rec/python/protos/easy_rec_model.proto
+++ b/easy_rec/python/protos/easy_rec_model.proto
@@ -5,6 +5,7 @@
 import "easy_rec/python/protos/fm.proto";
 import "easy_rec/python/protos/deepfm.proto";
 import "easy_rec/python/protos/wide_and_deep.proto";
 import "easy_rec/python/protos/multi_tower.proto";
+import "easy_rec/python/protos/dlrm.proto";
 import "easy_rec/python/protos/feature_config.proto";
 import "easy_rec/python/protos/dropoutnet.proto";
 import "easy_rec/python/protos/dssm.proto";
@@ -58,6 +59,7 @@ message EasyRecModel {
         FM fm = 105;
         DCN dcn = 106;
         AutoInt autoint = 107;
+        DLRM dlrm = 108;

         DSSM dssm = 201;
         MIND mind = 202;
diff --git a/easy_rec/python/test/train_eval_test.py b/easy_rec/python/test/train_eval_test.py
index a41b901d9..a3bc6fc92 100644
--- a/easy_rec/python/test/train_eval_test.py
+++ b/easy_rec/python/test/train_eval_test.py
@@ -71,6 +71,11 @@ def test_wide_and_deep(self):
                                              self._test_dir)
     self.assertTrue(self._success)

+  def test_dlrm(self):
+    self._success = test_utils.test_single_train_eval(
+        'samples/model_config/dlrm_on_taobao.config', self._test_dir)
+    self.assertTrue(self._success)
+
   def test_adamw_optimizer(self):
     self._success = test_utils.test_single_train_eval(
         'samples/model_config/deepfm_combo_on_avazu_adamw_ctr.config',
diff --git a/samples/model_config/dlrm_on_taobao.config b/samples/model_config/dlrm_on_taobao.config
new file mode 100644
index 000000000..f94c78005
--- /dev/null
+++ b/samples/model_config/dlrm_on_taobao.config
@@ -0,0 +1,268 @@
train_input_path: "data/test/tb_data/taobao_train_data"
eval_input_path: "data/test/tb_data/taobao_test_data"
model_dir: "experiments/dlrm_taobao_ckpt"

train_config {
  optimizer_config: {
    adam_optimizer: {
      learning_rate: {
        exponential_decay_learning_rate {
          initial_learning_rate: 0.001
          decay_steps: 1000
          decay_factor: 0.5
          min_learning_rate: 0.00001
        }
      }
    }
    use_moving_average: false
  }
  save_checkpoints_steps: 100
  log_step_count_steps: 10
  sync_replicas: True
  num_steps: 2500
}

eval_config {
  metrics_set: {
    auc {}
  }
}

data_config {
  input_fields {
    input_name:'clk'
    input_type: INT32
  }
  input_fields {
    input_name:'buy'
    input_type: INT32
  }
  input_fields {
    input_name: 'pid'
input_type: STRING + } + input_fields { + input_name: 'adgroup_id' + input_type: STRING + } + input_fields { + input_name: 'cate_id' + input_type: STRING + } + input_fields { + input_name: 'campaign_id' + input_type: STRING + } + input_fields { + input_name: 'customer' + input_type: STRING + } + input_fields { + input_name: 'brand' + input_type: STRING + } + input_fields { + input_name: 'user_id' + input_type: STRING + } + input_fields { + input_name: 'cms_segid' + input_type: STRING + } + input_fields { + input_name: 'cms_group_id' + input_type: STRING + } + input_fields { + input_name: 'final_gender_code' + input_type: STRING + } + input_fields { + input_name: 'age_level' + input_type: DOUBLE + } + input_fields { + input_name: 'pvalue_level' + input_type: DOUBLE + } + input_fields { + input_name: 'shopping_level' + input_type: DOUBLE + } + input_fields { + input_name: 'occupation' + input_type: STRING + } + input_fields { + input_name: 'new_user_class_level' + input_type: DOUBLE + } + input_fields { + input_name: 'tag_category_list' + input_type: STRING + } + input_fields { + input_name: 'tag_brand_list' + input_type: STRING + } + input_fields { + input_name: 'price' + input_type: DOUBLE + } + + label_fields: 'clk' + batch_size: 4096 + num_epochs: 10000 + prefetch_size: 32 + input_type: CSVInput +} + +feature_config: { + features: { + input_names: 'pid' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: 'adgroup_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 + } + features: { + input_names: 'cate_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10000 + } + features: { + input_names: 'campaign_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 + } + features: { + input_names: 'customer' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 + } + features: { + input_names: 'brand' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 + } + features: { + input_names: 'user_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 + } + features: { + input_names: 'cms_segid' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 + } + features: { + input_names: 'cms_group_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 + } + features: { + input_names: 'final_gender_code' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: 'age_level' + feature_type: RawFeature + } + features: { + input_names: 'pvalue_level' + feature_type: RawFeature + } + features: { + input_names: 'shopping_level' + feature_type: RawFeature + } + features: { + input_names: 'occupation' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: 'new_user_class_level' + feature_type: RawFeature + } + features: { + input_names: 'tag_category_list' + feature_type: TagFeature + separator: '|' + hash_bucket_size: 10000 + embedding_dim: 16 + } + features: { + input_names: 'tag_brand_list' + feature_type: TagFeature + separator: '|' + hash_bucket_size: 100000 + embedding_dim: 16 + } + features: { + input_names: 'price' + feature_type: RawFeature + } +} +model_config { + model_class: 'DLRM' + + feature_groups { + group_name: 'dense' + feature_names: 'age_level' + feature_names: 'pvalue_level' + feature_names: 'shopping_level' + feature_names: 'new_user_class_level' + feature_names: 'price' + + wide_deep: DEEP + } + + 
  feature_groups {
    group_name: 'sparse'
    feature_names: 'user_id'
    feature_names: 'cms_segid'
    feature_names: 'cms_group_id'
    feature_names: 'occupation'
    feature_names: 'adgroup_id'
    feature_names: 'cate_id'
    feature_names: 'campaign_id'
    feature_names: 'customer'
    feature_names: 'brand'
    feature_names: 'pid'
    feature_names: 'tag_category_list'
    feature_names: 'tag_brand_list'

    wide_deep: DEEP
  }

  dlrm {
    bot_dnn {
      hidden_units: [64, 32, 16]
    }

    top_dnn {
      hidden_units: [128, 64]
    }
  }

  embedding_regularization: 1e-5
}

export_config {
}

From 1cef8b882f53869c660a5816c6179adcfe19f5bb Mon Sep 17 00:00:00 2001
From: tiankongdeguiji
Date: Thu, 24 Feb 2022 19:59:45 +0800
Subject: [PATCH 03/10] [bug] fix len(ps_hosts) > 1 check in
 set_tf_config_and_get_train_worker_num (#124)

---
 easy_rec/python/utils/distribution_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/easy_rec/python/utils/distribution_utils.py b/easy_rec/python/utils/distribution_utils.py
index def9c08ac..9c7c12827 100644
--- a/easy_rec/python/utils/distribution_utils.py
+++ b/easy_rec/python/utils/distribution_utils.py
@@ -96,7 +96,7 @@ def set_tf_config_and_get_train_worker_num(
       cluster = {'chief': [worker_hosts[0]], 'worker': worker_hosts[2:]}
       if distribute_strategy != DistributionStrategy.NoStrategy:
         cluster['evaluator'] = [worker_hosts[1]]
-      if len(ps_hosts) > 1:
+      if len(ps_hosts) > 0:
         cluster['ps'] = ps_hosts
       if job_name == 'ps':
         os.environ['TF_CONFIG'] = json.dumps({
@@ -166,7 +166,7 @@ def set_tf_config_and_get_train_worker_num(
     else:
       cluster = {'chief': [worker_hosts[0]], 'worker': worker_hosts[1:]}
       train_worker_num = len(worker_hosts)
-      if len(ps_hosts) > 1:
+      if len(ps_hosts) > 0:
         cluster['ps'] = ps_hosts
       if job_name == 'ps':
         os.environ['TF_CONFIG'] = json.dumps({

From 5f14d032fb5536189318fde989385be6ce29ce27 Mon Sep 17 00:00:00 2001
From: poson
Date: Mon, 28 Feb 2022 10:42:55 +0800
Subject: [PATCH 04/10] [hive_input][benchmark] add doc and hive_input (#122)

* [hive_input][benchmark] add doc and hive_input
* [dlrm] add dlrm in easy_rec_model.proto
* [hive_input] move hive config in pipeline.config
* [hive_input] move init code to _init_config function
---
 docs/source/benchmark.md                      |  77 +++
 docs/source/index.rst                         |   2 +-
 easy_rec/python/input/hive_input.py           | 222 +++++++
 easy_rec/python/main.py                       |  40 +-
 easy_rec/python/protos/dataset.proto          |   2 +
 easy_rec/python/protos/hive_config.proto      |  22 +
 easy_rec/python/protos/pipeline.proto         |   4 +
 easy_rec/python/test/hive_input_test.py       | 304 ++++++++++
 .../emr_script/mmoe/mmoe_census_income.config | 567 ++++++++++++++++++
 9 files changed, 1226 insertions(+), 14 deletions(-)
 create mode 100644 docs/source/benchmark.md
 create mode 100644 easy_rec/python/input/hive_input.py
 create mode 100644 easy_rec/python/protos/hive_config.proto
 create mode 100644 easy_rec/python/test/hive_input_test.py
 create mode 100644 samples/emr_script/mmoe/mmoe_census_income.config

diff --git a/docs/source/benchmark.md b/docs/source/benchmark.md
new file mode 100644
index 000000000..b4ffefb43
--- /dev/null
+++ b/docs/source/benchmark.md
@@ -0,0 +1,77 @@
# Benchmark

To verify the accuracy of the algorithms and to help users make better use of EasyRec, we ran extensive benchmark tests. We also provide the public datasets and the EasyRec config files, so that users can better understand and use EasyRec.

# Single-objective datasets

## Taobao dataset

- This is the Taobao display-advertising CTR dataset, containing user features, ad features and behavior logs. [Tianchi competition link](https://tianchi.aliyun.com/dataset/dataDetail?dataId=56)
- Training table: pai_online_project.easyrec_demo_taobao_train_data
- Test table: pai_online_project.easyrec_demo_taobao_test_data

## Avazu CTR dataset
- This dataset was used in the Click-Through Rate Prediction competition that the DSP advertising company Avazu hosted on Kaggle. [Competition link](https://www.kaggle.com/c/avazu-ctr-prediction)
- Training table: pai_online_project.dwd_avazu_ctr_deepmodel_train
- Test table: pai_online_project.dwd_avazu_ctr_deepmodel_test

# Multi-objective datasets

## AliCCP dataset

- The dataset was collected from the recommender-system logs of the Taobao mobile app, and contains clicks together with the associated conversions. [Tianchi competition link](https://tianchi.aliyun.com/dataset/dataDetail?dataId=408)
- Training table: pai_rec_dev.AliCCP_sample_train_data_processed
- Test table: pai_rec_dev.AliCCP_sample_test_data_processeds

## CENSUS

- CENSUS contains 48842 samples, each with 14 attributes such as age, occupation, education and income. The label is the income level, e.g. >50K or <=50K. [Census Income dataset link](https://archive.ics.uci.edu/ml/datasets/census+income)
- Training table: pai_rec_dev.census_income_train
- Test table: pai_rec_dev.census_income_test

# Results of single-objective models on the Taobao dataset

- The resources used for the tests on PAI are 2 parameter servers and 9 workers, one of which runs evaluation:
  ```json
  {"ps":{"count":2,
         "cpu":1000,
         "memory":40000},
   "worker":{"count":9,
             "cpu":1000,
             "memory":40000}
  }
  ```

## Single-objective results

| model      | global_step | best_auc | config                                                                                                        |
| ---------- | ----------- | -------- | ------------------------------------------------------------------------------------------------------------- |
| MultiTower | 1800        | 0.614680 | [taobao_mutiltower.config](http://easyrec.oss-cn-beijing.aliyuncs.com/benchmark/ctr/taobao_mutiltower.config) |
| DIN        | 1600        | 0.617049 | [din.config](http://easyrec.oss-cn-beijing.aliyuncs.com/benchmark/ctr/taobao_din.config)                      |
| DeepFM     | 1600        | 0.580521 | [deepfm.config](http://easyrec.oss-cn-beijing.aliyuncs.com/benchmark/ctr/taobao_deepfm.config)                |
| DCN        | 1500        | 0.596816 | [dcn.config](http://easyrec.oss-cn-beijing.aliyuncs.com/benchmark/ctr/taobao_dcn.config)                      |
| BST        | 3500        | 0.566251 | [bst.config](http://easyrec.oss-cn-beijing.aliyuncs.com/benchmark/ctr/taobao_bst.config)                      |
| AutoInt    | 700         | 0.605982 | [autoint.config](http://easyrec.oss-cn-beijing.aliyuncs.com/benchmark/ctr/taobao_autoint.config)              |

# Results of multi-objective models on the Ali-CCP dataset

- The resources used for the tests on PAI are 2 parameter servers and 9 workers, one of which runs evaluation:
  ```json
  {"ps":{"count":2,
         "cpu":1000,
         "memory":40000},
   "worker":{"count":9,
             "cpu":1000,
             "memory":40000}
  }
  ```

## Multi-objective results

| model           | global_step | ctr auc   | masked cvr auc | ctcvr auc | training time | config                                                                                                               |
| --------------- | ----------- | --------- | -------------- | --------- | ------------- | -------------------------------------------------------------------------------------------------------------------- |
| SimpleMultiTask | 4100        | 0.592606  |                | 0.6306802 | 1h            | [simple_multi_task.config](http://easyrec.oss-cn-beijing.aliyuncs.com/benchmark/multi_task/simple_multi_task.config) |
| MMoE            | 3100        | 0.5869702 |                | 0.6330008 | 1h            | [mmoe.config](http://easyrec.oss-cn-beijing.aliyuncs.com/benchmark/multi_task/mmoe.config)                           |
| ESMM            | 800         | 0.5974812 | 0.6841141      | 0.6362526 | 3h            | [esmm.config](http://easyrec.oss-cn-beijing.aliyuncs.com/benchmark/multi_task/esmm.config)                           |
| PLE             | 3200        | 0.5874    |                | 0.6159    | 2h            | [ple.config](http://easyrec.oss-cn-beijing.aliyuncs.com/benchmark/multi_task/ple.config)                             |
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 158768e1d..63f090f59 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -88,7 +88,7 @@ Welcome to easy_rec's documentation!
    faq
    tf_on_yarn
    get_role_arn
-
+   benchmark

 Indices and tables
diff --git a/easy_rec/python/input/hive_input.py b/easy_rec/python/input/hive_input.py
new file mode 100644
index 000000000..e4a978e74
--- /dev/null
+++ b/easy_rec/python/input/hive_input.py
@@ -0,0 +1,222 @@
# -*- coding: utf-8 -*-

import logging

import numpy as np
import tensorflow as tf
from pyhive import hive

from easy_rec.python.input.input import Input
from easy_rec.python.utils import odps_util


class TableInfo(object):

  def __init__(self,
               tablename,
               selected_cols,
               partition_kv,
               hash_fields,
               limit_num,
               batch_size=16,
               task_index=0,
               task_num=1,
               epoch=1):
    self.tablename = tablename
    self.selected_cols = selected_cols
    self.partition_kv = partition_kv
    self.hash_fields = hash_fields
    self.limit_num = limit_num
    self.task_index = task_index
    self.task_num = task_num
    self.batch_size = batch_size
    self.epoch = epoch

  def gen_sql(self):
    part = ''
    if self.partition_kv and len(self.partition_kv) > 0:
      res = []
      for k, v in self.partition_kv.items():
        res.append('{}={}'.format(k, v))
      # join multiple partition conditions with 'and' to keep the SQL valid
      part = ' and '.join(res)
    sql = """select {}
    from {}""".format(self.selected_cols, self.tablename)
    assert self.hash_fields is not None, 'hash_fields must not be empty'
    fields = [
        'cast({} as string)'.format(key) for key in self.hash_fields.split(',')
    ]
    str_fields = ','.join(fields)
    if not part:
      sql += """
      where hash(concat({}))%{}={}
      """.format(str_fields, self.task_num, self.task_index)
    else:
      sql += """
      where {} and hash(concat({}))%{}={}
      """.format(part, str_fields, self.task_num, self.task_index)
    if self.limit_num is not None and self.limit_num > 0:
      sql += ' limit {}'.format(self.limit_num)
    return sql
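
# For illustration only (hypothetical values): with tablename='census',
# partition_kv={'dt': '20220228'}, selected_cols='label,age',
# hash_fields='user_id', task_num=4 and task_index=1, gen_sql() produces
# roughly:
#
#   select label,age
#   from census
#   where dt=20220228 and hash(concat(cast(user_id as string)))%4=1
#
# i.e. every worker reads a disjoint, hash-sharded slice of the table.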


class HiveManager(object):

  def __init__(self, host, port, username, info, database='default'):
    self.host = host
    self.port = port
    self.username = username
    self.database = database
    self.info = info

  def __call__(self):
    conn = hive.Connection(
        host=self.host,
        port=self.port,
        username=self.username,
        database=self.database)
    cursor = conn.cursor()
    sql = self.info.gen_sql()
    res = []
    for ep in range(self.info.epoch):
      cursor.execute(sql)
      for result in cursor.fetchall():
        res.append(result)
        if len(res) == self.info.batch_size:
          yield res
          res = []


class HiveInput(Input):
  """Reads training/eval data from hive tables via pyhive."""

  def __init__(self,
               data_config,
               feature_config,
               input_path,
               task_index=0,
               task_num=1):
    super(HiveInput, self).__init__(data_config, feature_config, input_path,
                                    task_index, task_num)
    self._hive_config = data_config.hive_config
    self._num_epoch = data_config.num_epochs

  def _construct_table_info(self, table_name, hash_fields, limit_num):
    # e.g. sample_table/dt=2014-11-23/name=a
    segs = table_name.split('/')
    table_name = segs[0].strip()
    if len(segs) > 1:
      partition_kv = {i.split('=')[0]: i.split('=')[1] for i in segs[1:]}
    else:
      partition_kv = None
    selected_cols = ','.join(self._input_fields)
    table_info = TableInfo(table_name, selected_cols, partition_kv, hash_fields,
                           limit_num, self._data_config.batch_size,
                           self._task_index, self._task_num, self._num_epoch)
    return table_info

  def _construct_hive_connect(self):
    conn = hive.Connection(
        host=self._hive_config.host,
        port=self._hive_config.port,
        username=self._hive_config.username,
        database=self._hive_config.database)
    return conn

  def _parse_table(self, *fields):
    fields = list(fields)
    inputs = {self._input_fields[x]: fields[x] for x in self._effective_fids}
    for x in self._label_fids:
      inputs[self._input_fields[x]] = fields[x]
    return inputs

  def _hive_read(self):
    logging.info('start epoch[%d]' % self._num_epoch)
    self._num_epoch += 1
    if type(self._input_path) != list:
      self._input_path = [x for x in str(self._input_path).split(',')]

    # check that data_config is consistent with the hive tables
    odps_util.check_input_field_and_types(self._data_config)

    record_defaults = [
        self.get_type_defaults(x, v)
        for x, v in zip(self._input_field_types, self._input_field_defaults)
    ]

    for table_path in self._input_path:
      table_info = self._construct_table_info(table_path,
                                              self._hive_config.hash_fields,
                                              self._hive_config.limit_num)
      conn = self._construct_hive_connect()
      cursor = conn.cursor()
      sql = table_info.gen_sql()
      res = []
      cursor.execute(sql)

      batch_defaults = [
          np.array([x] * self._data_config.batch_size) for x in record_defaults
      ]

      row_id = 0
      batch_data_np = [x.copy() for x in batch_defaults]
      for result in cursor.fetchall():
        res.append(1)
        for col_id in range(len(record_defaults)):
          if result[col_id] not in ['', 'NULL', None]:
            batch_data_np[col_id][row_id] = result[col_id]
        if len(res) == self._data_config.batch_size:
          yield tuple(batch_data_np)
          res = []
          row_id = 0
          batch_data_np = [x.copy() for x in batch_defaults]
        else:
          row_id += 1

      if len(res) > 0:
        yield tuple(batch_data_np)
      conn.close()
    logging.info('finish epoch[%d]' % self._num_epoch)

  def _build(self, mode, params):
    # get input types and shapes
    list_type = [self.get_tf_type(x) for x in self._input_field_types]
    list_type = tuple(list_type)
    list_shapes = [tf.TensorShape([None]) for x in range(0, len(list_type))]
    list_shapes = tuple(list_shapes)

    # read hive tables through a generator dataset
    dataset = tf.data.Dataset.from_generator(
        self._hive_read, output_types=list_type, output_shapes=list_shapes)

    if mode == tf.estimator.ModeKeys.TRAIN:
      dataset = dataset.shuffle(
          self._data_config.shuffle_buffer_size,
          seed=2022,
          reshuffle_each_iteration=True)
      dataset = dataset.repeat(self.num_epochs)
    else:
      dataset = dataset.repeat(1)

    dataset = dataset.map(
        self._parse_table,
        num_parallel_calls=self._data_config.num_parallel_calls)

    # preprocess is necessary to transform data
    # so that they could be fed into FeatureColumns
    dataset = dataset.map(
        map_func=self._preprocess,
        num_parallel_calls=self._data_config.num_parallel_calls)

    dataset = dataset.prefetch(buffer_size=self._prefetch_size)

    if mode != tf.estimator.ModeKeys.PREDICT:
      dataset = dataset.map(lambda x:
                            (self._get_features(x), self._get_labels(x)))
    else:
      dataset = dataset.map(lambda x: (self._get_features(x)))
    return dataset
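
# Minimal usage sketch, mirroring easy_rec/python/test/hive_input_test.py
# (the HiveConfig values there come from environment variables, and the
# data_config must use input_type: HiveInput):
#
#   input_obj = HiveInput(data_config, feature_configs, hive_config)
#   train_input_fn = input_obj.create_input()
#   dataset = train_input_fn(mode=tf.estimator.ModeKeys.TRAIN)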
"datahub_eval_input": pipeline_config.datahub_eval_input, + "hive_train_input": pipeline_config.hive_train_input, + "hive_eval_input": pipeline_config.hive_eval_input + } + if input_name in _dict: + return _dict[input_name] + + if worker_type == "train": + return pipeline_config.train_input_path + else: + return pipeline_config.eval_input_path + + def _train_and_evaluate_impl(pipeline_config, continue_train=False): train_config = pipeline_config.train_config data_config = pipeline_config.data_config @@ -253,19 +278,8 @@ def _train_and_evaluate_impl(pipeline_config, continue_train=False): % pipeline_config.train_config.train_distribute) pipeline_config.train_config.sync_replicas = False - if pipeline_config.WhichOneof('train_path') == 'kafka_train_input': - train_data = pipeline_config.kafka_train_input - elif pipeline_config.WhichOneof('train_path') == 'datahub_train_input': - train_data = pipeline_config.datahub_train_input - else: - train_data = pipeline_config.train_input_path - - if pipeline_config.WhichOneof('eval_path') == 'kafka_eval_input': - eval_data = pipeline_config.kafka_eval_input - elif pipeline_config.WhichOneof('eval_path') == 'datahub_eval_input': - eval_data = pipeline_config.datahub_eval_input - else: - eval_data = pipeline_config.eval_input_path + train_data = _get_input_object_by_name(pipeline_config, 'train') + eval_data = _get_input_object_by_name(pipeline_config, 'eval') distribution = strategy_builder.build(train_config) estimator, run_config = _create_estimator( diff --git a/easy_rec/python/protos/dataset.proto b/easy_rec/python/protos/dataset.proto index 2710d2d91..8059af72c 100644 --- a/easy_rec/python/protos/dataset.proto +++ b/easy_rec/python/protos/dataset.proto @@ -177,6 +177,7 @@ message DatasetConfig { // input pipelines DummyInput = 8; KafkaInput = 13; + HiveInput = 16; } required InputType input_type = 10; @@ -245,4 +246,5 @@ message DatasetConfig { HardNegativeSampler hard_negative_sampler = 103; HardNegativeSamplerV2 hard_negative_sampler_v2 = 104; } + } diff --git a/easy_rec/python/protos/hive_config.proto b/easy_rec/python/protos/hive_config.proto new file mode 100644 index 000000000..8ed905ca2 --- /dev/null +++ b/easy_rec/python/protos/hive_config.proto @@ -0,0 +1,22 @@ +syntax = "proto2"; +package protos; + +message HiveConfig { + // hive master's ip + required string host = 1; + + // hive port + required uint32 port = 2 [default = 10000]; + + // hive username + required string username = 3; + + // hive database + required string database = 4 [default = 'default']; + + required string table_name = 5; + + required string hash_fields = 6; + + optional uint32 limit_num = 7 [default = 0]; +} diff --git a/easy_rec/python/protos/pipeline.proto b/easy_rec/python/protos/pipeline.proto index 09f44c200..1a351337b 100644 --- a/easy_rec/python/protos/pipeline.proto +++ b/easy_rec/python/protos/pipeline.proto @@ -8,6 +8,7 @@ import "easy_rec/python/protos/dataset.proto"; import "easy_rec/python/protos/feature_config.proto"; import "easy_rec/python/protos/easy_rec_model.proto"; import "easy_rec/python/protos/data_source.proto"; +import "easy_rec/python/protos/hive_config.proto"; // EasyRecConfig: the pipeline_config, including all sub configs @@ -16,11 +17,14 @@ message EasyRecConfig { string train_input_path = 1; KafkaServer kafka_train_input = 2; DatahubServer datahub_train_input = 12; + HiveConfig hive_train_input = 21; } oneof eval_path { string eval_input_path = 3; KafkaServer kafka_eval_input = 4; DatahubServer datahub_eval_input = 13; + HiveConfig 
diff --git a/easy_rec/python/protos/pipeline.proto b/easy_rec/python/protos/pipeline.proto
index 09f44c200..1a351337b 100644
--- a/easy_rec/python/protos/pipeline.proto
+++ b/easy_rec/python/protos/pipeline.proto
@@ -8,6 +8,7 @@
 import "easy_rec/python/protos/dataset.proto";
 import "easy_rec/python/protos/feature_config.proto";
 import "easy_rec/python/protos/easy_rec_model.proto";
 import "easy_rec/python/protos/data_source.proto";
+import "easy_rec/python/protos/hive_config.proto";

 // EasyRecConfig: the pipeline_config, including all sub configs
@@ -16,11 +17,14 @@ message EasyRecConfig {
     string train_input_path = 1;
     KafkaServer kafka_train_input = 2;
     DatahubServer datahub_train_input = 12;
+    HiveConfig hive_train_input = 21;
   }
   oneof eval_path {
     string eval_input_path = 3;
     KafkaServer kafka_eval_input = 4;
     DatahubServer datahub_eval_input = 13;
+    HiveConfig hive_eval_input = 22;
   }

   required string model_dir = 5;
diff --git a/easy_rec/python/test/hive_input_test.py b/easy_rec/python/test/hive_input_test.py
new file mode 100644
index 000000000..07d219c11
--- /dev/null
+++ b/easy_rec/python/test/hive_input_test.py
@@ -0,0 +1,304 @@
# -*- encoding:utf-8 -*-
# Copyright (c) Alibaba, Inc. and its affiliates.
"""Test cases for HiveInput."""
import logging
import os
import unittest

import tensorflow as tf
from google.protobuf import text_format

from easy_rec.python.input.hive_input import HiveInput
from easy_rec.python.protos.dataset_pb2 import DatasetConfig
from easy_rec.python.protos.feature_config_pb2 import FeatureConfig
from easy_rec.python.protos.hive_config_pb2 import HiveConfig
from easy_rec.python.protos.pipeline_pb2 import EasyRecConfig
from easy_rec.python.utils import config_util
from easy_rec.python.utils import test_utils
from easy_rec.python.utils.test_utils import _load_config_for_test
from easy_rec.python.utils.test_utils import get_available_gpus
from easy_rec.python.utils.test_utils import run_cmd
from easy_rec.python.utils.test_utils import set_gpu_id

if tf.__version__ >= '2.0':
  from tensorflow.python.framework.ops import disable_eager_execution

  disable_eager_execution()
  tf = tf.compat.v1


class HiveInputTest(tf.test.TestCase):

  def __init__(self, methodName='HiveInputTest'):
    super(HiveInputTest, self).__init__(methodName=methodName)

  def _init_config(self):
    hive_host = os.environ['hive_host']
    hive_username = os.environ['hive_username']
    hive_table_name = os.environ['hive_table_name']
    hive_hash_fields = os.environ['hive_hash_fields']

    hive_train_input = """
      host: "{}"
      username: "{}"
      table_name: "{}"
      limit_num: 500
      hash_fields: "{}"
    """.format(hive_host, hive_username, hive_table_name, hive_hash_fields)
    hive_eval_input = hive_train_input

    self.hive_train_input_config = HiveConfig()
    text_format.Merge(hive_train_input, self.hive_train_input_config)

    self.hive_eval_input_config = HiveConfig()
    text_format.Merge(hive_eval_input, self.hive_eval_input_config)

  @unittest.skipIf(
      'hive_host' not in os.environ or 'hive_username' not in os.environ or
      'hive_table_name' not in os.environ or
      'hive_hash_fields' not in os.environ,
      'Only executed when hive_host, hive_username, hive_table_name and '
      'hive_hash_fields are specified in the environment.')
  def test_hive_input(self):
    self._init_config()
    data_config_str = """
      batch_size: 1024
      label_fields: "label_1"
      label_fields: "label_2"
      num_epochs: 1
      prefetch_size: 32
      input_type: HiveInput
      input_fields {
        input_name:'label_1'
        input_type: INT32
      }
      input_fields {
        input_name:'label_2'
        input_type: INT32
      }
      input_fields {
        input_name:'age'
        input_type: INT32
      }
      input_fields {
        input_name: "class_of_worker"
      }
      input_fields {
        input_name: "industry_code"
      }
      input_fields {
        input_name: "occupation_code"
      }
      input_fields {
        input_name: "education"
      }
      input_fields {
        input_name: "wage_per_hour"
        input_type: DOUBLE
      }
      input_fields {
        input_name: "enrolled_in_edu_inst_last_wk"
      }
      input_fields {
        input_name: "major_industry"
      }
      input_fields {
        input_name: "major_occupation"
      }
      input_fields {
        input_name: "mace"
      }
      input_fields {
        input_name: "hispanic_origin"
      }
      input_fields {
        input_name: "sex"
      }
      input_fields {
        input_name: "member_of_a_labor_union"
      }
      input_fields {
        input_name: "reason_for_unemployment"
      }
      input_fields {
        input_name: "full_or_part_time_employment_stat"
      }
      input_fields {
        input_name: "capital_gains"
        input_type: DOUBLE
      }
      input_fields {
        input_name: "capital_losses"
        input_type: DOUBLE
      }
      input_fields {
        input_name: "divdends_from_stocks"
        input_type: DOUBLE
      }
      input_fields {
        input_name: "tax_filer_status"
      }
      input_fields {
        input_name: "region_of_previous_residence"
      }
      input_fields {
        input_name: "state_of_previous_residence"
      }
      input_fields {
        input_name: "detailed_household_and_family_stat"
      }
      input_fields {
        input_name: "detailed_household_summary_in_household"
      }
      input_fields {
        input_name: "instance_weight"
      }
      input_fields {
        input_name: "migration_code_change_in_msa"
      }
      input_fields {
        input_name: "migration_code_change_in_reg"
      }
      input_fields {
        input_name: "migration_code_move_within_reg"
      }
      input_fields {
        input_name: "live_in_this_house_1_year_ago"
      }
      input_fields {
        input_name: "migration_prev_res_in_sunbelt"
      }
      input_fields {
        input_name: "num_persons_worked_for_employer"
        input_type: INT32
      }
      input_fields {
        input_name: "family_members_under_18"
      }
      input_fields {
        input_name: "country_of_birth_father"
      }
      input_fields {
        input_name: "country_of_birth_mother"
      }
      input_fields {
        input_name: "country_of_birth_self"
      }
      input_fields {
        input_name: "citizenship"
      }
      input_fields {
        input_name: "own_business_or_self_employed"
      }
      input_fields {
        input_name: "fill_inc_questionnaire_for_veteran_s_admin"
      }
      input_fields {
        input_name: "veterans_benefits"
      }
      input_fields {
        input_name: "weeks_worked_in_year"
        input_type: INT32
      }
      input_fields {
        input_name: "year"
      }
    """

    feature_config_str = """
      input_names: "own_business_or_self_employed"
      feature_type: IdFeature
      embedding_dim: 9
      hash_bucket_size: 400
      embedding_name: "feature"
    """

    dataset_config = DatasetConfig()
    text_format.Merge(data_config_str, dataset_config)
    feature_config = FeatureConfig()
    text_format.Merge(feature_config_str, feature_config)
    feature_configs = [feature_config]

    empty_config = FeatureConfig()
    empty_config.CopyFrom(feature_config)
    while len(empty_config.input_names) > 0:
      empty_config.input_names.pop()
    while len(empty_config.shared_names) > 0:
      empty_config.shared_names.pop()

    train_input_fn = HiveInput(dataset_config, feature_configs,
                               self.hive_train_input_config).create_input()
    dataset = train_input_fn(mode=tf.estimator.ModeKeys.TRAIN)
    iterator = dataset.make_initializable_iterator()
    tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS, iterator.initializer)
    features, labels = iterator.get_next()
    init_op = tf.get_collection(tf.GraphKeys.TABLE_INITIALIZERS)
    gpu_options = tf.GPUOptions(allow_growth=True)
    session_config = tf.ConfigProto(
        gpu_options=gpu_options,
        allow_soft_placement=True,
        log_device_placement=False)
    with self.test_session(config=session_config) as sess:
      sess.run(init_op)
      feature_dict, label_dict = sess.run([features, labels])
      for key in feature_dict:
        print(key, feature_dict[key][:5])

      for key in label_dict:
        print(key, label_dict[key][:5])
    return 0
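
  # These tests are skipped unless the Hive connection is configured through
  # environment variables, e.g. (hypothetical values, matching
  # samples/emr_script/mmoe/mmoe_census_income.config):
  #   export hive_host=192.168.0.1
  #   export hive_username=admin
  #   export hive_table_name=census_income_train_simple
  #   export hive_hash_fields=age,class_of_worker,marital_status,education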

  @unittest.skipIf(
      'hive_host' not in os.environ or 'hive_username' not in os.environ or
      'hive_table_name' not in os.environ or
      'hive_hash_fields' not in os.environ,
      'Only executed when hive_host, hive_username, hive_table_name and '
      'hive_hash_fields are specified in the environment.')
  def test_mmoe(self):
    pipeline_config_path = 'samples/emr_script/mmoe/mmoe_census_income.config'
    gpus = get_available_gpus()
    if len(gpus) > 0:
      set_gpu_id(gpus[0])
    else:
      set_gpu_id(None)

    if not isinstance(pipeline_config_path, EasyRecConfig):
      logging.info('testing pipeline config %s' % pipeline_config_path)
    if 'TF_CONFIG' in os.environ:
      del os.environ['TF_CONFIG']

    if isinstance(pipeline_config_path, EasyRecConfig):
      pipeline_config = pipeline_config_path
    else:
      pipeline_config = _load_config_for_test(pipeline_config_path,
                                              self._test_dir)

    pipeline_config.train_config.train_distribute = 0
    pipeline_config.train_config.num_gpus_per_worker = 1
    pipeline_config.train_config.sync_replicas = False

    config_util.save_pipeline_config(pipeline_config, self._test_dir)
    test_pipeline_config_path = os.path.join(self._test_dir, 'pipeline.config')
    hyperparam_str = ''
    train_cmd = 'python -m easy_rec.python.train_eval --pipeline_config_path %s %s' % (
        test_pipeline_config_path, hyperparam_str)
    proc = run_cmd(train_cmd, '%s/log_%s.txt' % (self._test_dir, 'master'))
    proc.wait()
    if proc.returncode != 0:
      logging.error('train %s failed' % test_pipeline_config_path)
      return 1
    return 0

  def setUp(self):
    logging.info('Testing %s.%s' % (type(self).__name__, self._testMethodName))
    self._test_dir = test_utils.get_tmp_dir()
    self._success = True
    logging.info('test dir: %s' % self._test_dir)

  def tearDown(self):
    test_utils.set_gpu_id(None)
    if self._success:
      test_utils.clean_up(self._test_dir)


if __name__ == '__main__':
  tf.test.main()
diff --git a/samples/emr_script/mmoe/mmoe_census_income.config b/samples/emr_script/mmoe/mmoe_census_income.config
new file mode 100644
index 000000000..a6e820ca7
--- /dev/null
+++ b/samples/emr_script/mmoe/mmoe_census_income.config
@@ -0,0 +1,567 @@
hive_train_input {
  host: "192.168.0.1"
  username: "admin"
  table_name: "census_income_train_simple"
  limit_num: 500
  hash_fields: "age,class_of_worker,marital_status,education"
}

hive_eval_input {
  host: "192.168.0.1"
  username: "admin"
  table_name: "census_income_train_simple"
  limit_num: 500
  hash_fields: "age,class_of_worker,marital_status,education"
}

train_config {
  optimizer_config {
    use_moving_average: false
    adam_optimizer {
      learning_rate {
        exponential_decay_learning_rate {
          initial_learning_rate: 0.001
          decay_steps: 1290
          decay_factor: 0.5
          min_learning_rate: 1e-06
        }
      }
    }
  }
  num_steps: 25
  sync_replicas: true
  log_step_count_steps: 10
  save_checkpoints_steps: 25
}

eval_config {
  metrics_set {
    auc {}
  }
}

model_config {
  model_class: "MMoE"
  mmoe {
    experts {
      expert_name: "expert_1"
      dnn {
        hidden_units: [128, 64, 32, 16]
        dropout_ratio: [0.1, 0.1, 0.1, 0.1]
      }
    }
    experts {
      expert_name: "expert_2"
      dnn {
        hidden_units: [128, 64, 32, 16]
        dropout_ratio: [0.1, 0.1, 0.1, 0.1]
      }
    }
    experts {
      expert_name: "expert_3"
      dnn {
        hidden_units: [128, 64, 32, 16]
        dropout_ratio: [0.1, 0.1, 0.1, 0.1]
      }
    }
    experts {
      expert_name: "expert_4"
      dnn {
        hidden_units: [128, 64, 32, 16]
        dropout_ratio: [0.1, 0.1, 0.1, 0.1]
      }
    }
    task_towers {
      tower_name: "task1"
      label_name: "label_1"
      metrics_set {
        auc {}
      }
      dnn {
        hidden_units: [256, 192, 128, 64]
        dropout_ratio: [0.1, 0.1, 0.1, 0.1]
      }
      loss_type: CLASSIFICATION
      num_class: 1
      weight: 1.0
    }
    task_towers {
      tower_name: "task2"
      label_name: "label_2"
      dnn {
        hidden_units: [256, 192, 128, 64]
        dropout_ratio: [0.1, 0.1, 0.1, 0.1]
      }
      loss_type: CLASSIFICATION
      num_class: 1
      weight: 1.0
      metrics_set {
        auc {}
      }
    }
    l2_regularization:
1e-06 + } + embedding_regularization: 5e-05 + feature_groups { + group_name: "all" + feature_names:"age" + feature_names:"detailed_household_and_family_stat" + feature_names:"detailed_household_summary_in_household" + feature_names:"migration_code_change_in_msa" + feature_names:"migration_code_change_in_reg" + feature_names:"migration_code_move_within_reg" + feature_names:"live_in_this_house_1_year_ago" + feature_names:"migration_prev_res_in_sunbelt" + feature_names:"num_persons_worked_for_employer" + feature_names:"citizenship" + feature_names:"mace" + feature_names:"hispanic_origin" + feature_names:"sex" + feature_names:"region_of_previous_residence" + feature_names:"instance_weight" + feature_names:"family_members_under_18" + feature_names:"country_of_birth_father" + feature_names:"country_of_birth_mother" + feature_names:"country_of_birth_self" + feature_names:"year" + feature_names:"class_of_worker" + feature_names:"industry_code" + feature_names:"occupation_code" + feature_names:"education" + feature_names:"major_industry" + feature_names:"major_occupation" + feature_names:"wage_per_hour" + feature_names:"enrolled_in_edu_inst_last_wk" + feature_names:"member_of_a_labor_union" + feature_names:"reason_for_unemployment" + feature_names:"full_or_part_time_employment_stat" + feature_names:"capital_gains" + feature_names:"capital_losses" + feature_names:"divdends_from_stocks" + feature_names:"tax_filer_status" + feature_names:"state_of_previous_residence" + feature_names:"own_business_or_self_employed" + feature_names:"fill_inc_questionnaire_for_veteran_s_admin" + feature_names:"veterans_benefits" + feature_names:"weeks_worked_in_year" + wide_deep: DEEP + } +} + +data_config { + batch_size: 10 + label_fields: "label_1" + label_fields: "label_2" + num_epochs: 1 + prefetch_size: 4 + input_type: HiveInput + input_fields { + input_name:'label_1' + input_type: INT32 + } + input_fields { + input_name:'label_2' + input_type: INT32 + } + input_fields { + input_name:'age' + input_type: INT32 + } + input_fields { + input_name: "class_of_worker" + } + input_fields { + input_name: "industry_code" + } + input_fields { + input_name: "occupation_code" + } + input_fields { + input_name: "education" + } + input_fields { + input_name: "wage_per_hour" + input_type: DOUBLE + } + input_fields { + input_name: "enrolled_in_edu_inst_last_wk" + } + input_fields { + input_name: "major_industry" + } + input_fields { + input_name: "major_occupation" + } + input_fields { + input_name: "mace" + } + input_fields { + input_name: "hispanic_origin" + } + input_fields { + input_name: "sex" + } + input_fields { + input_name: "member_of_a_labor_union" + } + input_fields { + input_name: "reason_for_unemployment" + } + input_fields { + input_name: "full_or_part_time_employment_stat" + } + input_fields { + input_name: "capital_gains" + input_type: DOUBLE + } + input_fields { + input_name: "capital_losses" + input_type: DOUBLE + } + input_fields { + input_name: "divdends_from_stocks" + input_type: DOUBLE + } + input_fields { + input_name: "tax_filer_status" + } + input_fields { + input_name: "region_of_previous_residence" + } + input_fields { + input_name: "state_of_previous_residence" + } + input_fields { + input_name: "detailed_household_and_family_stat" + } + input_fields { + input_name: "detailed_household_summary_in_household" + } + input_fields { + input_name: "instance_weight" + } + input_fields { + input_name: "migration_code_change_in_msa" + } + input_fields { + input_name: "migration_code_change_in_reg" + } + 
input_fields { + input_name: "migration_code_move_within_reg" + } + input_fields { + input_name: "live_in_this_house_1_year_ago" + } + input_fields { + input_name: "migration_prev_res_in_sunbelt" + } + input_fields { + input_name: "num_persons_worked_for_employer" + input_type: INT32 + } + input_fields { + input_name: "family_members_under_18" + } + input_fields { + input_name: "country_of_birth_father" + } + input_fields { + input_name: "country_of_birth_mother" + } + input_fields { + input_name: "country_of_birth_self" + } + input_fields { + input_name: "citizenship" + } + input_fields { + input_name: "own_business_or_self_employed" + } + input_fields { + input_name: "fill_inc_questionnaire_for_veteran_s_admin" + } + input_fields { + input_name: "veterans_benefits" + } + input_fields { + input_name: "weeks_worked_in_year" + input_type: INT32 + } + input_fields { + input_name: "year" + } +} + +feature_configs { + input_names: "age" + feature_type: RawFeature + embedding_dim: 9 + hash_bucket_size: 400 +} +feature_configs { + input_names: "class_of_worker" + feature_type: IdFeature + embedding_dim: 9 + hash_bucket_size: 400 + embedding_name: "feature" +} +feature_configs { + input_names: "industry_code" + feature_type: IdFeature + embedding_dim: 9 + hash_bucket_size: 400 + embedding_name: "feature" +} +feature_configs { + input_names: "occupation_code" + feature_type: IdFeature + embedding_dim: 9 + hash_bucket_size: 400 + embedding_name: "feature" +} +feature_configs { + input_names: "education" + feature_type: IdFeature + embedding_dim: 9 + hash_bucket_size: 400 + embedding_name: "feature" +} +feature_configs { + input_names: "wage_per_hour" + feature_type: RawFeature + embedding_dim: 9 + hash_bucket_size: 400 +} +feature_configs { + input_names: "enrolled_in_edu_inst_last_wk" + feature_type: IdFeature + embedding_dim: 9 + hash_bucket_size: 400 + embedding_name: "feature" +} +feature_configs { + input_names: "major_industry" + feature_type: IdFeature + embedding_dim: 9 + hash_bucket_size: 400 + embedding_name: "feature" +} +feature_configs { + input_names: "major_occupation" + feature_type: IdFeature + embedding_dim: 9 + hash_bucket_size: 400 + embedding_name: "feature" +} +feature_configs { + input_names: "mace" + feature_type: IdFeature + embedding_dim: 9 + hash_bucket_size: 400 + embedding_name: "feature" +} +feature_configs { + input_names: "hispanic_origin" + feature_type: IdFeature + embedding_dim: 9 + hash_bucket_size: 400 + embedding_name: "feature" +} +feature_configs { + input_names: "sex" + feature_type: IdFeature + embedding_dim: 9 + hash_bucket_size: 400 + embedding_name: "feature" +} +feature_configs { + input_names: "member_of_a_labor_union" + feature_type: IdFeature + embedding_dim: 9 + hash_bucket_size: 400 + embedding_name: "feature" +} +feature_configs { + input_names: "reason_for_unemployment" + feature_type: IdFeature + embedding_dim: 9 + hash_bucket_size: 400 + embedding_name: "feature" +} +feature_configs { + input_names: "full_or_part_time_employment_stat" + feature_type: IdFeature + embedding_dim: 9 + hash_bucket_size: 400 + embedding_name: "feature" +} +feature_configs { + input_names: "capital_gains" + feature_type: RawFeature + embedding_dim: 9 + hash_bucket_size: 400 +} +feature_configs { + input_names: "capital_losses" + feature_type: RawFeature + embedding_dim: 9 + hash_bucket_size: 400 +} +feature_configs { + input_names: "divdends_from_stocks" + feature_type: RawFeature + embedding_dim: 9 + hash_bucket_size: 400 +} +feature_configs { + input_names: 
"tax_filer_status" + feature_type: IdFeature + embedding_dim: 9 + hash_bucket_size: 400 + embedding_name: "feature" +} +feature_configs { + input_names: "region_of_previous_residence" + feature_type: IdFeature + embedding_dim: 9 + hash_bucket_size: 400 + embedding_name: "feature" +} +feature_configs { + input_names: "state_of_previous_residence" + feature_type: IdFeature + embedding_dim: 9 + hash_bucket_size: 400 + embedding_name: "feature" +} +feature_configs { + input_names: "detailed_household_and_family_stat" + feature_type: IdFeature + embedding_dim: 9 + hash_bucket_size: 400 + embedding_name: "feature" +} +feature_configs { + input_names: "detailed_household_summary_in_household" + feature_type: IdFeature + embedding_dim: 9 + hash_bucket_size: 400 + embedding_name: "feature" +} +feature_configs { + input_names: "instance_weight" + feature_type: IdFeature + embedding_dim: 9 + hash_bucket_size: 400 + embedding_name: "feature" +} +feature_configs { + input_names: "migration_code_change_in_msa" + feature_type: IdFeature + embedding_dim: 9 + hash_bucket_size: 400 + embedding_name: "feature" +} +feature_configs { + input_names: "migration_code_change_in_reg" + feature_type: IdFeature + embedding_dim: 9 + hash_bucket_size: 400 + embedding_name: "feature" +} +feature_configs { + input_names: "migration_code_move_within_reg" + feature_type: IdFeature + embedding_dim: 9 + hash_bucket_size: 400 + embedding_name: "feature" +} +feature_configs { + input_names: "live_in_this_house_1_year_ago" + feature_type: IdFeature + embedding_dim: 9 + hash_bucket_size: 400 + embedding_name: "feature" +} +feature_configs { + input_names: "migration_prev_res_in_sunbelt" + feature_type: IdFeature + embedding_dim: 9 + hash_bucket_size: 400 + embedding_name: "feature" +} +feature_configs { + input_names: "num_persons_worked_for_employer" + feature_type: RawFeature + embedding_dim: 9 + hash_bucket_size: 400 +} +feature_configs { + input_names: "family_members_under_18" + feature_type: IdFeature + embedding_dim: 9 + hash_bucket_size: 400 + embedding_name: "feature" +} +feature_configs { + input_names: "country_of_birth_father" + feature_type: IdFeature + embedding_dim: 9 + hash_bucket_size: 400 + embedding_name: "feature" +} +feature_configs { + input_names: "country_of_birth_mother" + feature_type: IdFeature + embedding_dim: 9 + hash_bucket_size: 400 + embedding_name: "feature" +} +feature_configs { + input_names: "country_of_birth_self" + feature_type: IdFeature + embedding_dim: 9 + hash_bucket_size: 400 + embedding_name: "feature" +} +feature_configs { + input_names: "citizenship" + feature_type: IdFeature + embedding_dim: 9 + hash_bucket_size: 400 + embedding_name: "feature" +} +feature_configs { + input_names: "own_business_or_self_employed" + feature_type: IdFeature + embedding_dim: 9 + hash_bucket_size: 400 + embedding_name: "feature" +} +feature_configs { + input_names: "fill_inc_questionnaire_for_veteran_s_admin" + feature_type: IdFeature + embedding_dim: 9 + hash_bucket_size: 400 + embedding_name: "feature" +} +feature_configs { + input_names: "veterans_benefits" + feature_type: IdFeature + embedding_dim: 9 + hash_bucket_size: 400 + embedding_name: "feature" +} +feature_configs { + input_names: "weeks_worked_in_year" + feature_type: RawFeature + embedding_dim: 9 + hash_bucket_size: 400 +} +feature_configs { + input_names: "year" + feature_type: IdFeature + embedding_dim: 9 + hash_bucket_size: 400 + embedding_name: "feature" +} \ No newline at end of file From 42cb4020bc83f53826202eee04a85ede7cda4365 Mon Sep 
From: tiankongdeguiji
Date: Mon, 28 Feb 2022 14:21:38 +0800
Subject: [PATCH 05/10] [bug] fix ps_hosts empty check in set_tf_config (#125)

---
 easy_rec/python/utils/distribution_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/easy_rec/python/utils/distribution_utils.py b/easy_rec/python/utils/distribution_utils.py
index 9c7c12827..7e9ec99d1 100644
--- a/easy_rec/python/utils/distribution_utils.py
+++ b/easy_rec/python/utils/distribution_utils.py
@@ -47,7 +47,7 @@ def set_tf_config_and_get_train_worker_num(
       'set_tf_config_and_get_train_worker_num: distribute_strategy = %d' %
       distribute_strategy)
   worker_hosts = worker_hosts.split(',')
-  ps_hosts = ps_hosts.split(',')
+  ps_hosts = ps_hosts.split(',') if ps_hosts else []
   total_worker_num = len(worker_hosts)
   train_worker_num = total_worker_num

From c2c20c8f6321c8fd07869fc39d5b579f724a95c3 Mon Sep 17 00:00:00 2001
From: chengmengli06 <31561586+chengmengli06@users.noreply.github.com>
Date: Tue, 1 Mar 2022 15:20:34 +0800
Subject: [PATCH 06/10] [bug] fix fg export bug for multitask models (#121)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix fg export bug for multiple task models
* support filter_inputs for multiple placeholders

Co-authored-by: 杨熙
---
 docs/source/vector_retrieve.md               |   6 +-
 easy_rec/python/inference/vector_retrieve.py |   4 +-
 easy_rec/python/input/input.py               |  47 +--
 easy_rec/python/test/export_test.py          |   4 +
 easy_rec/python/test/odps_run.py             |   8 +-
 easy_rec/python/tools/predict_and_chk.py     |  22 +-
 .../model_config/export_filter_input.config  | 294 ++++++++++++++++++
 .../vector_retrieve/drop_table.sql           |   2 +-
 .../vector_retrieve/run_vector_retrieve.sql  |   2 +-
 9 files changed, 353 insertions(+), 36 deletions(-)
 create mode 100644 samples/model_config/export_filter_input.config

diff --git a/docs/source/vector_retrieve.md b/docs/source/vector_retrieve.md
index d88cc9704..0ba2a7018 100644
--- a/docs/source/vector_retrieve.md
+++ b/docs/source/vector_retrieve.md
@@ -43,7 +43,7 @@ pai -name easy_rec_ext -project algo_public
 create table doc_table(pk BIGINT,vector string) partitioned by (pt string);

 INSERT OVERWRITE TABLE query_table PARTITION(pt='20190410')
-VALUES
+VALUES
 (1, '0.1,0.2,-0.4,0.5'),
 (2, '-0.1,0.8,0.4,0.5'),
 (3, '0.59,0.2,0.4,0.15'),
@@ -59,7 +59,7 @@ create table query_table(pk BIGINT,vector string) partitioned by (pt string);

 INSERT OVERWRITE TABLE doc_table PARTITION(pt='20190410')
-VALUES
+VALUES
 (1, '0.1,0.2,0.4,0.5'),
 (2, '-0.1,0.2,0.4,0.5'),
 (3, '0.5,0.2,0.4,0.5'),
@@ -113,4 +113,4 @@ SELECT * from knn_result_table where pt='20190410';
 -- 20	2	0.3800000250339508
 -- 30	3	0.5370000004768372
 -- 30	30	0.4973999857902527
-```
\ No newline at end of file
+```
diff --git a/easy_rec/python/inference/vector_retrieve.py b/easy_rec/python/inference/vector_retrieve.py
index 917853484..4baca38db 100644
--- a/easy_rec/python/inference/vector_retrieve.py
+++ b/easy_rec/python/inference/vector_retrieve.py
@@ -10,14 +10,14 @@
 import common_io
 import numpy as np
 import tensorflow as tf
+
 try:
   import graphlearn as gl
 except:
   logging.WARN(
-      'GraphLearn is not installed. You can install it by "pip install http://odps-release.cn-hangzhou.oss-cdn.aliyun-inc.com/graphlearn/tunnel/graphlearn-0.7-cp27-cp27mu-linux_x86_64.whl."'  # noqa: E501
+      'GraphLearn is not installed. You can install it by "pip install https://easyrec.oss-cn-beijing.aliyuncs.com/3rdparty/graphlearn-0.7-cp27-cp27mu-linux_x86_64.whl."'  # noqa: E501
  # noqa: E501
  )
-
 if tf.__version__ >= '2.0':
   tf = tf.compat.v1
diff --git a/easy_rec/python/input/input.py b/easy_rec/python/input/input.py
index 500c6ed95..dd7c216b6 100644
--- a/easy_rec/python/input/input.py
+++ b/easy_rec/python/input/input.py
@@ -140,23 +140,32 @@ def get_tf_type(self, field_type):
     assert field_type in type_map, 'invalid type: %s' % field_type
     return type_map[field_type]
 
-  def create_multi_placeholders(self,
-                                placeholder_named_by_input,
-                                export_fields_name=None):
-    """Create multiply placeholders on export.
+  def create_multi_placeholders(self, export_config):
+    """Create multiple placeholders on export, one for each input feature.
 
     Args:
-      placeholder_named_by_input: If it is true, placeholder is named by the input feature,
-        otherwise the placeholder name if input_XX. Default: false.
-      export_fields_name: TagFeature / SeqFeature list that needs to be converted into
-        2D placeholders when exporting.
+      export_config: ExportConfig instance.
     """
     self._mode = tf.estimator.ModeKeys.PREDICT
-    effective_fids = list(self._effective_fids)
+
+    if export_config.multi_value_fields:
+      export_fields_name = export_config.multi_value_fields.input_name
+    else:
+      export_fields_name = None
+    placeholder_named_by_input = export_config.placeholder_named_by_input
+
+    if export_config.filter_inputs:
+      effective_fids = list(self._effective_fids)
+    else:
+      effective_fids = [
+          fid for fid in range(len(self._input_fields))
+          if self._input_fields[fid] not in self._label_fields
+      ]
+
     if self._data_config.HasField('sample_weight'):
       effective_fids = effective_fids[:-1]
-    inputs = {}
+    inputs = {}
     for fid in effective_fids:
       input_name = self._input_fields[fid]
       if placeholder_named_by_input:
@@ -189,9 +198,13 @@ def create_placeholders(self, export_config):
       logging.info('number of effective inputs:%d, total number inputs: %d' %
                    (len(effective_fids), len(self._input_fields)))
     else:
-      effective_fids = list(range(1, len(self._input_fields)))
-      logging.info('will not filter any input, total number inputs:%d' %
-                   len(effective_fids))
+      effective_fids = [
+          fid for fid in range(len(self._input_fields))
+          if self._input_fields[fid] not in self._label_fields
+      ]
+      logging.info(
+          'will not filter any inputs (except labels), total number of inputs: %d' %
+          len(effective_fids))
     if self._data_config.HasField('sample_weight'):
       effective_fids = effective_fids[:-1]
     input_vals = tf.reshape(
@@ -566,13 +579,7 @@ def _input_fn(mode=None, params=None, config=None):
         return dataset
       elif mode is None:  # serving_input_receiver_fn for export SavedModel
         if export_config.multi_placeholder:
-          if export_config.multi_value_fields:
-            export_fields_name = export_config.multi_value_fields.input_name
-          else:
-            export_fields_name = None
-          placeholder_named_by_input = export_config.placeholder_named_by_input
-          inputs, features = self.create_multi_placeholders(
-              placeholder_named_by_input, export_fields_name)
+          inputs, features = self.create_multi_placeholders(export_config)
           return tf.estimator.export.ServingInputReceiver(features, inputs)
         else:
           inputs, features = self.create_placeholders(export_config)
diff --git a/easy_rec/python/test/export_test.py b/easy_rec/python/test/export_test.py
index 69d339120..3f88fe1bb 100644
--- a/easy_rec/python/test/export_test.py
+++ b/easy_rec/python/test/export_test.py
@@ -81,6 +81,10 @@ def test_multi_tower(self):
     self._export_test('samples/model_config/multi_tower_export.config',
                       self._extract_data)
 
+  def test_filter_input(self):
+    self._export_test('samples/model_config/export_filter_input.config',
+                      self._extract_data)
+
   def test_mmoe(self):
     self._export_test(
         'samples/model_config/mmoe_on_taobao.config',
diff --git a/easy_rec/python/test/odps_run.py b/easy_rec/python/test/odps_run.py
index 4b6179fbc..443db118b 100644
--- a/easy_rec/python/test/odps_run.py
+++ b/easy_rec/python/test/odps_run.py
@@ -196,12 +196,8 @@ def test_boundary_test(self):
     tot.drop_table()
 
   def test_vector_retrieve(self):
-    start_files = [
-        'vector_retrieve/create_inner_vector_table.sql'
-    ]
-    test_files = [
-        'vector_retrieve/run_vector_retrieve.sql'
-    ]
+    start_files = ['vector_retrieve/create_inner_vector_table.sql']
+    test_files = ['vector_retrieve/run_vector_retrieve.sql']
     end_file = ['vector_retrieve/drop_table.sql']
     tot = OdpsTest(start_files, test_files, end_file, odps_oss_config)
     tot.start_test()
diff --git a/easy_rec/python/tools/predict_and_chk.py b/easy_rec/python/tools/predict_and_chk.py
index 8cc0f70f1..51fa945be 100644
--- a/easy_rec/python/tools/predict_and_chk.py
+++ b/easy_rec/python/tools/predict_and_chk.py
@@ -23,6 +23,11 @@
       '--cmp_res_path', type=str, default=None, help='compare result path')
   parser.add_argument(
       '--cmp_key', type=str, default='probs', help='compare key')
+  parser.add_argument(
+      '--rtp_fea_id',
+      type=int,
+      default=-1,
+      help='rtp feature column index, defaults to the last column')
   parser.add_argument('--tol', type=float, default=1e-5, help='tolerance')
   parser.add_argument(
       '--label_id',
@@ -30,9 +35,15 @@
       type=int,
       help='the label column, which is to be excluded')
   parser.add_argument(
-      '--separator', type=str, default='', help='separator between features')
+      '--separator',
+      type=str,
+      default='',
+      help='separator between features, defaults to \u0002')
   parser.add_argument(
-      '--rtp_separator', type=str, default='', help='separator')
+      '--rtp_separator',
+      type=str,
+      default='',
+      help='separator between columns, defaults to \u0001')
   args = parser.parse_args()
 
   if not args.saved_model_dir:
@@ -51,12 +62,17 @@
   logging.info('separator: ' + args.separator)
 
   predictor = Predictor(args.saved_model_dir)
+  if len(predictor.input_names) == 1:
+    assert len(
+        args.label_id
+    ) == 0, 'label_id should not be set if the rtp feature format is used.'
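+    # with the rtp feature format the saved model consumes one raw feature
+    # string per sample, so there are no label columns to exclude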
+ with open(args.input_path, 'r') as fin: batch_input = [] for line_str in fin: line_str = line_str.strip() line_tok = line_str.split(args.rtp_separator) - feature = line_tok[-1] + feature = line_tok[args.rtp_fea_id] feature = [ x for fid, x in enumerate(feature.split(args.separator)) if fid not in args.label_id diff --git a/samples/model_config/export_filter_input.config b/samples/model_config/export_filter_input.config new file mode 100644 index 000000000..42a37ff8e --- /dev/null +++ b/samples/model_config/export_filter_input.config @@ -0,0 +1,294 @@ +train_input_path: "data/test/tb_data/taobao_train_data" +eval_input_path: "data/test/tb_data/taobao_test_data" +model_dir: "experiments/multi_tower_taobao_ckpt" + +train_config { + log_step_count_steps: 100 + optimizer_config: { + adam_optimizer: { + learning_rate: { + exponential_decay_learning_rate { + initial_learning_rate: 0.001 + decay_steps: 1000 + decay_factor: 0.5 + min_learning_rate: 0.00001 + } + } + } + use_moving_average: false + } + save_checkpoints_steps: 100 + sync_replicas: True + num_steps: 200 +} + +eval_config { + metrics_set: { + auc {} + } +} + +data_config { + input_fields { + input_name:'clk' + input_type: INT32 + } + input_fields { + input_name:'buy' + input_type: INT32 + } + input_fields { + input_name: 'pid' + input_type: STRING + } + input_fields { + input_name: 'adgroup_id' + input_type: STRING + } + input_fields { + input_name: 'cate_id' + input_type: STRING + } + input_fields { + input_name: 'campaign_id' + input_type: STRING + } + input_fields { + input_name: 'customer' + input_type: STRING + } + input_fields { + input_name: 'brand' + input_type: STRING + } + input_fields { + input_name: 'user_id' + input_type: STRING + } + input_fields { + input_name: 'cms_segid' + input_type: STRING + } + input_fields { + input_name: 'cms_group_id' + input_type: STRING + } + input_fields { + input_name: 'final_gender_code' + input_type: STRING + } + input_fields { + input_name: 'age_level' + input_type: STRING + } + input_fields { + input_name: 'pvalue_level' + input_type: STRING + } + input_fields { + input_name: 'shopping_level' + input_type: STRING + } + input_fields { + input_name: 'occupation' + input_type: STRING + } + input_fields { + input_name: 'new_user_class_level' + input_type: STRING + } + input_fields { + input_name: 'tag_category_list' + input_type: STRING + } + input_fields { + input_name: 'tag_brand_list' + input_type: STRING + } + input_fields { + input_name: 'price' + input_type: INT32 + } + + label_fields: 'clk' + batch_size: 4096 + num_epochs: 10000 + prefetch_size: 32 + input_type: CSVInput +} + +feature_config: { + features: { + input_names: 'pid' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: 'adgroup_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 + } + features: { + input_names: 'cate_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10000 + } + features: { + input_names: 'campaign_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 + } + features: { + input_names: 'customer' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 + } + features: { + input_names: 'brand' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 + } + features: { + input_names: 'user_id' + feature_type: IdFeature + embedding_dim: 32 + hash_bucket_size: 100000 + } + features: { + input_names: 'cms_segid' + feature_type: IdFeature + embedding_dim: 16 + 
hash_bucket_size: 100 + } + features: { + input_names: 'cms_group_id' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 + } + features: { + input_names: 'final_gender_code' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: 'age_level' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: 'pvalue_level' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: 'shopping_level' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: 'occupation' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } + features: { + input_names: 'new_user_class_level' + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 + } +# features: { +# input_names: 'tag_category_list' +# feature_type: TagFeature +# separator: '|' +# hash_bucket_size: 100000 +# embedding_dim: 16 +# } +# features: { +# input_names: 'tag_brand_list' +# feature_type: TagFeature +# separator: '|' +# hash_bucket_size: 100000 +# embedding_dim: 16 +# } + features: { + input_names: 'price' + feature_type: IdFeature + embedding_dim: 16 + num_buckets: 50 + } +} +model_config: { + model_class: 'MultiTower' + feature_groups: { + group_name: 'user' + feature_names: 'user_id' + feature_names: 'cms_segid' + feature_names: 'cms_group_id' + feature_names: 'age_level' + feature_names: 'pvalue_level' + feature_names: 'shopping_level' + feature_names: 'occupation' + feature_names: 'new_user_class_level' + wide_deep: DEEP + } + feature_groups: { + group_name: 'item' + feature_names: 'adgroup_id' + feature_names: 'cate_id' + feature_names: 'campaign_id' + feature_names: 'customer' + feature_names: 'brand' + feature_names: 'price' + wide_deep: DEEP + } + feature_groups: { + group_name: 'combo' + feature_names: 'pid' +# feature_names: 'tag_category_list' +# feature_names: 'tag_brand_list' + wide_deep: DEEP + } + + multi_tower { + towers { + input: "user" + dnn { + hidden_units: [256, 128, 96, 64] + } + } + towers { + input: "item" + dnn { + hidden_units: [256, 128, 96, 64] + } + } + towers { + input: "combo" + dnn { + hidden_units: [128, 96, 64, 32] + } + } + final_dnn { + hidden_units: [128, 96, 64, 32, 16] + } + l2_regularization: 1e-6 + } + embedding_regularization: 1e-4 +} + +export_config { + multi_placeholder: true + filter_inputs: false +} diff --git a/samples/odps_script/vector_retrieve/drop_table.sql b/samples/odps_script/vector_retrieve/drop_table.sql index 3550efc6b..7d7f03062 100644 --- a/samples/odps_script/vector_retrieve/drop_table.sql +++ b/samples/odps_script/vector_retrieve/drop_table.sql @@ -1,3 +1,3 @@ drop TABLE IF EXISTS query_vector_{TIME_STAMP}; drop TABLE IF EXISTS doc_vector_{TIME_STAMP}; -drop TABLE IF EXISTS result_vector_{TIME_STAMP}; \ No newline at end of file +drop TABLE IF EXISTS result_vector_{TIME_STAMP}; diff --git a/samples/odps_script/vector_retrieve/run_vector_retrieve.sql b/samples/odps_script/vector_retrieve/run_vector_retrieve.sql index 2314a3eea..2f4559c54 100644 --- a/samples/odps_script/vector_retrieve/run_vector_retrieve.sql +++ b/samples/odps_script/vector_retrieve/run_vector_retrieve.sql @@ -13,4 +13,4 @@ pai -name easy_rec_ext -Dknn_feature_dims=4 -Dknn_index_type='ivfflat' -Dknn_feature_delimiter=',' -; \ No newline at end of file +; From 020d39292aded6dc46286549f2cbd388ab97618e Mon Sep 17 00:00:00 2001 From: Cosmo Zhang Date: Wed, 2 Mar 2022 10:32:30 +0800 Subject: [PATCH 
07/10] [feat] RankModel support serving on Alibaba RTP (#123)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* To adapt to BE2.0 O2O:
1. Add the OdpsRTPInputV2 input type, which additionally requires a raw fg.json file and uses RTP's native operators to parse the feature string.
2. Change deploy_ext.sh to package only EasyRec itself and upload it to ODPS, no longer deploying the XML graph (that step depends on PAI-internal scripts and does not run). The packaging flow is also optimized.

Squashed commit of the following:

commit 49bd17c96995dc96a31c29e8b81c13454bd6bd65
Author: 彭多
Date: Thu Oct 14 15:19:06 2021 +0800

    rtp fg parse ok

commit bd89bdf0759a6256e02bf7a61126d8848e4a37cf
Author: 彭多
Date: Thu Oct 14 11:37:29 2021 +0800

    dev

commit f89d7305f80d4701d5108808a6859889227e6a71
Author: 彭多
Date: Thu Oct 14 11:07:07 2021 +0800

    fix

commit 4180dce66354fd18e14b653a03e7e5d611d697f6
Author: 彭多
Date: Thu Oct 14 11:02:07 2021 +0800

    dev

commit a51c4d2fa7e1d06b56e4edffcca8aebdf6dc318a
Author: 彭多
Date: Thu Oct 14 10:59:36 2021 +0800

    dev

* disable tf shape optimize
* add rank_predict node in dssm
* add rank predict embedding nodes
* add __rank_service_* collections to model graph
* export_checkpoint
* fix merge problems
* forward rank_predict in RankModel
* fix import error
* fix: avoid shape optimization for non-OdpsRTPInputV2 input types
* fix output list
* remove useless script & add some comments
* allow specifying export outputs
* fix bug: embedding columns with the same name caused RTP to build the graph incorrectly
* fix bugs in DssmModel output building
* DssmModel: do not export rank_prediction in _prediction_dict
* fix some problems
---
 .gitignore                                    |   5 +
 easy_rec/__init__.py                          |   1 +
 .../feature_column/feature_column_v2.py       |  28 +++-
 easy_rec/python/compat/ops.py                 |  13 ++
 easy_rec/python/input/input.py                |   6 +
 easy_rec/python/input/odps_rtp_input_v2.py    | 100 ++++++++++
 easy_rec/python/layers/utils.py               | 150 ++++++++++++++++++
 easy_rec/python/main.py                       |  65 +++++++-
 easy_rec/python/model/deepfm.py               |  10 ++
 easy_rec/python/model/dssm.py                 |  25 +++
 easy_rec/python/model/easy_rec_estimator.py   |  92 ++++++++++-
 easy_rec/python/model/easy_rec_model.py       |  33 ++++
 easy_rec/python/model/rank_model.py           |  44 ++++-
 easy_rec/python/protos/dataset.proto          |   1 +
 easy_rec/python/protos/export.proto           |   6 +
 pai_jobs/deploy.sh                            |   0
 pai_jobs/deploy_ext.sh                        |   7 +-
 pai_jobs/run.py                               |  19 ++-
 18 files changed, 582 insertions(+), 23 deletions(-)
 create mode 100644 easy_rec/python/compat/ops.py
 create mode 100644 easy_rec/python/input/odps_rtp_input_v2.py
 create mode 100644 easy_rec/python/layers/utils.py
 mode change 100644 => 100755 pai_jobs/deploy.sh
 mode change 100644 => 100755 pai_jobs/deploy_ext.sh

diff --git a/.gitignore b/.gitignore
index 6b38aa74b..2d0fa8010 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,7 +22,12 @@ log
 # pai job
 pai_jobs/easy_rec
 pai_jobs/easy_rec.tar.gz
+pai_jobs/easy_rec*.tar.gz
 
 # idea files
 .idea
+
+# unit test
+/data
+/UNIT_TEST_CASE_LIST
diff --git a/easy_rec/__init__.py b/easy_rec/__init__.py
index 6b9c30155..cfafba708 100644
--- a/easy_rec/__init__.py
+++ b/easy_rec/__init__.py
@@ -20,6 +20,7 @@
 from easy_rec.python.main import distribute_evaluate  # isort:skip  # noqa: E402
 from easy_rec.python.main import export  # isort:skip  # noqa: E402
 from easy_rec.python.main import train_and_evaluate  # isort:skip  # noqa: E402
+from easy_rec.python.main import export_checkpoint  # isort:skip  # noqa: E402
 
 try:
   import tensorflow_io.oss
diff --git a/easy_rec/python/compat/feature_column/feature_column_v2.py b/easy_rec/python/compat/feature_column/feature_column_v2.py
index 27b9eabdb..be310749c 100644
--- a/easy_rec/python/compat/feature_column/feature_column_v2.py
+++ b/easy_rec/python/compat/feature_column/feature_column_v2.py
@@ -167,6 +167,8 @@
 from easy_rec.python.compat import embedding_ops as ev_embedding_ops
 from easy_rec.python.compat.feature_column import feature_column as fc_old
 from easy_rec.python.compat.feature_column import utils as fc_utils
+from easy_rec.python.compat import ops as compat_ops
+from easy_rec.python.layers import utils as layer_utils
 
 _FEATURE_COLUMN_DEPRECATION_DATE = None
 _FEATURE_COLUMN_DEPRECATION = ('The old _FeatureColumn APIs are being '
@@ -3122,8 +3124,30 @@
             trainable=self.trainable and trainable,
             partitioner=self.partitioner,
             collections=weight_collections)
-    return self._get_dense_tensor_internal_helper(sparse_tensors,
-                                                  embedding_weights)
+
+    # Write the embedding configuration to the RTP-specific collections; this
+    # informs RTP that it can optimize this embedding operation.
+    embedding_attrs = layer_utils.gen_embedding_attrs(
+        column=self,
+        variable=embedding_weights,
+        bucket_size=self.categorical_column._num_buckets,
+        combiner=self.combiner,
+        is_embedding_var=self.use_embedding_variable)
+    embedding_attrs['name'] = layer_utils.unique_name_in_collection(
+        compat_ops.GraphKeys.RANK_SERVICE_EMBEDDING, embedding_attrs['name'])
+    layer_utils.update_attr_to_collection(
+        compat_ops.GraphKeys.RANK_SERVICE_EMBEDDING, embedding_attrs)
+
+    # perform the embedding lookup
+    predictions = self._get_dense_tensor_internal_helper(
+        sparse_tensors, embedding_weights)
+
+    # Add the output and input nodes of the embedding operation to the
+    # previously written RTP-specific collection entry. RTP uses this
+    # information to extract the embedding subgraph.
+    layer_utils.append_tensor_to_collection(
+        compat_ops.GraphKeys.RANK_SERVICE_EMBEDDING,
+        embedding_attrs['name'], 'tensor', predictions)
+    layer_utils.append_tensor_to_collection(
+        compat_ops.GraphKeys.RANK_SERVICE_EMBEDDING,
+        embedding_attrs['name'], 'input', sparse_tensors.id_tensor)
+
+    return predictions
 
   def get_dense_tensor(self, transformation_cache, state_manager):
     """Returns tensor after doing the embedding lookup.
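Each entry written to the `__rank_service_embedding` collection above is a JSON string carrying the attributes built by `gen_embedding_attrs` (name, bucket_size, combiner, weights_op_path, plus the appended 'tensor'/'input' tensor infos). A minimal sketch of inspecting those entries after graph construction; `dump_rank_service_embeddings` is a hypothetical debugging helper, not part of this patch:

```python
import json

import tensorflow as tf


def dump_rank_service_embeddings(graph):
  # each collection item is a JSON string written by
  # easy_rec/python/layers/utils.py (see the new file below)
  for item in graph.get_collection('__rank_service_embedding'):
    info = json.loads(item)
    print(info['name'], info.get('bucket_size'), info.get('combiner'),
          info.get('weights_op_path'))


# usage, after the model graph has been built:
# dump_rank_service_embeddings(tf.get_default_graph())
```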
diff --git a/easy_rec/python/compat/ops.py b/easy_rec/python/compat/ops.py
new file mode 100644
index 000000000..ad1f808d9
--- /dev/null
+++ b/easy_rec/python/compat/ops.py
@@ -0,0 +1,13 @@
+from tensorflow.python.framework import ops
+
+class GraphKeys(ops.GraphKeys):
+  # For rank service
+  RANK_SERVICE_FG_CONF = "__rank_service_fg_conf"
+  RANK_SERVICE_INPUT = "__rank_service_input"
+  RANK_SERVICE_OUTPUT = "__rank_service_output"
+  RANK_SERVICE_EMBEDDING = "__rank_service_embedding"
+  RANK_SERVICE_INPUT_SRC = "__rank_service_input_src"
+  RANK_SERVICE_REPLACE_OP = "__rank_service_replace"
+  RANK_SERVICE_SHAPE_OPT_FLAG = "__rank_service_shape_opt_flag"
+  # For compatibility between RTP and EasyRec
+  RANK_SERVICE_FEATURE_NODE = "__rank_service_feature_node"
diff --git a/easy_rec/python/input/input.py b/easy_rec/python/input/input.py
index dd7c216b6..987f1acf9 100644
--- a/easy_rec/python/input/input.py
+++ b/easy_rec/python/input/input.py
@@ -282,6 +282,7 @@ def _preprocess(self, field_dict):
           parsed_dict[k] = v
           self._appended_fields.append(k)
 
+    logging.info('[input] all feature names: %s' % [fc.feature_name for fc in self._feature_configs])
     for fc in self._feature_configs:
       feature_name = fc.feature_name
       feature_type = fc.feature_type
@@ -554,6 +555,9 @@ def _lookup(args, pad=True):
   def _build(self, mode, params):
     raise NotImplementedError
 
+  def _pre_build(self, mode, params):
+    pass
+
   def create_input(self, export_config=None):
 
     def _input_fn(mode=None, params=None, config=None):
@@ -571,6 +575,7 @@ def _input_fn(mode=None, params=None, config=None):
         else, return:
             tf.estimator.export.ServingInputReceiver instance
       """
+      self._pre_build(mode, params)
       if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL,
                   tf.estimator.ModeKeys.PREDICT):
         # build dataset from self._config.input_path
@@ -583,6 +588,7 @@ def _input_fn(mode=None, params=None, config=None):
           return tf.estimator.export.ServingInputReceiver(features, inputs)
         else:
           inputs, features = self.create_placeholders(export_config)
+          logging.info('built feature placeholders. features: %s' % features.keys())
           return tf.estimator.export.ServingInputReceiver(features, inputs)
 
     return _input_fn
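The `_pre_build` hook added to `Input` above runs before the dataset or the serving placeholders are built; `OdpsRTPInputV2` below uses it to disable shape optimization. A minimal sketch of an override, with a hypothetical subclass name:

```python
import logging

from easy_rec.python.input.input import Input


class MyInput(Input):
  """Hypothetical input subclass illustrating the _pre_build hook."""

  def _pre_build(self, mode, params):
    # runs at the top of _input_fn, before _build() or
    # create_placeholders() is called
    logging.info('pre-build for mode: %s', mode)
```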
diff --git a/easy_rec/python/input/odps_rtp_input_v2.py b/easy_rec/python/input/odps_rtp_input_v2.py
new file mode 100644
index 000000000..328fc3928
--- /dev/null
+++ b/easy_rec/python/input/odps_rtp_input_v2.py
@@ -0,0 +1,100 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import logging
+
+import numpy as np
+import tensorflow as tf
+import json
+
+from easy_rec.python.input.odps_rtp_input import OdpsRTPInput
+
+try:
+  import pai
+  import rtp_fg
+except Exception:
+  pai = None
+  rtp_fg = None
+
+class OdpsRTPInputV2(OdpsRTPInput):
+  """RTPInput for parsing the new rtp fg input format on odps.
+
+  The new format (csv in table) of the rtp output is:
+      label0, item_id, ..., user_id, features
+  where the features column is in the default RTP-tensorflow format.
+  The features column and the labels are specified by
+  data_config.selected_cols; columns are selected by name from the table,
+  e.g. 'clk,features': the last selected column holds the features, the
+  preceding selected columns are the labels.
+  """
+
+  def __init__(self,
+               data_config,
+               feature_config,
+               input_path,
+               task_index=0,
+               task_num=1,
+               fg_json_path=None):
+    super(OdpsRTPInputV2, self).__init__(
+        data_config, feature_config, input_path, task_index, task_num)
+    self._fg_config_path = fg_json_path
+    logging.info('fg config path: {}'.format(self._fg_config_path))
+    if self._fg_config_path is None:
+      raise ValueError('fg_json_path is not set')
+    with tf.gfile.GFile(self._fg_config_path, 'r') as f:
+      self._fg_config = json.load(f)
+
+  def _parse_table(self, *fields):
+    self.check_rtp()
+
+    fields = list(fields)
+    labels = fields[:-1]
+
+    # only for features, labels excluded
+    record_defaults = [
+        self.get_type_defaults(t, v)
+        for x, t, v in zip(self._input_fields, self._input_field_types,
+                           self._input_field_defaults)
+        if x not in self._label_fields
+    ]
+    # assume that the last field is the generated feature column;
+    # parse_genreated_fg is the rtp_fg API name, spelling as provided
+    features = rtp_fg.parse_genreated_fg(self._fg_config, fields[-1])
+
+    field_keys = [x for x in self._input_fields if x not in self._label_fields]
+    # iterate over a copy of the keys: deleting from a dict while iterating
+    # over it raises a RuntimeError
+    for feature_key in list(features.keys()):
+      if feature_key not in field_keys or feature_key not in self._effective_fields:
+        del features[feature_key]
+    inputs = {x: features[x] for x in features.keys()}
+
+    for x in range(len(self._label_fields)):
+      inputs[self._label_fields[x]] = labels[x]
+    return inputs
+
+  def create_placeholders(self, *args, **kwargs):
+    """Create serving placeholders with rtp_fg."""
+    self.check_rtp()
+    self._mode = tf.estimator.ModeKeys.PREDICT
+    inputs_placeholder = tf.placeholder(tf.string, [None], name='features')
+    logging.info('[OdpsRTPInputV2] building placeholders.')
+    logging.info('[OdpsRTPInputV2] fg_config: {}'.format(self._fg_config))
+    features = rtp_fg.parse_genreated_fg(self._fg_config, inputs_placeholder)
+    logging.info('[OdpsRTPInputV2] built features: {}'.format(features.keys()))
+    features = self._preprocess(features)
+    logging.info('[OdpsRTPInputV2] processed features: {}'.format(features.keys()))
+    return {'features': inputs_placeholder}, features
+
+  def create_multi_placeholders(self, *args, **kwargs):
+    """Create serving multi-placeholders with rtp_fg."""
+    raise NotImplementedError(
+        'create_multi_placeholders is not supported for OdpsRTPInputV2')
+
+  def check_rtp(self):
+    if rtp_fg is None:
+      raise NotImplementedError(
+          'OdpsRTPInputV2 cannot run without rtp_fg, which is not installed')
+
+  def _pre_build(self, mode, params):
+    try:
+      # Prevent TF from replacing the shape tensor with a constant tensor;
+      # that would fix the batch size and RTP would not be able to recognize
+      # the input shape.
+      tf.get_default_graph().set_shape_optimize(False)
+    except AttributeError as e:
+      logging.warning('failed to disable shape optimization: %s', e)
diff --git a/easy_rec/python/layers/utils.py b/easy_rec/python/layers/utils.py
new file mode 100644
index 000000000..cfd5e85e8
--- /dev/null
+++ b/easy_rec/python/layers/utils.py
@@ -0,0 +1,150 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Common util functions used by layers. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import json +from tensorflow.python.framework import ops +from tensorflow.python.framework import sparse_tensor +from tensorflow.python.ops import variables +try: + from tensorflow.python.ops import kv_variable_ops +except ImportError: + kv_variable_ops = None + + +ColumnNameInCollection = {} + + +def _tensor_to_map(tensor): + return { + 'node_path' : tensor.name, + 'shape' : tensor.shape.as_list() if tensor.shape else None, + 'dtype' : tensor.dtype.name + } + + +def _tensor_to_tensorinfo(tensor): + tensor_info = {} + if isinstance(tensor, sparse_tensor.SparseTensor): + tensor_info['is_dense'] = False + tensor_info['values'] = _tensor_to_map(tensor.values) + tensor_info['indices'] = _tensor_to_map(tensor.indices) + tensor_info['dense_shape'] = _tensor_to_map(tensor.dense_shape) + else: + tensor_info['is_dense'] = True + tensor_info.update(_tensor_to_map(tensor)) + return tensor_info + + +def add_tensor_to_collection(collection_name, name, tensor): + tensor_info = _tensor_to_tensorinfo(tensor) + tensor_info['name'] = name + update_attr_to_collection(collection_name, tensor_info) + + +def append_tensor_to_collection(collection_name, name, key, tensor): + tensor_info = _tensor_to_tensorinfo(tensor) + append_attr_to_collection(collection_name, name, key, tensor_info) + + +def _collection_item_key(col, name): + return "%d#%s" % (id(col), name) + + +def _process_item(collection_name, name, func): + col = ops.get_collection_ref(collection_name) + item_found = {} + idx_found = -1 + + # add id(col) because col may re-new sometimes + key = _collection_item_key(col, name) + if key in ColumnNameInCollection: + idx_found = ColumnNameInCollection[key] + if idx_found >= len(col): + raise Exception("Find column name in collection failed: index out of range") + + item_found = json.loads(col[idx_found]) + if item_found['name'] != name: + raise Exception("Find column name in collection failed: item name not match") + func(item_found) + col[idx_found] = json.dumps(item_found) + else: + func(item_found) + col.append(json.dumps(item_found)) + ColumnNameInCollection[key] = len(col) - 1 + + +def append_attr_to_collection(collection_name, name, key, value): + def append(item_found): + if key not in item_found: + item_found[key] = [] + item_found[key].append(value) + + _process_item(collection_name, name, append) + + +def update_attr_to_collection(collection_name, attrs): + def update(item_found): + item_found.update(attrs) + + _process_item(collection_name, attrs['name'], update) + + +def unique_name_in_collection(collection_name, name): + col = ops.get_collection_ref(collection_name) + unique_name = name + index = 0 + while True: + key = _collection_item_key(col, unique_name) + if key not in ColumnNameInCollection: + break + index += 1 + unique_name = "%s_%d" % (name, index) + return unique_name + + +def gen_embedding_attrs(column=None, + variable=None, + 
bucket_size=None, + combiner=None, + is_embedding_var=None): + attrs = dict() + attrs["name"] = column.name + attrs["bucket_size"] = bucket_size + attrs["combiner"] = combiner + attrs["is_embedding_var"] = is_embedding_var + attrs["weights_op_path"] = variable.name + if kv_variable_ops: + if isinstance(variable, kv_variable_ops.EmbeddingVariable): + attrs["is_embedding_var"] = True + attrs["embedding_var_keys"] = variable._shared_name + "-keys" + attrs["embedding_var_values"] = variable._shared_name + "-values" + elif (isinstance(variable, variables.PartitionedVariable)) and \ + (isinstance(variable._get_variable_list()[0], kv_variable_ops.EmbeddingVariable)): + attrs["embedding_var_keys"] = [v._shared_name + "-keys" for v in variable] + attrs["embedding_var_values"] = [v._shared_name + "-values" for v in variable] + else: + attrs["is_embedding_var"] = False + else: + attrs["is_embedding_var"] = False + return attrs + +def mark_input_src(name, src_desc): + ops.add_to_collection(ops.GraphKeys.RANK_SERVICE_INPUT_SRC, + json.dumps({'name':name, 'src':src_desc})) diff --git a/easy_rec/python/main.py b/easy_rec/python/main.py index c30111c40..cbaaf5ed2 100644 --- a/easy_rec/python/main.py +++ b/easy_rec/python/main.py @@ -55,7 +55,8 @@ def _get_input_fn(data_config, feature_configs, data_path=None, - export_config=None): + export_config=None, + **kwargs): """Build estimator input function. Args: @@ -78,7 +79,8 @@ def _get_input_fn(data_config, feature_configs, data_path, task_index=task_id, - task_num=task_num) + task_num=task_num, + **kwargs) input_fn = input_obj.create_input(export_config) return input_fn @@ -135,9 +137,12 @@ def _create_eval_export_spec(pipeline_config, eval_data): logging.info('eval_steps = %d' % eval_steps) else: eval_steps = None + input_fn_kwargs = {} + if data_config.input_type == data_config.InputType.OdpsRTPInputV2: + input_fn_kwargs['fg_json_path'] = pipeline_config.fg_json_path # create eval input export_input_fn = _get_input_fn(data_config, feature_configs, None, - export_config) + export_config, **input_fn_kwargs) if export_config.exporter_type == 'final': exporters = [ FinalExporter(name='final', serving_input_receiver_fn=export_input_fn) @@ -178,7 +183,8 @@ def _metric_cmp_fn(best_eval_result, current_eval_result): # set throttle_secs to a small number, so that we can control evaluation # interval steps by checkpoint saving steps - eval_input_fn = _get_input_fn(data_config, feature_configs, eval_data) + eval_input_fn = _get_input_fn(data_config, feature_configs, eval_data, + **input_fn_kwargs) eval_spec = tf.estimator.EvalSpec( name='val', input_fn=eval_input_fn, @@ -301,8 +307,14 @@ def _train_and_evaluate_impl(pipeline_config, continue_train=False): logging.warn('will train INFINITE number of steps') else: logging.info('train_steps = %d' % train_steps) + + input_fn_kwargs = {} + if data_config.input_type == data_config.InputType.OdpsRTPInputV2: + input_fn_kwargs['fg_json_path'] = pipeline_config.fg_json_path + # create train input - train_input_fn = _get_input_fn(data_config, feature_configs, train_data) + train_input_fn = _get_input_fn(data_config, feature_configs, train_data, + **input_fn_kwargs) # Currently only a single Eval Spec is allowed. 
   train_spec = tf.estimator.TrainSpec(
       input_fn=train_input_fn, max_steps=train_steps)
@@ -708,8 +720,11 @@ def export(export_dir,
   # construct serving input fn
   export_config = pipeline_config.export_config
   data_config = pipeline_config.data_config
+  input_fn_kwargs = {}
+  if data_config.input_type == data_config.InputType.OdpsRTPInputV2:
+    input_fn_kwargs['fg_json_path'] = pipeline_config.fg_json_path
   serving_input_fn = _get_input_fn(data_config, feature_configs, None,
-                                   export_config)
+                                   export_config, **input_fn_kwargs)
   if 'oss_path' in extra_params:
     return export_big_model_to_oss(export_dir, pipeline_config, extra_params,
                                    serving_input_fn, estimator, checkpoint_path,
@@ -747,3 +762,41 @@
   logging.info('model has been exported to %s successfully' %
                final_export_dir)
   return final_export_dir
+
+
+def export_checkpoint(
+    pipeline_config=None,
+    export_path='',
+    checkpoint_path='',
+    asset_files=None,
+    verbose=False,
+    mode=tf.estimator.ModeKeys.PREDICT):
+  """Export the EasyRec model as a checkpoint."""
+  pipeline_config = config_util.get_configs_from_pipeline_file(pipeline_config)
+  if pipeline_config.fg_json_path:
+    fg_util.load_fg_json_to_config(pipeline_config)
+  feature_configs = config_util.get_compatible_feature_configs(pipeline_config)
+  data_config = pipeline_config.data_config
+
+  input_fn_kwargs = {}
+  if data_config.input_type == data_config.InputType.OdpsRTPInputV2:
+    input_fn_kwargs['fg_json_path'] = pipeline_config.fg_json_path
+
+  # create estimator
+  params = {'log_device_placement': verbose}
+  if asset_files:
+    logging.info('will add asset files: %s' % asset_files)
+    params['asset_files'] = asset_files
+  estimator, _ = _create_estimator(pipeline_config, params=params)
+
+  # construct serving input fn
+  export_config = pipeline_config.export_config
+  serving_input_fn = _get_input_fn(data_config, feature_configs, None,
+                                   export_config, **input_fn_kwargs)
+  estimator.export_checkpoint(
+      export_path=export_path,
+      serving_input_receiver_fn=serving_input_fn,
+      checkpoint_path=checkpoint_path,
+      mode=mode)
+
+  logging.info('model checkpoint has been exported successfully')
diff --git a/easy_rec/python/model/deepfm.py b/easy_rec/python/model/deepfm.py
index 8734de8d5..4e96bd024 100644
--- a/easy_rec/python/model/deepfm.py
+++ b/easy_rec/python/model/deepfm.py
@@ -58,6 +58,7 @@ def build_predict_graph(self):
 
     # FM
     fm_fea = fm.FM(name='fm_feature')(self._fm_features)
+    self._fm_outputs = fm_fea
 
     # Deep
     deep_layer = dnn.DNN(self._model_config.dnn, self._l2_reg, 'deep_feature',
@@ -94,3 +95,12 @@ def build_predict_graph(self):
 
     self._add_to_prediction_dict(output)
     return self._prediction_dict
+
+  def build_feature_output_dict(self):
+    outputs = super(DeepFM, self).build_feature_output_dict()
+    outputs.update({
+        'wide_features': tf.reduce_join(
+            tf.as_string(self._wide_features), axis=-1, separator=','),
+        'deep_features': tf.reduce_join(
+            tf.as_string(self._deep_features), axis=-1, separator=','),
+        'fm_outputs': tf.reduce_join(
+            tf.as_string(self._fm_outputs), axis=-1, separator=',')
+    })
+    return outputs
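A minimal driver for the new checkpoint export path; the config path is a placeholder and an empty `checkpoint_path` falls back to the latest checkpoint in `model_dir`:

```python
import easy_rec

easy_rec.export_checkpoint(
    pipeline_config='samples/model_config/multi_tower_export.config',  # placeholder path
    export_path='experiments/export_ckpt/model',
    checkpoint_path='',  # empty: use the latest checkpoint in model_dir
)
```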
diff --git a/easy_rec/python/model/dssm.py b/easy_rec/python/model/dssm.py
index 20f873677..518021418 100644
--- a/easy_rec/python/model/dssm.py
+++ b/easy_rec/python/model/dssm.py
@@ -47,6 +47,9 @@ def __init__(self,
     self.item_tower_feature, _ = self._input_layer(self._feature_dict, 'item')
     self.item_id = self.item_tower.id
 
+    self._user_tower_emb = None
+    self._item_tower_emb = None
+
     if self._loss_type in [LossType.CLASSIFICATION, LossType.L2_LOSS]:
       self._is_point_wise = True
       logging.info('Use point wise dssm.')
@@ -157,6 +160,8 @@ def build_predict_graph(self):
     else:
       self._prediction_dict['y'] = y_pred
 
+    self._user_tower_emb = user_tower_emb
+    self._item_tower_emb = item_tower_emb
     self._prediction_dict['user_emb'] = tf.reduce_join(
         tf.as_string(user_tower_emb), axis=-1, separator=',')
     self._prediction_dict['item_emb'] = tf.reduce_join(
@@ -254,3 +259,23 @@ def get_outputs(self):
       return ['y', 'user_emb', 'item_emb']
     else:
       raise ValueError('invalid loss type: %s' % str(self._loss_type))
+
+  def build_output_dict(self):
+    output_dict = super(DSSM, self).build_output_dict()
+    output_dict['user_tower_feature'] = tf.reduce_join(
+        tf.as_string(self.user_tower_feature), axis=-1, separator=',')
+    output_dict['item_tower_feature'] = tf.reduce_join(
+        tf.as_string(self.item_tower_feature), axis=-1, separator=',')
+    return output_dict
+
+  def build_rtp_output_dict(self):
+    output_dict = super(DSSM, self).build_rtp_output_dict()
+    if self._user_tower_emb is None:
+      raise ValueError('User tower embedding does not exist. '
+                       'Please check the predict graph.')
+    output_dict['user_embedding_output'] = tf.identity(
+        self._user_tower_emb, name='user_embedding_output')
+    if self._item_tower_emb is None:
+      raise ValueError('Item tower embedding does not exist. '
+                       'Please check the predict graph.')
+    output_dict['item_embedding_output'] = tf.identity(
+        self._item_tower_emb, name='item_embedding_output')
+    if self._loss_type == LossType.CLASSIFICATION:
+      if 'probs' not in self._prediction_dict:
+        raise ValueError('Probs output does not exist. '
+                         'Please check the predict graph.')
+      output_dict['rank_predict'] = tf.identity(
+          self._prediction_dict['probs'], name='rank_predict')
+    return output_dict
diff --git a/easy_rec/python/model/easy_rec_estimator.py b/easy_rec/python/model/easy_rec_estimator.py
index b64c04f06..77ec6bbf0 100644
--- a/easy_rec/python/model/easy_rec_estimator.py
+++ b/easy_rec/python/model/easy_rec_estimator.py
@@ -1,16 +1,25 @@
 # -*- encoding:utf-8 -*-
 # Copyright (c) Alibaba, Inc. and its affiliates.
from __future__ import print_function +import collections import logging import os import re import time +import json from collections import OrderedDict import tensorflow as tf +from tensorflow.python.framework.sparse_tensor import SparseTensor from tensorflow.python.ops import variables from tensorflow.python.saved_model import signature_constants +from tensorflow.python.training import checkpoint_management +from tensorflow.python.framework import ops +from tensorflow.python.eager import context +from tensorflow.python.client import session as tf_session +from tensorflow.python.training import monitored_session +from tensorflow.python.training import saver from easy_rec.python.builders import optimizer_builder from easy_rec.python.compat import optimizers @@ -18,6 +27,8 @@ from easy_rec.python.compat.early_stopping import find_early_stop_var from easy_rec.python.compat.early_stopping import stop_if_no_decrease_hook from easy_rec.python.compat.early_stopping import stop_if_no_increase_hook +from easy_rec.python.compat.ops import GraphKeys +from easy_rec.python.layers.utils import _tensor_to_tensorinfo from easy_rec.python.protos.pipeline_pb2 import EasyRecConfig from easy_rec.python.protos.train_pb2 import DistributionStrategy from easy_rec.python.utils import estimator_utils @@ -422,15 +433,20 @@ def _export_model_fn(self, features, labels, run_config, params): features, labels=None, is_training=False) - predict_dict = model.build_predict_graph() + model.build_predict_graph() - # add output info to estimator spec + export_config = self._pipeline_config.export_config outputs = {} - output_list = model.get_outputs() - for out in output_list: - assert out in predict_dict, \ - 'output node %s not in prediction_dict, can not be exported' % out - outputs[out] = predict_dict[out] + logging.info("building default outputs") + outputs.update(model.build_output_dict()) + if export_config.export_features: + logging.info("building output features") + outputs.update(model.build_feature_output_dict()) + if export_config.export_rtp_outputs: + logging.info("building RTP outputs") + outputs.update(model.build_rtp_output_dict()) + + for out in outputs: tf.logging.info( 'output %s shape: %s type: %s' % (out, outputs[out].get_shape().as_list(), outputs[out].dtype)) @@ -465,9 +481,71 @@ def _export_model_fn(self, features, labels, run_config, params): def _model_fn(self, features, labels, mode, config, params): os.environ['tf.estimator.mode'] = mode os.environ['tf.estimator.ModeKeys.TRAIN'] = tf.estimator.ModeKeys.TRAIN + if self._pipeline_config.fg_json_path: + EasyRecEstimator._write_rtp_fg_config_to_col(fg_config_path=self._pipeline_config.fg_json_path) + EasyRecEstimator._write_rtp_inputs_to_col(features) if mode == tf.estimator.ModeKeys.TRAIN: return self._train_model_fn(features, labels, config) elif mode == tf.estimator.ModeKeys.EVAL: return self._eval_model_fn(features, labels, config) elif mode == tf.estimator.ModeKeys.PREDICT: return self._export_model_fn(features, labels, config, params) + + @staticmethod + def _write_rtp_fg_config_to_col(fg_config=None, fg_config_path=None): + """Write RTP config to RTP-specified graph collections. + + Args: + fg_config: JSON-dict RTP config. If set, fg_config_path will be ignored. + fg_config_path: path to the RTP config file. 
+    """
+    if fg_config is None:
+      with tf.gfile.GFile(fg_config_path, 'r') as f:
+        fg_config = json.load(f)
+    col = ops.get_collection_ref(GraphKeys.RANK_SERVICE_FG_CONF)
+    if len(col) == 0:
+      col.append(json.dumps(fg_config))
+    else:
+      col[0] = json.dumps(fg_config)
+
+  @staticmethod
+  def _write_rtp_inputs_to_col(features):
+    """Write input node information to the RTP-specific graph collections.
+
+    Args:
+      features: the feature dictionary used as model input.
+    """
+    feature_info_map = dict()
+    for feature_name, feature_value in features.items():
+      feature_info = _tensor_to_tensorinfo(feature_value)
+      feature_info_map[feature_name] = feature_info
+    col = ops.get_collection_ref(GraphKeys.RANK_SERVICE_FEATURE_NODE)
+    if len(col) == 0:
+      col.append(json.dumps(feature_info_map))
+    else:
+      col[0] = json.dumps(feature_info_map)
+
+  def export_checkpoint(self,
+                        export_path=None,
+                        serving_input_receiver_fn=None,
+                        checkpoint_path=None,
+                        mode=tf.estimator.ModeKeys.PREDICT):
+    with context.graph_mode():
+      if not checkpoint_path:
+        # locate the latest checkpoint
+        checkpoint_path = estimator_utils.latest_checkpoint(self._model_dir)
+      if not checkpoint_path:
+        raise ValueError("Couldn't find trained model at %s." % self._model_dir)
+      with ops.Graph().as_default() as g:
+        input_receiver = serving_input_receiver_fn()
+        estimator_spec = self._call_model_fn(
+            features=input_receiver.features,
+            labels=getattr(input_receiver, 'labels', None),
+            mode=mode,
+            config=self.config)
+        with tf_session.Session(config=self._session_config) as session:
+          graph_saver = estimator_spec.scaffold.saver or saver.Saver(sharded=True)
+          graph_saver.restore(session, checkpoint_path)
+          graph_saver.save(session, export_path)
diff --git a/easy_rec/python/model/easy_rec_model.py b/easy_rec/python/model/easy_rec_model.py
index 7ea15a564..5f8256e27 100644
--- a/easy_rec/python/model/easy_rec_model.py
+++ b/easy_rec/python/model/easy_rec_model.py
@@ -103,6 +103,39 @@ def build_metric_graph(self, eval_config):
   def get_outputs(self):
     pass
 
+  def build_output_dict(self):
+    """For exporting: get the standard output nodes."""
+    outputs = {}
+    for name in self.get_outputs():
+      if name not in self._prediction_dict:
+        raise KeyError(
+            'output node {} not in prediction_dict, can not be exported'.format(name))
+      outputs[name] = self._prediction_dict[name]
+    return outputs
+
+  def build_feature_output_dict(self):
+    """For exporting: get the output feature nodes."""
+    outputs = {}
+    for feature_name in self._feature_dict:
+      out_name = 'feature_' + feature_name
+      feature_value = self._feature_dict[feature_name]
+      if isinstance(feature_value, tf.SparseTensor):
+        sparse_values = feature_value.values
+        if sparse_values.dtype != tf.string:
+          sparse_values = tf.as_string(sparse_values)
+        feature_value = tf.sparse_to_dense(feature_value.indices,
+                                           feature_value.dense_shape,
+                                           sparse_values, '')
+      elif feature_value.dtype != tf.string:
+        feature_value = tf.as_string(feature_value)
+      feature_value = tf.reduce_join(feature_value, axis=-1, separator=',')
+      outputs[out_name] = feature_value
+    return outputs
+
+  def build_rtp_output_dict(self):
+    """For exporting: get the output nodes needed for RTP inference."""
+    return {}
+
   def restore(self,
               ckpt_path,
               include_global_step=False,
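Custom models hook into this export path by overriding `build_rtp_output_dict`, as DSSM and RankModel do in this patch. A minimal sketch with a hypothetical subclass and output name:

```python
import tensorflow as tf

from easy_rec.python.model.rank_model import RankModel


class MyRankModel(RankModel):

  def build_rtp_output_dict(self):
    outputs = super(MyRankModel, self).build_rtp_output_dict()
    # expose one extra tensor under a stable node name for RTP to pick up
    outputs['my_score'] = tf.identity(
        self._prediction_dict['probs'], name='my_score')
    return outputs
```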
diff --git a/easy_rec/python/model/rank_model.py b/easy_rec/python/model/rank_model.py
index b05bac879..31cc24289 100644
--- a/easy_rec/python/model/rank_model.py
+++ b/easy_rec/python/model/rank_model.py
@@ -1,5 +1,6 @@
 # -*- encoding:utf-8 -*-
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import logging
 import tensorflow as tf
 
 from easy_rec.python.builders import loss_builder
@@ -54,9 +55,46 @@ def _output_to_prediction_impl(self,
     return prediction_dict
 
   def _add_to_prediction_dict(self, output):
-    self._prediction_dict.update(
-        self._output_to_prediction_impl(
-            output, loss_type=self._loss_type, num_class=self._num_class))
+    prediction_dict = self._output_to_prediction_impl(
+        output, loss_type=self._loss_type, num_class=self._num_class)
+    self._prediction_dict.update(prediction_dict)
+
+  def build_rtp_output_dict(self):
+    """Forward a tensor as `rank_predict`, a special output node for RTP."""
+    outputs = {}
+    outputs.update(super(RankModel, self).build_rtp_output_dict())
+    rank_predict = None
+    try:
+      op = tf.get_default_graph().get_operation_by_name('rank_predict')
+      if len(op.outputs) != 1:
+        raise ValueError(
+            ('failed to build RTP rank_predict output: op {}[{}] has output '
+             'size {}, however 1 is expected.').format(op.name, op.type,
+                                                       len(op.outputs)))
+      rank_predict = op.outputs[0]
+    except KeyError:
+      forwarded = None
+      if self._loss_type == LossType.CLASSIFICATION:
+        if 'probs' in self._prediction_dict:
+          forwarded = self._prediction_dict['probs']
+        else:
+          raise ValueError(
+              'failed to build RTP rank_predict output: a classification '
+              "model expects a 'probs' prediction, which is not found. "
+              'Please check if build_predict_graph() is called.')
+      elif self._loss_type in [LossType.L2_LOSS, LossType.SIGMOID_L2_LOSS]:
+        if 'y' in self._prediction_dict:
+          forwarded = self._prediction_dict['y']
+        else:
+          raise ValueError(
+              'failed to build RTP rank_predict output: a regression model '
+              "expects a 'y' prediction, which is not found. "
+              'Please check if build_predict_graph() is called.')
+      else:
+        logging.warning('failed to build RTP rank_predict: unsupported loss '
+                        'type {}'.format(self._loss_type))
+      if forwarded is not None:
+        rank_predict = tf.identity(forwarded, name='rank_predict')
+    if rank_predict is not None:
+      outputs['rank_predict'] = rank_predict
+    return outputs
 
   def _build_loss_impl(self,
                        loss_type,
diff --git a/easy_rec/python/protos/dataset.proto b/easy_rec/python/protos/dataset.proto
index 8059af72c..bc6875cfc 100644
--- a/easy_rec/python/protos/dataset.proto
+++ b/easy_rec/python/protos/dataset.proto
@@ -171,6 +171,7 @@ message DatasetConfig {
     RTPInput = 4;
     RTPInputV2 = 5;
     OdpsRTPInput = 6;
+    OdpsRTPInputV2 = 16;
     TFRecordInput = 7;
     BatchTFRecordInput = 14;
     // for the purpose to debug performance bottleneck of
diff --git a/easy_rec/python/protos/export.proto b/easy_rec/python/protos/export.proto
index b5b419118..937d55341 100644
--- a/easy_rec/python/protos/export.proto
+++ b/easy_rec/python/protos/export.proto
@@ -47,4 +47,10 @@ message ExportConfig {
 
   // filter out inputs, only keep effective ones
   optional bool filter_inputs = 12 [default = true];
+
+  // export the original feature values as string
+  optional bool export_features = 13 [default = false];
+
+  // export the outputs required by RTP
+  optional bool export_rtp_outputs = 14 [default = false];
 }
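With the two new flags, an export_config that also emits the feature strings and the RTP nodes could look like this (illustrative values; the field names are the ones defined above):

```protobuf
export_config {
  multi_placeholder: true
  export_features: true
  export_rtp_outputs: true
}
```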
diff --git a/pai_jobs/deploy.sh b/pai_jobs/deploy.sh
old mode 100644
new mode 100755
diff --git a/pai_jobs/deploy_ext.sh b/pai_jobs/deploy_ext.sh
old mode 100644
new mode 100755
index cadc4962a..8426596d5
--- a/pai_jobs/deploy_ext.sh
+++ b/pai_jobs/deploy_ext.sh
@@ -85,15 +85,14 @@ cd $curr_dir
 
 RES_PATH=easy_rec_ext_${VERSION}_res.tar.gz
 
-if [ ! -e easy_rec ]
+if [ -e easy_rec ]
 then
-  ln -s $root_dir/easy_rec ./
+  rm -rf easy_rec
 fi
-cp easy_rec/__init__.py easy_rec/__init__.py.bak
+cp -R $root_dir/easy_rec ./easy_rec
 sed -i -e "s/\[VERSION\]/$VERSION/g" easy_rec/__init__.py
 find -L easy_rec -name "*.pyc" | xargs rm -rf
 tar -cvzhf $RES_PATH easy_rec run.py
-mv easy_rec/__init__.py.bak easy_rec/__init__.py
 
 # 2 means generate only
 if [ $mode -ne 2 ]
diff --git a/pai_jobs/run.py b/pai_jobs/run.py
index 163eecc72..0961f7eb9 100644
--- a/pai_jobs/run.py
+++ b/pai_jobs/run.py
@@ -413,6 +413,23 @@ def main(argv):
         batch_size=FLAGS.batch_size,
         slice_id=FLAGS.task_index,
         slice_num=worker_num)
+  elif FLAGS.cmd == 'export_checkpoint':
+    check_param('export_dir')
+    check_param('config')
+    set_tf_config_and_get_train_worker_num(
+        FLAGS.ps_hosts,
+        FLAGS.worker_hosts,
+        FLAGS.task_index,
+        FLAGS.job_name,
+        eval_method='none')
+    assert len(FLAGS.worker_hosts.split(',')) == 1, 'export only needs 1 worker'
+    config_util.auto_expand_share_feature_configs(pipeline_config)
+    easy_rec.export_checkpoint(
+        pipeline_config,
+        export_path=FLAGS.export_dir + '/model',
+        checkpoint_path=FLAGS.checkpoint_path,
+        asset_files=FLAGS.asset_files,
+        verbose=FLAGS.verbose)
   elif FLAGS.cmd == 'vector_retrieve':
     check_param('knn_distance')
     assert FLAGS.knn_feature_dims is not None, '`knn_feature_dims` should not be None'
@@ -444,7 +461,7 @@ def main(argv):
       knn(FLAGS.knn_num_neighbours, FLAGS.task_index, len(worker_hosts))
   else:
     raise ValueError(
-        'cmd should be one of train/evaluate/export/predict/vector_retrieve')
+        'cmd should be one of train/evaluate/export/predict/export_checkpoint/vector_retrieve')
 
 
 if __name__ == '__main__':

From 665f61c0058d9a001624561a723f708da3d98f71 Mon Sep 17 00:00:00 2001
From: lgqfhwy
Date: Wed, 2 Mar 2022 11:32:46 +0800
Subject: [PATCH 08/10] [feat] Add numeric sequence feature (#102)

* add numeric sequence feature
---
 .gitattributes                                |   2 +
 .github/workflows/ci.yml                      |  40 +-
 docs/source/develop.md                        |   4 +-
 docs/source/quick_start/local_tutorial.md     |   2 +-
 .../feature_column/feature_column_v2.py       | 464 +++++++++++++++++-
 .../feature_column/sequence_feature_column.py |  59 ++-
 .../python/feature_column/feature_column.py   |  86 +++-
 easy_rec/python/input/input.py                |  91 +++-
 easy_rec/python/layers/input_layer.py         |  76 +--
 easy_rec/python/layers/seq_input_layer.py     |  10 +-
 easy_rec/python/model/autoint.py              |  10 +-
 easy_rec/python/protos/feature_config.proto   |  17 +-
 easy_rec/python/test/train_eval_test.py       |  65 ++-
 easy_rec/python/utils/config_util.py          |   6 +-
 easy_rec/python/utils/convert_rtp_fg.py       |  24 +-
 pre-commit                                    |   3 +
 ...ic_boundary_sequence_feature_taobao.config | 346 +++++++++++++
 ...hash_bucket_sequence_feature_taobao.config | 302 ++++++++++++
 ...num_buckets_sequence_feature_taobao.config | 308 ++++++++++++
 ...numeric_raw_sequence_feature_taobao.config | 304 ++++++++++++
 ...tl_on_multi_sequence_feature_taobao.config | 311 ++++++++++++
 ...ic_boundary_sequence_feature_taobao.config | 344 +++++++++++++
 ...hash_bucket_sequence_feature_taobao.config | 300 +++++++++++
 ...num_buckets_sequence_feature_taobao.config | 306 ++++++++++++
 ...numeric_raw_sequence_feature_taobao.config | 300 +++++++++++
 .../rtp_fg/fg_test_extensions_final.config    |   2 +
 scripts/pre-commit                            |  46 ++
 27 files changed, 3723 insertions(+), 105 deletions(-)
 create mode 100644 samples/model_config/dbmtl_on_multi_numeric_boundary_sequence_feature_taobao.config
 create mode 100644 samples/model_config/dbmtl_on_multi_numeric_hash_bucket_sequence_feature_taobao.config
 create mode 100644
samples/model_config/dbmtl_on_multi_numeric_num_buckets_sequence_feature_taobao.config create mode 100644 samples/model_config/dbmtl_on_multi_numeric_raw_sequence_feature_taobao.config create mode 100644 samples/model_config/dbmtl_on_multi_sequence_feature_taobao.config create mode 100644 samples/model_config/dbmtl_on_numeric_boundary_sequence_feature_taobao.config create mode 100644 samples/model_config/dbmtl_on_numeric_hash_bucket_sequence_feature_taobao.config create mode 100644 samples/model_config/dbmtl_on_numeric_num_buckets_sequence_feature_taobao.config create mode 100644 samples/model_config/dbmtl_on_numeric_raw_sequence_feature_taobao.config create mode 100755 scripts/pre-commit diff --git a/.gitattributes b/.gitattributes index 7fd6aee34..5e7dc28eb 100644 --- a/.gitattributes +++ b/.gitattributes @@ -5,6 +5,7 @@ data/test/inference/fg_export_multi/variables/variables.index filter=lfs diff=lf data/test/inference/tb_multitower_export/assets/pipeline.config filter=lfs diff=lfs merge=lfs -text data/test/latest_ckpt_test/model.ckpt-500.meta filter=lfs diff=lfs merge=lfs -text data/test/tb_data/taobao_test_data filter=lfs diff=lfs merge=lfs -text +data/test/tb_data/taobao_multi_seq_test_data filter=lfs diff=lfs merge=lfs -text data/test/test.csv filter=lfs diff=lfs merge=lfs -text data/test/inference/tb_multitower_placeholder_rename_export/assets/pipeline.config filter=lfs diff=lfs merge=lfs -text data/test/inference/tb_multitower_export/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text @@ -15,6 +16,7 @@ data/test/criteo_sample.tfrecord filter=lfs diff=lfs merge=lfs -text data/test/rtp/taobao_valid.csv filter=lfs diff=lfs merge=lfs -text data/test/rtp/taobao_train_feature.txt filter=lfs diff=lfs merge=lfs -text data/test/tb_data/taobao_train_data filter=lfs diff=lfs merge=lfs -text +data/test/tb_data/taobao_multi_seq_train_data filter=lfs diff=lfs merge=lfs -text data/test/inference/fg_export_single/variables/variables.index filter=lfs diff=lfs merge=lfs -text data/test/inference/lookup_data_test80.csv filter=lfs diff=lfs merge=lfs -text data/test/inference/tb_multitower_export/variables/variables.index filter=lfs diff=lfs merge=lfs -text diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8dc01a6e9..a223b9c2c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,11 +21,11 @@ jobs: TEST_DEVICES: "" run: | source activate /home/admin/tf12_py2/ - if [ ! -e "/tmp/easyrec_data_20210818.tar.gz" ] + if [ ! 
-e "/tmp/easyrec_data_20220113.tar.gz" ] then - wget https://easyrec.oss-cn-beijing.aliyuncs.com/data/easyrec_data_20210818.tar.gz -O /tmp/easyrec_data_20210818.tar.gz + wget https://easyrec.oss-cn-beijing.aliyuncs.com/data/easyrec_data_20220113.tar.gz -O /tmp/easyrec_data_20220113.tar.gz fi - tar -zvxf /tmp/easyrec_data_20210818.tar.gz + tar -zvxf /tmp/easyrec_data_20220113.tar.gz source scripts/ci_test.sh - name: LabelAndComment env: @@ -52,39 +52,39 @@ jobs: var pass_label = null; if (labels != null) { pass_label = labels.find(label=>label.name=='ci_test_passed'); - } - + } + var fail_label = null; if (labels != null) { fail_label = labels.find(label=>label.name=='ci_test_failed'); - } - + } + if (pass_label) { github.rest.issues.removeLabel({ issue_number: context.issue.number, owner: context.repo.owner, - repo: context.repo.repo, + repo: context.repo.repo, name: 'ci_test_passed' - }) - } - + }) + } + if (fail_label) { github.rest.issues.removeLabel({ issue_number: context.issue.number, owner: context.repo.owner, - repo: context.repo.repo, + repo: context.repo.repo, name: 'ci_test_failed' - }) - } - - if (CI_TEST_PASSED == 1) { + }) + } + + if (CI_TEST_PASSED == 1) { github.rest.issues.addLabels({ issue_number: context.issue.number, owner: context.repo.owner, - repo: context.repo.repo, + repo: context.repo.repo, labels: ['ci_test_passed'] }) - + github.rest.issues.createComment({ owner: context.repo.owner, repo: context.repo.repo, @@ -95,10 +95,10 @@ jobs: github.rest.issues.addLabels({ issue_number: context.issue.number, owner: context.repo.owner, - repo: context.repo.repo, + repo: context.repo.repo, labels: ['ci_test_failed'] }) - + github.rest.issues.createComment({ owner: context.repo.owner, repo: context.repo.repo, diff --git a/docs/source/develop.md b/docs/source/develop.md index 0e6318653..83cdff13e 100644 --- a/docs/source/develop.md +++ b/docs/source/develop.md @@ -55,8 +55,8 @@ TEMPDIR=/tmp python -m easy_rec.python.test.odps_run --oss_config ~/.ossutilconf 下载测试数据 ```bash -wget https://easyrec.oss-cn-beijing.aliyuncs.com/data/easyrec_data_20210818.tar.gz -tar -xvzf easyrec_data_20210818.tar.gz +wget https://easyrec.oss-cn-beijing.aliyuncs.com/data/easyrec_data_20220113.tar.gz +tar -xvzf easyrec_data_20220113.tar.gz ``` 如果您要添加新数据,请在“git commit”之前执行以下操作,以将其提交到 git-lfs: diff --git a/docs/source/quick_start/local_tutorial.md b/docs/source/quick_start/local_tutorial.md index bbde40e9d..861a00b26 100644 --- a/docs/source/quick_start/local_tutorial.md +++ b/docs/source/quick_start/local_tutorial.md @@ -5,7 +5,7 @@ ```bash git clone https://github.com/alibaba/EasyRec.git cd EasyRec -wget https://easyrec.oss-cn-beijing.aliyuncs.com/data/easyrec_data_20210818.tar.gz +wget https://easyrec.oss-cn-beijing.aliyuncs.com/data/easyrec_data_20220113.tar.gz bash scripts/gen_proto.sh # 根据proto文件生成 配置解析.py文件 python setup.py install ``` diff --git a/easy_rec/python/compat/feature_column/feature_column_v2.py b/easy_rec/python/compat/feature_column/feature_column_v2.py index be310749c..31155b75a 100644 --- a/easy_rec/python/compat/feature_column/feature_column_v2.py +++ b/easy_rec/python/compat/feature_column/feature_column_v2.py @@ -134,6 +134,7 @@ import numpy as np import six +import tensorflow as tf from tensorflow.python.eager import context from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops @@ -2984,6 +2985,451 @@ def _from_config(cls, config, custom_objects=None, columns_by_name=None): return cls(**kwargs) +class SequenceBucketizedColumn( + DenseColumn, 
+ CategoricalColumn, + fc_old._DenseColumn, # pylint: disable=protected-access + fc_old._CategoricalColumn, # pylint: disable=protected-access + collections.namedtuple('SequenceBucketizedColumn', + ('source_column', 'boundaries'))): + """See `bucketized_column`.""" + + @property + def _is_v2_column(self): + return (isinstance(self.source_column, FeatureColumn) and + self.source_column._is_v2_column) # pylint: disable=protected-access + + @property + def name(self): + """See `FeatureColumn` base class.""" + return '{}_bucketized'.format(self.source_column.name) + + @property + def raw_name(self): + """See `FeatureColumn` base class.""" + return self.source_column.raw_name + + @property + def parse_example_spec(self): + """See `FeatureColumn` base class.""" + return self.source_column.parse_example_spec + + @property + @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, + _FEATURE_COLUMN_DEPRECATION) + def _parse_example_spec(self): + return self.source_column._parse_example_spec # pylint: disable=protected-access + + @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, + _FEATURE_COLUMN_DEPRECATION) + def _transform_feature(self, inputs): + """Returns bucketized categorical `source_column` tensor.""" + source_tensor = inputs.get(self.source_column) + bucketize_values = math_ops._bucketize( + source_tensor.values, boundaries=self.boundaries) + bucketize_tensor = sparse_tensor_lib.SparseTensor( + indices=source_tensor.indices, + values=bucketize_values, + dense_shape=source_tensor.dense_shape) + return bucketize_tensor + + def transform_feature(self, transformation_cache, state_manager): + """Returns bucketized categorical `source_column` tensor.""" + source_tensor = transformation_cache.get(self.source_column, state_manager) + return math_ops._bucketize( # pylint: disable=protected-access + source_tensor, + boundaries=self.boundaries) + + @property + def variable_shape(self): + """See `DenseColumn` base class.""" + return tensor_shape.TensorShape( + tuple(self.source_column.shape) + (len(self.boundaries) + 1,)) + + @property + @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, + _FEATURE_COLUMN_DEPRECATION) + def _variable_shape(self): + return self.variable_shape + + def _get_dense_tensor_for_input_tensor(self, input_tensor): + return array_ops.one_hot( + indices=math_ops.cast(input_tensor, dtypes.int64), + depth=len(self.boundaries) + 1, + on_value=1., + off_value=0.) + + def get_dense_tensor(self, transformation_cache, state_manager): + """Returns one hot encoded dense `Tensor`.""" + input_tensor = transformation_cache.get(self, state_manager) + return self._get_dense_tensor_for_input_tensor(input_tensor) + + @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, + _FEATURE_COLUMN_DEPRECATION) + def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): + del weight_collections + del trainable + input_tensor = inputs.get(self) + return self._get_dense_tensor_for_input_tensor(input_tensor) + + @property + def num_buckets(self): + """See `CategoricalColumn` base class.""" + # By construction, source_column is always one-dimensional. 
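+    # e.g. boundaries=(10., 20.) gives len(boundaries) + 1 = 3 buckets per
+    # value, so a source_column of shape (2,) yields 3 * 2 = 6 ids in total;
+    # each position gets a disjoint id range via the offset applied in
+    # _get_sparse_tensors_for_input_tensor below.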
+ return (len(self.boundaries) + 1) * self.source_column.shape[0] + + @property + @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, + _FEATURE_COLUMN_DEPRECATION) + def _num_buckets(self): + return self.num_buckets + + def _get_sparse_tensors_for_input_tensor(self, input_sparse_tensor): + input_tensor = input_sparse_tensor.values + input_indices = input_sparse_tensor.indices + batch_size = array_ops.shape(input_tensor)[0] + # By construction, source_column is always one-dimensional. + source_dimension = self.source_column.shape[0] + + i2 = array_ops.tile(math_ops.range(0, source_dimension), [batch_size]) + # Flatten the bucket indices and unique them across dimensions + # E.g. 2nd dimension indices will range from k to 2*k-1 with k buckets + bucket_indices = ( + array_ops.reshape(input_tensor, + (-1,)) + (len(self.boundaries) + 1) * i2) + + sparse_tensor = sparse_tensor_lib.SparseTensor( + indices=input_indices, + values=bucket_indices, + dense_shape=input_sparse_tensor.dense_shape) + # Compute the third dimension explicitly instead of setting it to -1, as + # that doesn't work for dynamically shaped tensors with 0-length at runtime. + # This happens for empty sequences. + shape = array_ops.shape(sparse_tensor) + target_shape = [shape[0], shape[1], math_ops.reduce_prod(shape[2:])] + ret_seq_tensor = sparse_ops.sparse_reshape(sparse_tensor, target_shape) + return CategoricalColumn.IdWeightPair(ret_seq_tensor, None) + + def get_sparse_tensors(self, transformation_cache, state_manager): + """Converts dense inputs to SparseTensor so downstream code can use it.""" + input_tensor = transformation_cache.get(self, state_manager) + return self._get_sparse_tensors_for_input_tensor(input_tensor) + + @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, + _FEATURE_COLUMN_DEPRECATION) + def _get_sparse_tensors(self, + inputs, + weight_collections=None, + trainable=None): + """Converts dense inputs to SparseTensor so downstream code can use it.""" + del weight_collections + del trainable + input_tensor = inputs.get(self) + return self._get_sparse_tensors_for_input_tensor(input_tensor) + + @property + def parents(self): + """See 'FeatureColumn` base class.""" + return [self.source_column] + + def _get_config(self): + """See 'FeatureColumn` base class.""" + config = dict(zip(self._fields, self)) + config['source_column'] = serialize_feature_column(self.source_column) + return config + + @classmethod + def _from_config(cls, config, custom_objects=None, columns_by_name=None): + """See 'FeatureColumn` base class.""" + _check_config_keys(config, cls._fields) + kwargs = config.copy() + kwargs['source_column'] = deserialize_feature_column( + config['source_column'], custom_objects, columns_by_name) + return cls(**kwargs) + + +class SequenceNumericColumn( + DenseColumn, + CategoricalColumn, + fc_old._DenseColumn, # pylint: disable=protected-access + fc_old._CategoricalColumn, # pylint: disable=protected-access + collections.namedtuple('SequenceNumericColumn', + ('source_column', 'sequence_length'))): + """See `SequenceNumericColumn`.""" + + @property + def _is_v2_column(self): + return (isinstance(self.source_column, FeatureColumn) and + self.source_column._is_v2_column) # pylint: disable=protected-access + + @property + def name(self): + """See `FeatureColumn` base class.""" + return '{}_bucketized'.format(self.source_column.name) + + @property + def raw_name(self): + """See `FeatureColumn` base class.""" + return self.source_column.raw_name + + @property + def parse_example_spec(self): + """See 
`FeatureColumn` base class.""" + return self.source_column.parse_example_spec + + @property + @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, + _FEATURE_COLUMN_DEPRECATION) + def _parse_example_spec(self): + return self.source_column._parse_example_spec # pylint: disable=protected-access + + @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, + _FEATURE_COLUMN_DEPRECATION) + def _transform_feature(self, inputs): + """Returns bucketized categorical `source_column` tensor.""" + source_tensor = inputs.get(self.source_column) + return source_tensor + + def transform_feature(self, transformation_cache, state_manager): + """Returns bucketized categorical `source_column` tensor.""" + source_tensor = transformation_cache.get(self.source_column, state_manager) + return source_tensor + + @property + def variable_shape(self): + """See `DenseColumn` base class.""" + return tensor_shape.TensorShape( + tuple(self.source_column.shape) + (self.sequence_length,)) + + @property + @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, + _FEATURE_COLUMN_DEPRECATION) + def _variable_shape(self): + return self.variable_shape + + def _get_dense_tensor_for_input_tensor(self, input_tensor): + return array_ops.one_hot( + indices=math_ops.cast(input_tensor, dtypes.int64), + depth=self.sequence_length, + on_value=1., + off_value=0.) + + def get_dense_tensor(self, transformation_cache, state_manager): + """Returns one hot encoded dense `Tensor`.""" + input_tensor = transformation_cache.get(self, state_manager) + return self._get_dense_tensor_for_input_tensor(input_tensor) + + @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, + _FEATURE_COLUMN_DEPRECATION) + def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): + del weight_collections + del trainable + input_tensor = inputs.get(self) + return self._get_dense_tensor_for_input_tensor(input_tensor) + + def _get_sequence_dense_tensor(self, inputs): + input_tensor = inputs.get(self) + sparse_tensors = self._get_sparse_tensors_for_input_tensor( + input_tensor).id_tensor + sequence_length = fc_utils.sequence_length_from_sparse_tensor( + sparse_tensors) + sequence_length = tf.cast(sequence_length, tf.int32) + shape = array_ops.shape(sparse_tensors) + target_shape = [shape[0], shape[1], math_ops.reduce_prod(shape[2:])] + ret_tensor = tf.sparse_to_dense(sparse_tensors.indices, target_shape, + sparse_tensors.values) + return CategoricalColumn.IdWeightPair(ret_tensor, sequence_length) + + @property + def num_buckets(self): + """See `CategoricalColumn` base class.""" + # By construction, source_column is always one-dimensional. + return self.sequence_length * self.source_column.shape[0] + + @property + @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, + _FEATURE_COLUMN_DEPRECATION) + def _num_buckets(self): + return self.num_buckets + + def _get_sparse_tensors_for_input_tensor(self, sparse_tensor): + # Compute the third dimension explicitly instead of setting it to -1, as + # that doesn't work for dynamically shaped tensors with 0-length at runtime. + # This happens for empty sequences. 
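+    # e.g. a dense_shape of [B, T, D1, D2] becomes [B, T, D1 * D2]; for a
+    # plain [B, T] input the empty tail reduces to 1, giving [B, T, 1].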
+ shape = array_ops.shape(sparse_tensor) + target_shape = [shape[0], shape[1], math_ops.reduce_prod(shape[2:])] + ret_seq_tensor = sparse_ops.sparse_reshape(sparse_tensor, target_shape) + return CategoricalColumn.IdWeightPair(ret_seq_tensor, None) + + def get_sparse_tensors(self, transformation_cache, state_manager): + """Converts dense inputs to SparseTensor so downstream code can use it.""" + input_tensor = transformation_cache.get(self, state_manager) + return self._get_sparse_tensors_for_input_tensor(input_tensor) + + @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, + _FEATURE_COLUMN_DEPRECATION) + def _get_sparse_tensors(self, + inputs, + weight_collections=None, + trainable=None): + """Converts dense inputs to SparseTensor so downstream code can use it.""" + del weight_collections + del trainable + input_tensor = inputs.get(self) + return self._get_sparse_tensors_for_input_tensor(input_tensor) + + @property + def parents(self): + """See 'FeatureColumn` base class.""" + return [self.source_column] + + def _get_config(self): + """See 'FeatureColumn` base class.""" + config = dict(zip(self._fields, self)) + config['source_column'] = serialize_feature_column(self.source_column) + return config + + @classmethod + def _from_config(cls, config, custom_objects=None, columns_by_name=None): + """See 'FeatureColumn` base class.""" + _check_config_keys(config, cls._fields) + kwargs = config.copy() + kwargs['source_column'] = deserialize_feature_column( + config['source_column'], custom_objects, columns_by_name) + return cls(**kwargs) + + +class SequenceWeightedCategoricalColumn( + CategoricalColumn, + fc_old._CategoricalColumn, # pylint: disable=protected-access + collections.namedtuple( + 'SequenceWeightedCategoricalColumn', + ('categorical_column', 'weight_feature_key', 'dtype'))): + """See `weighted_categorical_column`.""" + + @property + def _is_v2_column(self): + return (isinstance(self.categorical_column, FeatureColumn) and + self.categorical_column._is_v2_column) # pylint: disable=protected-access + + @property + def name(self): + """See `FeatureColumn` base class.""" + return '{}_weighted_by_{}'.format(self.categorical_column.name, + self.weight_feature_key) + + @property + def raw_name(self): + """See `FeatureColumn` base class.""" + return self.categorical_column.raw_name + + @property + def parse_example_spec(self): + """See `FeatureColumn` base class.""" + config = self.categorical_column.parse_example_spec + if self.weight_feature_key in config: + raise ValueError('Parse config {} already exists for {}.'.format( + config[self.weight_feature_key], self.weight_feature_key)) + config[self.weight_feature_key] = parsing_ops.VarLenFeature(self.dtype) + return config + + @property + @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, + _FEATURE_COLUMN_DEPRECATION) + def _parse_example_spec(self): + config = self.categorical_column._parse_example_spec # pylint: disable=protected-access + if self.weight_feature_key in config: + raise ValueError('Parse config {} already exists for {}.'.format( + config[self.weight_feature_key], self.weight_feature_key)) + config[self.weight_feature_key] = parsing_ops.VarLenFeature(self.dtype) + return config + + @property + def num_buckets(self): + """See `DenseColumn` base class.""" + return self.categorical_column.num_buckets + + @property + @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, + _FEATURE_COLUMN_DEPRECATION) + def _num_buckets(self): + return self.categorical_column._num_buckets # pylint: disable=protected-access + + def 
_transform_weight_tensor(self, weight_tensor): + if weight_tensor is None: + raise ValueError('Missing weights {}.'.format(self.weight_feature_key)) + weight_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor( + weight_tensor) + if self.dtype != weight_tensor.dtype.base_dtype: + raise ValueError('Bad dtype, expected {}, but got {}.'.format( + self.dtype, weight_tensor.dtype)) + if not isinstance(weight_tensor, sparse_tensor_lib.SparseTensor): + # The weight tensor can be a regular Tensor. In this case, sparsify it. + weight_tensor = _to_sparse_input_and_drop_ignore_values( + weight_tensor, ignore_value=0.0) + if not weight_tensor.dtype.is_floating: + weight_tensor = math_ops.cast(weight_tensor, dtypes.float32) + shape = tf.shape(weight_tensor) + target_shape = [shape[0], shape[1], math_ops.reduce_prod(shape[2:])] + weight_tensor = sparse_ops.sparse_reshape(weight_tensor, target_shape) + return weight_tensor + + def transform_feature(self, transformation_cache, state_manager): + """Applies weights to tensor generated from `categorical_column`'.""" + weight_tensor = transformation_cache.get(self.weight_feature_key, + state_manager) + weight_tensor = self._transform_weight_tensor(weight_tensor) + return (transformation_cache.get(self.categorical_column, + state_manager), weight_tensor) + + @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, + _FEATURE_COLUMN_DEPRECATION) + def _transform_feature(self, inputs): + """Applies weights to tensor generated from `categorical_column`'.""" + weight_tensor = inputs.get(self.weight_feature_key) + weight_tensor = self._transform_weight_tensor(weight_tensor) + return (inputs.get(self.categorical_column), weight_tensor) + + def get_sparse_tensors(self, transformation_cache, state_manager): + """See `CategoricalColumn` base class.""" + tensors = transformation_cache.get(self, state_manager) + return CategoricalColumn.IdWeightPair(tensors[0], tensors[1]) + + @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, + _FEATURE_COLUMN_DEPRECATION) + def _get_sparse_tensors(self, + inputs, + weight_collections=None, + trainable=None): + del weight_collections + del trainable + tensors = inputs.get(self) + return CategoricalColumn.IdWeightPair(tensors[0], tensors[1]) + + @property + def parents(self): + """See 'FeatureColumn` base class.""" + return [self.categorical_column, self.weight_feature_key] + + def _get_config(self): + """See 'FeatureColumn` base class.""" + config = dict(zip(self._fields, self)) + config['categorical_column'] = serialize_feature_column( + self.categorical_column) + config['dtype'] = self.dtype.name + return config + + @classmethod + def _from_config(cls, config, custom_objects=None, columns_by_name=None): + """See 'FeatureColumn` base class.""" + _check_config_keys(config, cls._fields) + kwargs = config.copy() + kwargs['categorical_column'] = deserialize_feature_column( + config['categorical_column'], custom_objects, columns_by_name) + kwargs['dtype'] = dtypes.as_dtype(config['dtype']) + return cls(**kwargs) + + class EmbeddingColumn( DenseColumn, SequenceDenseColumn, @@ -3226,7 +3672,9 @@ def _get_sequence_dense_tensor(self, trainable=None): if not isinstance( self.categorical_column, - (SequenceCategoricalColumn, fc_old._SequenceCategoricalColumn)): # pylint: disable=protected-access + (SequenceCategoricalColumn, fc_old._SequenceCategoricalColumn, + SequenceBucketizedColumn, SequenceNumericColumn, + SequenceWeightedCategoricalColumn)): # pylint: disable=protected-access raise ValueError( 'In embedding_column: {}. 
' 'categorical_column must be of type SequenceCategoricalColumn ' @@ -4551,13 +4999,21 @@ def _parse_example_spec(self): def transform_feature(self, transformation_cache, state_manager): """See `FeatureColumn` base class.""" - return self.categorical_column.transform_feature(transformation_cache, - state_manager) + ret_tensor = self.categorical_column.transform_feature( + transformation_cache, state_manager) + shape = array_ops.shape(ret_tensor) + target_shape = [shape[0], shape[1], math_ops.reduce_prod(shape[2:])] + ret_tensor = sparse_ops.sparse_reshape(ret_tensor, target_shape) + return ret_tensor @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, _FEATURE_COLUMN_DEPRECATION) def _transform_feature(self, inputs): - return self.categorical_column._transform_feature(inputs) # pylint: disable=protected-access + ret_tensor = self.categorical_column._transform_feature(inputs) + shape = array_ops.shape(ret_tensor) + target_shape = [shape[0], shape[1], math_ops.reduce_prod(shape[2:])] + ret_tensor = sparse_ops.sparse_reshape(ret_tensor, target_shape) + return ret_tensor @property def num_buckets(self): diff --git a/easy_rec/python/compat/feature_column/sequence_feature_column.py b/easy_rec/python/compat/feature_column/sequence_feature_column.py index 10382bbb6..f4994103c 100644 --- a/easy_rec/python/compat/feature_column/sequence_feature_column.py +++ b/easy_rec/python/compat/feature_column/sequence_feature_column.py @@ -29,9 +29,11 @@ from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops from tensorflow.python.ops import check_ops +from tensorflow.python.ops import math_ops from tensorflow.python.ops import parsing_ops from tensorflow.python.ops import sparse_ops +from easy_rec.python.compat.feature_column import feature_column as fc_v1 from easy_rec.python.compat.feature_column import feature_column_v2 as fc from easy_rec.python.compat.feature_column import utils as fc_utils @@ -236,6 +238,47 @@ def sequence_categorical_column_with_identity(key, key=key, num_buckets=num_buckets, default_value=default_value)) +def sequence_numeric_column_with_bucketized_column(source_column, boundaries): + if not isinstance(source_column, (SequenceNumericColumn,)): # pylint: disable=protected-access + raise ValueError( + 'source_column must be a column generated with sequence_numeric_column(). ' + 'Given: {}'.format(source_column)) + if len(source_column.shape) > 1: + raise ValueError('source_column must be one-dimensional column. ' + 'Given: {}'.format(source_column)) + if not boundaries: + raise ValueError('boundaries must not be empty.') + if not (isinstance(boundaries, list) or isinstance(boundaries, tuple)): + raise ValueError('boundaries must be a sorted list.') + for i in range(len(boundaries) - 1): + if boundaries[i] >= boundaries[i + 1]: + raise ValueError('boundaries must be a sorted list.') + return fc.SequenceBucketizedColumn(source_column, tuple(boundaries)) + + +def sequence_numeric_column_with_raw_column(source_column, sequence_length): + if not isinstance(source_column, (SequenceNumericColumn,)): # pylint: disable=protected-access + raise ValueError( + 'source_column must be a column generated with sequence_numeric_column(). ' + 'Given: {}'.format(source_column)) + if len(source_column.shape) > 1: + raise ValueError('source_column must be one-dimensional column. 
' + 'Given: {}'.format(source_column)) + + return fc.SequenceNumericColumn(source_column, sequence_length) + + +def sequence_weighted_categorical_column(categorical_column, + weight_feature_key, + dtype=dtypes.float32): + if (dtype is None) or not (dtype.is_integer or dtype.is_floating): + raise ValueError('dtype {} is not convertible to float.'.format(dtype)) + return fc.SequenceWeightedCategoricalColumn( + categorical_column=categorical_column, + weight_feature_key=weight_feature_key, + dtype=dtype) + + def sequence_categorical_column_with_hash_bucket(key, hash_bucket_size, dtype=dtypes.string): @@ -485,7 +528,7 @@ def _assert_all_equal_and_return(tensors, name=None): class SequenceNumericColumn( - fc.SequenceDenseColumn, + fc.SequenceDenseColumn, fc_v1._FeatureColumn, collections.namedtuple( 'SequenceNumericColumn', ('key', 'shape', 'default_value', 'dtype', 'normalizer_fn'))): @@ -500,11 +543,23 @@ def name(self): """See `FeatureColumn` base class.""" return self.key + @property + def raw_name(self): + """See `FeatureColumn` base class.""" + return self.key + @property def parse_example_spec(self): """See `FeatureColumn` base class.""" return {self.key: parsing_ops.VarLenFeature(self.dtype)} + def _transform_feature(self, inputs): + input_tensor = inputs.get(self.key) + return self._transform_input_tensor(input_tensor) + + def _transform_input_tensor(self, input_tensor): + return math_ops.cast(input_tensor, dtypes.float32) + def transform_feature(self, transformation_cache, state_manager): """See `FeatureColumn` base class. @@ -522,7 +577,7 @@ def transform_feature(self, transformation_cache, state_manager): input_tensor = transformation_cache.get(self.key, state_manager) if self.normalizer_fn is not None: input_tensor = self.normalizer_fn(input_tensor) - return input_tensor + return self._transform_input_tensor(input_tensor) @property def variable_shape(self): diff --git a/easy_rec/python/feature_column/feature_column.py b/easy_rec/python/feature_column/feature_column.py index 5a208591c..cf34cb7af 100644 --- a/easy_rec/python/feature_column/feature_column.py +++ b/easy_rec/python/feature_column/feature_column.py @@ -379,28 +379,74 @@ def parse_sequence_feature(self, config): """ feature_name = config.feature_name if config.HasField('feature_name') \ else config.input_names[0] - if config.HasField('hash_bucket_size'): - hash_bucket_size = config.hash_bucket_size - fc = sequence_feature_column.sequence_categorical_column_with_hash_bucket( - config.input_names[0], hash_bucket_size, dtype=tf.string) - elif config.vocab_list: - fc = sequence_feature_column.sequence_categorical_column_with_vocabulary_list( - config.input_names[0], - default_value=0, - vocabulary_list=config.vocab_list) - elif config.vocab_file: - fc = sequence_feature_column.sequence_categorical_column_with_vocabulary_file( - config.input_names[0], - default_value=0, - vocabulary_file=config.vocab_file, - vocabulary_size=self._get_vocab_size(config.vocab_file)) + sub_feature_type = config.sub_feature_type + assert sub_feature_type in [config.IdFeature, config.RawFeature], \ + 'Current sub_feature_type only support IdFeature and RawFeature.' 
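+    # Dispatch on sub_feature_type:
+    # - IdFeature: keep the original categorical paths (hash_bucket_size,
+    #   vocab_list, vocab_file, or identity with num_buckets).
+    # - RawFeature: start from a sequence_numeric_column, bucketize it when
+    #   boundaries are given (or num_buckets over normalized values); with
+    #   only embedding_dim set, project the raw values through a weighted
+    #   identity column over the '_raw_proj_id' / '_raw_proj_val' fields
+    #   appended by Input._preprocess; otherwise keep the raw values.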
+ if sub_feature_type == config.IdFeature: + if config.HasField('hash_bucket_size'): + hash_bucket_size = config.hash_bucket_size + fc = sequence_feature_column.sequence_categorical_column_with_hash_bucket( + config.input_names[0], hash_bucket_size, dtype=tf.string) + elif config.vocab_list: + fc = sequence_feature_column.sequence_categorical_column_with_vocabulary_list( + config.input_names[0], + default_value=0, + vocabulary_list=config.vocab_list) + elif config.vocab_file: + fc = sequence_feature_column.sequence_categorical_column_with_vocabulary_file( + config.input_names[0], + default_value=0, + vocabulary_file=config.vocab_file, + vocabulary_size=self._get_vocab_size(config.vocab_file)) + else: + fc = sequence_feature_column.sequence_categorical_column_with_identity( + config.input_names[0], config.num_buckets, default_value=0) else: - fc = sequence_feature_column.sequence_categorical_column_with_identity( - config.input_names[0], config.num_buckets, default_value=0) - - assert config.embedding_dim > 0 + bounds = None + fc = sequence_feature_column.sequence_numeric_column( + config.input_names[0], shape=(1,)) + if config.hash_bucket_size > 0: + hash_bucket_size = config.hash_bucket_size + assert sub_feature_type == config.IdFeature, \ + 'You should set sub_feature_type to IdFeature to use hash_bucket_size.' + elif config.boundaries: + bounds = list(config.boundaries) + bounds.sort() + elif config.num_buckets > 1 and config.max_val > config.min_val: + # the feature values are already normalized into [0, 1] + bounds = [ + x / float(config.num_buckets) for x in range(0, config.num_buckets) + ] + logging.info('sequence feature discrete %s into %d buckets' % + (feature_name, config.num_buckets)) + if bounds: + try: + fc = sequence_feature_column.sequence_numeric_column_with_bucketized_column( + fc, bounds) + except Exception as e: + tf.logging.error( + 'sequence features bucketized_column [%s] with bounds %s error' % + (config.input_names[0], str(bounds))) + raise e + elif config.hash_bucket_size <= 0: + if config.embedding_dim > 0: + tmp_id_col = sequence_feature_column.sequence_categorical_column_with_identity( + config.input_names[0] + '_raw_proj_id', + config.raw_input_dim, + default_value=0) + wgt_fc = sequence_feature_column.sequence_weighted_categorical_column( + tmp_id_col, + weight_feature_key=config.input_names[0] + '_raw_proj_val', + dtype=tf.float32) + fc = wgt_fc + else: + fc = sequence_feature_column.sequence_numeric_column_with_raw_column( + fc, config.sequence_length) - self._add_deep_embedding_column(fc, config) + if config.embedding_dim > 0: + self._add_deep_embedding_column(fc, config) + else: + self._sequence_columns[feature_name] = fc def _build_partitioner(self, max_partitions): if max_partitions > 1: diff --git a/easy_rec/python/input/input.py b/easy_rec/python/input/input.py index 987f1acf9..c0d8653bf 100644 --- a/easy_rec/python/input/input.py +++ b/easy_rec/python/input/input.py @@ -350,6 +350,7 @@ def _preprocess(self, field_dict): elif feature_type == fc.SequenceFeature: input_0 = fc.input_names[0] field = field_dict[input_0] + sub_feature_type = fc.sub_feature_type # Construct the output of SeqFeature according to the dimension of field_dict. # When the input field exceeds 2 dimensions, convert SeqFeature to 2D output. 
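+          # A SequenceFeature cell carries up to sequence_length items of
+          # raw_input_dim values each (split on `separator` and
+          # `seq_multi_sep`), so the SparseTensor built below is either
+          # [batch_size, seq_len] or [batch_size, seq_len, raw_input_dim].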
if len(field.get_shape()) < 2:
@@ -367,7 +368,7 @@ def _preprocess(self, field_dict):
               axis=0)
           parsed_dict[input_0] = tf.sparse.SparseTensor(
               out_indices, multi_vals.values, out_shape)
-          if fc.num_buckets > 0:
+          if (fc.num_buckets > 1 and fc.max_val == fc.min_val):
             parsed_dict[input_0] = tf.sparse.SparseTensor(
                 parsed_dict[input_0].indices,
                 tf.string_to_number(
@@ -375,8 +376,96 @@ def _preprocess(self, field_dict):
                     tf.int64,
                     name='sequence_str_2_int_%s' % input_0),
                 parsed_dict[input_0].dense_shape)
+          elif sub_feature_type == fc.RawFeature:
+            parsed_dict[input_0] = tf.sparse.SparseTensor(
+                parsed_dict[input_0].indices,
+                tf.string_to_number(
+                    parsed_dict[input_0].values,
+                    tf.float32,
+                    name='sequence_str_2_float_%s' % input_0),
+                parsed_dict[input_0].dense_shape)
+            if fc.num_buckets > 1 and fc.max_val > fc.min_val:
+              normalized_values = (parsed_dict[input_0].values - fc.min_val) / (
+                  fc.max_val - fc.min_val)
+              parsed_dict[input_0] = tf.sparse.SparseTensor(
+                  parsed_dict[input_0].indices, normalized_values,
+                  parsed_dict[input_0].dense_shape)
           else:
             parsed_dict[input_0] = field
+          if not fc.boundaries and fc.num_buckets <= 1 and fc.hash_bucket_size <= 0 and \
+              self._data_config.sample_weight != input_0 and sub_feature_type == fc.RawFeature and \
+              fc.raw_input_dim == 1:
+            # may be needed by the wide and deep models to project
+            # raw values to a vector; this may be better implemented
+            # with a ProjectionColumn later
+            logging.info(
+                'boundaries, num_buckets and hash_bucket_size are not set, %s will be processed as a two-dimensional raw feature'
+                % input_0)
+            parsed_dict[input_0] = tf.sparse_to_dense(
+                parsed_dict[input_0].indices,
+                [tf.shape(parsed_dict[input_0])[0], fc.sequence_length],
+                parsed_dict[input_0].values)
+            sample_num = tf.to_int64(tf.shape(parsed_dict[input_0])[0])
+            indices_0 = tf.range(sample_num, dtype=tf.int64)
+            indices_1 = tf.range(fc.sequence_length, dtype=tf.int64)
+            indices_0 = indices_0[:, None]
+            indices_1 = indices_1[None, :]
+            indices_0 = tf.tile(indices_0, [1, fc.sequence_length])
+            indices_1 = tf.tile(indices_1, [sample_num, 1])
+            indices_0 = tf.reshape(indices_0, [-1, 1])
+            indices_1 = tf.reshape(indices_1, [-1, 1])
+            indices = tf.concat([indices_0, indices_1], axis=1)
+            parsed_dict[input_0 + '_raw_proj_id'] = tf.SparseTensor(
+                indices=indices,
+                values=indices_1[:, 0],
+                dense_shape=[sample_num, fc.sequence_length])
+            parsed_dict[input_0 + '_raw_proj_val'] = tf.SparseTensor(
+                indices=indices,
+                values=tf.reshape(parsed_dict[input_0], [-1]),
+                dense_shape=[sample_num, fc.sequence_length])
+            self._appended_fields.append(input_0 + '_raw_proj_id')
+            self._appended_fields.append(input_0 + '_raw_proj_val')
+          elif not fc.boundaries and fc.num_buckets <= 1 and fc.hash_bucket_size <= 0 and \
+              self._data_config.sample_weight != input_0 and sub_feature_type == fc.RawFeature and \
+              fc.raw_input_dim > 1:
+            # for 3-dimensional sequence feature input.
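+            # As in the raw_input_dim == 1 branch above, '_raw_proj_id' pairs
+            # every raw value with an integer id and '_raw_proj_val' carries
+            # the value itself, so parse_sequence_feature can later build a
+            # weighted identity column that projects the raw sequence into
+            # the embedding space.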
+            # may be needed by the wide and deep models to project
+            # raw values to a vector; this may be better implemented
+            # with a ProjectionColumn later
+            logging.info(
+                'boundaries, num_buckets and hash_bucket_size are not set, %s will be processed as a three-dimensional raw feature'
+                % input_0)
+            parsed_dict[input_0] = tf.sparse_to_dense(
+                parsed_dict[input_0].indices, [
+                    tf.shape(parsed_dict[input_0])[0], fc.sequence_length,
+                    fc.raw_input_dim
+                ], parsed_dict[input_0].values)
+            sample_num = tf.to_int64(tf.shape(parsed_dict[input_0])[0])
+            indices_0 = tf.range(sample_num, dtype=tf.int64)
+            indices_1 = tf.range(fc.sequence_length, dtype=tf.int64)
+            indices_2 = tf.range(fc.raw_input_dim, dtype=tf.int64)
+            indices_0 = indices_0[:, None, None]
+            indices_1 = indices_1[None, :, None]
+            indices_2 = indices_2[None, None, :]
+            indices_0 = tf.tile(indices_0,
+                                [1, fc.sequence_length, fc.raw_input_dim])
+            indices_1 = tf.tile(indices_1, [sample_num, 1, fc.raw_input_dim])
+            indices_2 = tf.tile(indices_2, [sample_num, fc.sequence_length, 1])
+            indices_0 = tf.reshape(indices_0, [-1, 1])
+            indices_1 = tf.reshape(indices_1, [-1, 1])
+            indices_2 = tf.reshape(indices_2, [-1, 1])
+            indices = tf.concat([indices_0, indices_1, indices_2], axis=1)
+
+            parsed_dict[input_0 + '_raw_proj_id'] = tf.SparseTensor(
+                indices=indices,
+                values=indices_1[:, 0],
+                dense_shape=[sample_num, fc.sequence_length, fc.raw_input_dim])
+            parsed_dict[input_0 + '_raw_proj_val'] = tf.SparseTensor(
+                indices=indices,
+                values=tf.reshape(parsed_dict[input_0], [-1]),
+                dense_shape=[sample_num, fc.sequence_length, fc.raw_input_dim])
+            self._appended_fields.append(input_0 + '_raw_proj_id')
+            self._appended_fields.append(input_0 + '_raw_proj_val')
         elif feature_type == fc.RawFeature:
           input_0 = fc.input_names[0]
           if field_dict[input_0].dtype == tf.string:
diff --git a/easy_rec/python/layers/input_layer.py b/easy_rec/python/layers/input_layer.py
index 3085adc7a..001873abb 100644
--- a/easy_rec/python/layers/input_layer.py
+++ b/easy_rec/python/layers/input_layer.py
@@ -38,20 +38,21 @@ def __init__(self,
     self._feature_groups = {
         x.group_name: FeatureGroup(x) for x in feature_groups_config
     }
-    self._seq_feature_groups_config = [
-        x.sequence_features
-        for x in feature_groups_config
-        if x.HasField('sequence_features')
-    ]
+    self._seq_feature_groups_config = []
+    for x in feature_groups_config:
+      for y in x.sequence_features:
+        self._seq_feature_groups_config.append(y)
     self._group_name_to_seq_features = {
         x.group_name: x.sequence_features
         for x in feature_groups_config
-        if x.HasField('sequence_features')
+        if len(x.sequence_features) > 0
     }
     self._seq_input_layer = None
     if len(self._seq_feature_groups_config) > 0:
       self._seq_input_layer = seq_input_layer.SeqInputLayer(
-          feature_configs, self._seq_feature_groups_config)
+          feature_configs,
+          self._seq_feature_groups_config,
+          use_embedding_variable=use_embedding_variable)
     wide_and_deep_dict = self.get_wide_deep_dict()
     self._fc_parser = FeatureColumnParser(
         feature_configs,
@@ -100,30 +101,38 @@ def target_attention(self, dnn_config, deep_fea, name):
 
   def call_seq_input_layer(self,
                            features,
-                           seq_att_map_config,
+                           all_seq_att_map_config,
                            feature_name_to_output_tensors=None):
-    group_name = seq_att_map_config.group_name
-    allow_key_search = seq_att_map_config.allow_key_search
-    seq_features = self._seq_input_layer(features, group_name,
-                                         feature_name_to_output_tensors,
-                                         allow_key_search)
-    regularizers.apply_regularization(
-        self._embedding_regularizer, weights_list=[seq_features['key']])
-
-    
regularizers.apply_regularization( - self._embedding_regularizer, - weights_list=[seq_features['hist_seq_emb']]) - seq_dnn_config = None - if seq_att_map_config.HasField('seq_dnn'): - seq_dnn_config = seq_att_map_config.seq_dnn - else: - logging.info( - 'seq_dnn not set in seq_att_groups, will use default settings') - from easy_rec.python.protos.dnn_pb2 import DNN - seq_dnn_config = DNN() - seq_dnn_config.hidden_units.extend([128, 64, 32, 1]) - seq_fea = self.target_attention( - seq_dnn_config, seq_features, name='seq_dnn') - return seq_fea + all_seq_fea = [] + # process all sequence features + for seq_att_map_config in all_seq_att_map_config: + group_name = seq_att_map_config.group_name + allow_key_search = seq_att_map_config.allow_key_search + seq_features = self._seq_input_layer(features, group_name, + feature_name_to_output_tensors, + allow_key_search) + regularizers.apply_regularization( + self._embedding_regularizer, weights_list=[seq_features['key']]) + regularizers.apply_regularization( + self._embedding_regularizer, + weights_list=[seq_features['hist_seq_emb']]) + seq_dnn_config = None + if seq_att_map_config.HasField('seq_dnn'): + seq_dnn_config = seq_att_map_config.seq_dnn + else: + logging.info( + 'seq_dnn not set in seq_att_groups, will use default settings') + # If not set seq_dnn, will use default settings + from easy_rec.python.protos.dnn_pb2 import DNN + seq_dnn_config = DNN() + seq_dnn_config.hidden_units.extend([128, 64, 32, 1]) + cur_target_attention_name = 'seq_dnn' + group_name + seq_fea = self.target_attention( + seq_dnn_config, seq_features, name=cur_target_attention_name) + all_seq_fea.append(seq_fea) + # concat all seq_fea + all_seq_fea = tf.concat(all_seq_fea, axis=1) + return all_seq_fea def __call__(self, features, group_name, is_combine=True): """Get features by group_name. 
@@ -145,9 +154,10 @@ def __call__(self, features, group_name, is_combine=True): group_name, ','.join([x for x in self._feature_groups])) feature_name_to_output_tensors = {} if group_name in self._group_name_to_seq_features: - for seq_att in self._group_name_to_seq_features[group_name].seq_att_map: - for k in seq_att.key: - feature_name_to_output_tensors[k] = None + for seq_feature in self._group_name_to_seq_features[group_name]: + for seq_att in seq_feature.seq_att_map: + for k in seq_att.key: + feature_name_to_output_tensors[k] = None if is_combine: concat_features, group_features = self.single_call_input_layer( features, group_name, is_combine, feature_name_to_output_tensors) diff --git a/easy_rec/python/layers/seq_input_layer.py b/easy_rec/python/layers/seq_input_layer.py index ee27f8039..4f0cdab0c 100644 --- a/easy_rec/python/layers/seq_input_layer.py +++ b/easy_rec/python/layers/seq_input_layer.py @@ -15,12 +15,18 @@ class SeqInputLayer(object): - def __init__(self, feature_configs, feature_groups_config): + def __init__(self, + feature_configs, + feature_groups_config, + use_embedding_variable=False): self._feature_groups_config = { x.group_name: x for x in feature_groups_config } wide_and_deep_dict = self.get_wide_deep_dict() - self._fc_parser = FeatureColumnParser(feature_configs, wide_and_deep_dict) + self._fc_parser = FeatureColumnParser( + feature_configs, + wide_and_deep_dict, + use_embedding_variable=use_embedding_variable) def __call__(self, features, diff --git a/easy_rec/python/model/autoint.py b/easy_rec/python/model/autoint.py index fc9c05ca5..b7013486e 100644 --- a/easy_rec/python/model/autoint.py +++ b/easy_rec/python/model/autoint.py @@ -28,11 +28,11 @@ def __init__(self, self._features, _ = self._input_layer(self._feature_dict, 'all') self._feature_num = len(self._model_config.feature_groups[0].feature_names) self._seq_key_num = 0 - if self._model_config.feature_groups[0].HasField('sequence_features'): - self._feature_num += len(self._model_config.feature_groups[0] - .sequence_features.seq_att_map[0].hist_seq) - self._seq_key_num = len(self._model_config.feature_groups[0] - .sequence_features.seq_att_map[0].key) + if len(self._model_config.feature_groups[0].sequence_features) > 0: + for seq_fea in self._model_config.feature_groups[0].sequence_features: + for seq_att in seq_fea.seq_att_map: + self._feature_num += len(seq_att.hist_seq) + self._seq_key_num += len(seq_att.key) self._model_config = self._model_config.autoint assert isinstance(self._model_config, AutoIntConfig) diff --git a/easy_rec/python/protos/feature_config.proto b/easy_rec/python/protos/feature_config.proto index 18ef12ea1..2a596e3d3 100644 --- a/easy_rec/python/protos/feature_config.proto +++ b/easy_rec/python/protos/feature_config.proto @@ -38,6 +38,15 @@ message FeatureConfig { SequenceFeature = 5; } + enum FieldType { + INT32 = 0; + INT64 = 1; + STRING = 2; + FLOAT = 4; + DOUBLE = 5; + BOOL = 6; + } + optional string feature_name = 1; // input field names: must be included in DatasetConfig.input_fields @@ -94,6 +103,12 @@ message FeatureConfig { // sequence feature combiner optional SequenceCombiner sequence_combiner = 25; + + // sub feature type for sequence feature + optional FeatureType sub_feature_type = 26 [default = IdFeature]; + + // sequence length + optional uint32 sequence_length = 27 [default = 1]; } message FeatureConfigV2 { @@ -105,7 +120,7 @@ message FeatureGroupConfig { repeated string feature_names = 2; optional WideOrDeep wide_deep = 3 [default = DEEP]; - optional SeqAttGroupConfig 
sequence_features = 4; + repeated SeqAttGroupConfig sequence_features = 4; } message SeqAttMap { diff --git a/easy_rec/python/test/train_eval_test.py b/easy_rec/python/test/train_eval_test.py index a3bc6fc92..a3078e090 100644 --- a/easy_rec/python/test/train_eval_test.py +++ b/easy_rec/python/test/train_eval_test.py @@ -549,11 +549,11 @@ def test_sequence_dssm(self): self._test_dir) self.assertTrue(self._success) - # def test_sequence_essm(self): - # self._success = test_utils.test_single_train_eval( - # 'samples/model_config/essm_on_sequence_feature_taobao.config', - # self._test_dir) - # self.assertTrue(self._success) + def test_sequence_esmm(self): + self._success = test_utils.test_single_train_eval( + 'samples/model_config/esmm_on_sequence_feature_taobao.config', + self._test_dir) + self.assertTrue(self._success) def test_sequence_fm(self): self._success = test_utils.test_single_train_eval( @@ -589,6 +589,61 @@ def test_sequence_wide_and_deep(self): self._success = test_utils.test_single_train_eval( 'samples/model_config/wide_and_deep_on_sequence_feature_taobao.config', self._test_dir) + self.assertTrue(self._success) + + def test_numeric_boundary_sequence_dbmtl(self): + self._success = test_utils.test_single_train_eval( + 'samples/model_config/dbmtl_on_numeric_boundary_sequence_feature_taobao.config', + self._test_dir) + self.assertTrue(self._success) + + def test_numeric_hash_bucket_sequence_dbmtl(self): + self._success = test_utils.test_single_train_eval( + 'samples/model_config/dbmtl_on_numeric_hash_bucket_sequence_feature_taobao.config', + self._test_dir) + self.assertTrue(self._success) + + def test_numeric_raw_sequence_dbmtl(self): + self._success = test_utils.test_single_train_eval( + 'samples/model_config/dbmtl_on_numeric_raw_sequence_feature_taobao.config', + self._test_dir) + self.assertTrue(self._success) + + def test_numeric_num_buckets_sequence_dbmtl(self): + self._success = test_utils.test_single_train_eval( + 'samples/model_config/dbmtl_on_numeric_num_buckets_sequence_feature_taobao.config', + self._test_dir) + self.assertTrue(self._success) + + def test_multi_numeric_boundary_sequence_dbmtl(self): + self._success = test_utils.test_single_train_eval( + 'samples/model_config/dbmtl_on_multi_numeric_boundary_sequence_feature_taobao.config', + self._test_dir) + self.assertTrue(self._success) + + def test_multi_numeric_hash_bucket_sequence_dbmtl(self): + self._success = test_utils.test_single_train_eval( + 'samples/model_config/dbmtl_on_multi_numeric_hash_bucket_sequence_feature_taobao.config', + self._test_dir) + self.assertTrue(self._success) + + def test_multi_numeric_raw_sequence_dbmtl(self): + self._success = test_utils.test_single_train_eval( + 'samples/model_config/dbmtl_on_multi_numeric_raw_sequence_feature_taobao.config', + self._test_dir) + self.assertTrue(self._success) + + def test_multi_numeric_num_buckets_sequence_dbmtl(self): + self._success = test_utils.test_single_train_eval( + 'samples/model_config/dbmtl_on_multi_numeric_num_buckets_sequence_feature_taobao.config', + self._test_dir) + self.assertTrue(self._success) + + def test_multi_sequence_dbmtl(self): + self._success = test_utils.test_single_train_eval( + 'samples/model_config/dbmtl_on_multi_sequence_feature_taobao.config', + self._test_dir) + self.assertTrue(self._success) def test_multi_optimizer(self): self._success = test_utils.test_distributed_train_eval( diff --git a/easy_rec/python/utils/config_util.py b/easy_rec/python/utils/config_util.py index bb65bd278..38ce860d4 100644 --- 
a/easy_rec/python/utils/config_util.py +++ b/easy_rec/python/utils/config_util.py @@ -337,7 +337,11 @@ def add_boundaries_to_config(pipeline_config, tables): for feature_config in feature_configs: feature_name = feature_config.input_names[0] if feature_name in feature_boundaries_info: - feature_config.feature_type = feature_config.RawFeature + if feature_config.feature_type != feature_config.SequenceFeature: + logging.info( + 'feature = {0}, type = {1}, will turn to RawFeature.'.format( + feature_name, feature_config.feature_type)) + feature_config.feature_type = feature_config.RawFeature feature_config.hash_bucket_size = 0 feature_config.ClearField('boundaries') feature_config.boundaries.extend(feature_boundaries_info[feature_name]) diff --git a/easy_rec/python/utils/convert_rtp_fg.py b/easy_rec/python/utils/convert_rtp_fg.py index a6e8e1199..7b66520f5 100644 --- a/easy_rec/python/utils/convert_rtp_fg.py +++ b/easy_rec/python/utils/convert_rtp_fg.py @@ -72,7 +72,6 @@ def process_features(feature_type, pipeline_config, embedding_dim, incol_separator, - sub_value_type=None, is_sequence=False): feature_config = FeatureConfig() feature_config.input_names.append(feature_name) @@ -87,7 +86,22 @@ def process_features(feature_type, feature_config.is_cache = True is_multi = feature.get('is_multi', False) # is_seq = feature.get('is_seq', False) - if feature_type == 'id_feature': + if is_sequence: + feature_config.feature_type = feature_config.SequenceFeature + feature_config.embedding_dim = curr_embed_dim + if feature_type == 'raw_feature': + feature_config.sub_feature_type = feature_config.RawFeature + input_field.default_val = feature.get('default_value', '0.0') + raw_input_dim = feature.get('value_dimension', 1) + if 'boundaries' in feature: + feature_config.boundaries.extend(feature['boundaries']) + if raw_input_dim > 1: + feature_config.raw_input_dim = raw_input_dim + else: + feature_config.sub_feature_type = feature_config.IdFeature + _set_hash_bucket(feature, feature_config, input_field) + feature_config.combiner = curr_combiner + elif feature_type == 'id_feature': if is_multi: feature_config.feature_type = feature_config.TagFeature kv_separator = feature.get('kv_separator', None) @@ -154,8 +168,6 @@ def process_features(feature_type, if 'shared_name' in feature: feature_config.embedding_name = feature['shared_name'] # pipeline_config.feature_configs.append(feature_config) - if is_sequence: - feature_config.feature_type = feature_config.SequenceFeature if pipeline_config.feature_configs: pipeline_config.feature_configs.append(feature_config) else: @@ -229,9 +241,6 @@ def load_input_field_and_feature_config(rtp_fg, for sub_feature in feature['features']: sub_feature_type = sub_feature['feature_type'] sub_feature_name = sub_feature['feature_name'] - sub_value_type = None - if 'value_type' in sub_feature: - sub_value_type = sub_feature['value_type'] all_sub_feature_name = sequence_name + '_' + sub_feature_name pipeline_config = process_features( sub_feature_type, @@ -240,7 +249,6 @@ def load_input_field_and_feature_config(rtp_fg, pipeline_config, embedding_dim, incol_separator, - sub_value_type, is_sequence=True) except Exception as ex: print('Exception: %s %s' % (type(ex), str(ex))) diff --git a/pre-commit b/pre-commit index 964bd792e..c713d226a 100755 --- a/pre-commit +++ b/pre-commit @@ -29,3 +29,6 @@ if [ $result -eq 0 ];then else exit 1 fi +pwd + +python scripts/pre-commit diff --git a/samples/model_config/dbmtl_on_multi_numeric_boundary_sequence_feature_taobao.config 
b/samples/model_config/dbmtl_on_multi_numeric_boundary_sequence_feature_taobao.config new file mode 100644 index 000000000..26a9a615b --- /dev/null +++ b/samples/model_config/dbmtl_on_multi_numeric_boundary_sequence_feature_taobao.config @@ -0,0 +1,346 @@ +train_input_path: "data/test/tb_data/taobao_multi_seq_train_data" +eval_input_path: "data/test/tb_data/taobao_multi_seq_test_data" +model_dir: "experiments/dbmtl_taobao_ckpt" + +train_config { + optimizer_config { + adam_optimizer { + learning_rate { + exponential_decay_learning_rate { + initial_learning_rate: 0.001 + decay_steps: 1000 + decay_factor: 0.5 + min_learning_rate: 1e-07 + } + } + } + use_moving_average: false + } + num_steps: 5000 + sync_replicas: true + save_checkpoints_steps: 100 + log_step_count_steps: 100 +} +eval_config { + metrics_set { + auc { + } + } +} +data_config { + batch_size: 4096 + label_fields: "clk" + label_fields: "buy" + prefetch_size: 1 + input_type: CSVInput + input_fields { + input_name: "clk" + input_type: INT32 + } + input_fields { + input_name: "buy" + input_type: INT32 + } + input_fields { + input_name: "pid" + input_type: STRING + } + input_fields { + input_name: "adgroup_id" + input_type: STRING + } + input_fields { + input_name: "cate_id" + input_type: STRING + } + input_fields { + input_name: "campaign_id" + input_type: STRING + } + input_fields { + input_name: "customer" + input_type: STRING + } + input_fields { + input_name: "brand" + input_type: STRING + } + input_fields { + input_name: "user_id" + input_type: STRING + } + input_fields { + input_name: "cms_segid" + input_type: STRING + } + input_fields { + input_name: "cms_group_id" + input_type: STRING + } + input_fields { + input_name: "final_gender_code" + input_type: STRING + } + input_fields { + input_name: "age_level" + input_type: STRING + } + input_fields { + input_name: "pvalue_level" + input_type: STRING + } + input_fields { + input_name: "shopping_level" + input_type: STRING + } + input_fields { + input_name: "occupation" + input_type: STRING + } + input_fields { + input_name: "new_user_class_level" + input_type: STRING + } + input_fields { + input_name: "tag_category_list" + input_type: STRING + default_val: "0" + } + input_fields { + input_name: "tag_brand_list" + input_type: STRING + default_val: "0" + } + input_fields { + input_name: "price" + input_type: INT32 + } +} +feature_configs { + input_names: "pid" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "adgroup_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "cate_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10000 +} +feature_configs { + input_names: "campaign_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "customer" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "brand" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "user_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "cms_segid" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 +} +feature_configs { + input_names: "cms_group_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 +} +feature_configs { + input_names: "final_gender_code" + feature_type: IdFeature + embedding_dim: 16 + 
hash_bucket_size: 10 +} +feature_configs { + input_names: "age_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "pvalue_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "shopping_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "occupation" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "new_user_class_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "tag_category_list" + feature_type: SequenceFeature + embedding_dim: 16 + boundaries: 15.0 + boundaries: 20.0 + boundaries: 21.0 + boundaries: 23.0 + boundaries: 30.0 + boundaries: 32.0 + boundaries: 40.0 + boundaries: 47.0 + boundaries: 66.0 + boundaries: 70.0 + boundaries: 77.0 + boundaries: 87.0 + boundaries: 99.0 + boundaries: 120.0 + boundaries: 148.0 + boundaries: 188.0 + boundaries: 199.0 + boundaries: 235.0 + boundaries: 301.0 + boundaries: 443.0 + boundaries: 597.0 + boundaries: 1314.0 + sub_feature_type: RawFeature + sequence_length: 300 + separator: "|" + seq_multi_sep: ";" +} +feature_configs { + input_names: "tag_brand_list" + feature_type: SequenceFeature + embedding_dim: 16 + boundaries: 15.0 + boundaries: 20.0 + boundaries: 21.0 + boundaries: 23.0 + boundaries: 30.0 + boundaries: 32.0 + boundaries: 40.0 + boundaries: 47.0 + boundaries: 66.0 + boundaries: 70.0 + boundaries: 77.0 + boundaries: 87.0 + boundaries: 99.0 + boundaries: 120.0 + boundaries: 148.0 + boundaries: 188.0 + boundaries: 199.0 + boundaries: 235.0 + boundaries: 301.0 + boundaries: 443.0 + boundaries: 597.0 + boundaries: 1314.0 + sub_feature_type: RawFeature + sequence_length: 300 + separator: "|" + seq_multi_sep: ";" +} +feature_configs { + input_names: "price" + feature_type: IdFeature + embedding_dim: 16 + num_buckets: 50 +} +model_config { + model_class: "DBMTL" + feature_groups { + group_name: "all" + feature_names: "user_id" + feature_names: "cms_segid" + feature_names: "cms_group_id" + feature_names: "age_level" + feature_names: "pvalue_level" + feature_names: "shopping_level" + feature_names: "occupation" + feature_names: "new_user_class_level" + feature_names: "adgroup_id" + feature_names: "cate_id" + feature_names: "campaign_id" + feature_names: "customer" + feature_names: "brand" + feature_names: "price" + feature_names: "pid" + wide_deep: DEEP + sequence_features: { + group_name: "seq_fea" + tf_summary: false + seq_att_map: { + key: "brand" + key: "cate_id" + hist_seq: "tag_brand_list" + hist_seq: "tag_category_list" + } + } + } + dbmtl { + bottom_dnn { + hidden_units: [1024, 512, 256] + } + task_towers { + tower_name: "ctr" + label_name: "clk" + loss_type: CLASSIFICATION + metrics_set: { + auc {} + } + dnn { + hidden_units: [256, 128, 64, 32] + } + relation_dnn { + hidden_units: [32] + } + weight: 1.0 + } + task_towers { + tower_name: "cvr" + label_name: "buy" + loss_type: CLASSIFICATION + metrics_set: { + auc {} + } + dnn { + hidden_units: [256, 128, 64, 32] + } + relation_tower_names: ["ctr"] + relation_dnn { + hidden_units: [32] + } + weight: 1.0 + } + l2_regularization: 1e-6 + } + embedding_regularization: 5e-6 +} diff --git a/samples/model_config/dbmtl_on_multi_numeric_hash_bucket_sequence_feature_taobao.config b/samples/model_config/dbmtl_on_multi_numeric_hash_bucket_sequence_feature_taobao.config new file mode 100644 index 
000000000..90a26ce93 --- /dev/null +++ b/samples/model_config/dbmtl_on_multi_numeric_hash_bucket_sequence_feature_taobao.config @@ -0,0 +1,302 @@ +train_input_path: "data/test/tb_data/taobao_multi_seq_train_data" +eval_input_path: "data/test/tb_data/taobao_multi_seq_test_data" +model_dir: "experiments/dbmtl_taobao_ckpt" + +train_config { + optimizer_config { + adam_optimizer { + learning_rate { + exponential_decay_learning_rate { + initial_learning_rate: 0.001 + decay_steps: 1000 + decay_factor: 0.5 + min_learning_rate: 1e-07 + } + } + } + use_moving_average: false + } + num_steps: 5000 + sync_replicas: true + save_checkpoints_steps: 100 + log_step_count_steps: 100 +} +eval_config { + metrics_set { + auc { + } + } +} +data_config { + batch_size: 4 + label_fields: "clk" + label_fields: "buy" + prefetch_size: 1 + input_type: CSVInput + input_fields { + input_name: "clk" + input_type: INT32 + } + input_fields { + input_name: "buy" + input_type: INT32 + } + input_fields { + input_name: "pid" + input_type: STRING + } + input_fields { + input_name: "adgroup_id" + input_type: STRING + } + input_fields { + input_name: "cate_id" + input_type: STRING + } + input_fields { + input_name: "campaign_id" + input_type: STRING + } + input_fields { + input_name: "customer" + input_type: STRING + } + input_fields { + input_name: "brand" + input_type: STRING + } + input_fields { + input_name: "user_id" + input_type: STRING + } + input_fields { + input_name: "cms_segid" + input_type: STRING + } + input_fields { + input_name: "cms_group_id" + input_type: STRING + } + input_fields { + input_name: "final_gender_code" + input_type: STRING + } + input_fields { + input_name: "age_level" + input_type: STRING + } + input_fields { + input_name: "pvalue_level" + input_type: STRING + } + input_fields { + input_name: "shopping_level" + input_type: STRING + } + input_fields { + input_name: "occupation" + input_type: STRING + } + input_fields { + input_name: "new_user_class_level" + input_type: STRING + } + input_fields { + input_name: "tag_category_list" + input_type: STRING + default_val: "0" + } + input_fields { + input_name: "tag_brand_list" + input_type: STRING + default_val: "0" + } + input_fields { + input_name: "price" + input_type: INT32 + } +} +feature_configs { + input_names: "pid" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "adgroup_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "cate_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10000 +} +feature_configs { + input_names: "campaign_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "customer" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "brand" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "user_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "cms_segid" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 +} +feature_configs { + input_names: "cms_group_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 +} +feature_configs { + input_names: "final_gender_code" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "age_level" + feature_type: IdFeature + embedding_dim: 16 
+ hash_bucket_size: 10 +} +feature_configs { + input_names: "pvalue_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "shopping_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "occupation" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "new_user_class_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "tag_category_list" + feature_type: SequenceFeature + embedding_dim: 16 + hash_bucket_size: 100000 + sub_feature_type: IdFeature + separator: "|" + seq_multi_sep: ";" +} +feature_configs { + input_names: "tag_brand_list" + feature_type: SequenceFeature + embedding_dim: 16 + hash_bucket_size: 100000 + sub_feature_type: IdFeature + separator: "|" + seq_multi_sep: ";" +} +feature_configs { + input_names: "price" + feature_type: IdFeature + embedding_dim: 16 + num_buckets: 50 +} +model_config { + model_class: "DBMTL" + feature_groups { + group_name: "all" + feature_names: "user_id" + feature_names: "cms_segid" + feature_names: "cms_group_id" + feature_names: "age_level" + feature_names: "pvalue_level" + feature_names: "shopping_level" + feature_names: "occupation" + feature_names: "new_user_class_level" + feature_names: "adgroup_id" + feature_names: "cate_id" + feature_names: "campaign_id" + feature_names: "customer" + feature_names: "brand" + feature_names: "price" + feature_names: "pid" + wide_deep: DEEP + sequence_features: { + group_name: "seq_fea" + tf_summary: false + seq_att_map: { + key: "brand" + key: "cate_id" + hist_seq: "tag_brand_list" + hist_seq: "tag_category_list" + } + } + } + dbmtl { + bottom_dnn { + hidden_units: [1024, 512, 256] + } + task_towers { + tower_name: "ctr" + label_name: "clk" + loss_type: CLASSIFICATION + metrics_set: { + auc {} + } + dnn { + hidden_units: [256, 128, 64, 32] + } + relation_dnn { + hidden_units: [32] + } + weight: 1.0 + } + task_towers { + tower_name: "cvr" + label_name: "buy" + loss_type: CLASSIFICATION + metrics_set: { + auc {} + } + dnn { + hidden_units: [256, 128, 64, 32] + } + relation_tower_names: ["ctr"] + relation_dnn { + hidden_units: [32] + } + weight: 1.0 + } + l2_regularization: 1e-6 + } + embedding_regularization: 5e-6 +} diff --git a/samples/model_config/dbmtl_on_multi_numeric_num_buckets_sequence_feature_taobao.config b/samples/model_config/dbmtl_on_multi_numeric_num_buckets_sequence_feature_taobao.config new file mode 100644 index 000000000..1d9c6852f --- /dev/null +++ b/samples/model_config/dbmtl_on_multi_numeric_num_buckets_sequence_feature_taobao.config @@ -0,0 +1,308 @@ +train_input_path: "data/test/tb_data/taobao_multi_seq_train_data" +eval_input_path: "data/test/tb_data/taobao_multi_seq_test_data" +model_dir: "experiments/dbmtl_taobao_ckpt" + +train_config { + optimizer_config { + adam_optimizer { + learning_rate { + exponential_decay_learning_rate { + initial_learning_rate: 0.001 + decay_steps: 1000 + decay_factor: 0.5 + min_learning_rate: 1e-07 + } + } + } + use_moving_average: false + } + num_steps: 5000 + sync_replicas: true + save_checkpoints_steps: 100 + log_step_count_steps: 100 +} +eval_config { + metrics_set { + auc { + } + } +} +data_config { + batch_size: 4096 + label_fields: "clk" + label_fields: "buy" + prefetch_size: 1 + input_type: CSVInput + input_fields { + input_name: "clk" + input_type: INT32 + } + input_fields { + input_name: "buy" + input_type: INT32 + } + 
input_fields { + input_name: "pid" + input_type: STRING + } + input_fields { + input_name: "adgroup_id" + input_type: STRING + } + input_fields { + input_name: "cate_id" + input_type: STRING + } + input_fields { + input_name: "campaign_id" + input_type: STRING + } + input_fields { + input_name: "customer" + input_type: STRING + } + input_fields { + input_name: "brand" + input_type: STRING + } + input_fields { + input_name: "user_id" + input_type: STRING + } + input_fields { + input_name: "cms_segid" + input_type: STRING + } + input_fields { + input_name: "cms_group_id" + input_type: STRING + } + input_fields { + input_name: "final_gender_code" + input_type: STRING + } + input_fields { + input_name: "age_level" + input_type: STRING + } + input_fields { + input_name: "pvalue_level" + input_type: STRING + } + input_fields { + input_name: "shopping_level" + input_type: STRING + } + input_fields { + input_name: "occupation" + input_type: STRING + } + input_fields { + input_name: "new_user_class_level" + input_type: STRING + } + input_fields { + input_name: "tag_category_list" + input_type: STRING + default_val: "0" + } + input_fields { + input_name: "tag_brand_list" + input_type: STRING + default_val: "0" + } + input_fields { + input_name: "price" + input_type: INT32 + } +} +feature_configs { + input_names: "pid" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "adgroup_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "cate_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10000 +} +feature_configs { + input_names: "campaign_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "customer" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "brand" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "user_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "cms_segid" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 +} +feature_configs { + input_names: "cms_group_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 +} +feature_configs { + input_names: "final_gender_code" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "age_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "pvalue_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "shopping_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "occupation" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "new_user_class_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "tag_category_list" + feature_type: SequenceFeature + embedding_dim: 16 + num_buckets: 15 + max_val: 100000 + min_val: 0 + sub_feature_type: RawFeature + sequence_length: 300 + separator: "|" + seq_multi_sep: ";" +} +feature_configs { + input_names: "tag_brand_list" + feature_type: SequenceFeature + embedding_dim: 16 + num_buckets: 15 + max_val: 100000 + min_val: 0 + sub_feature_type: RawFeature + 
sequence_length: 300 + separator: "|" + seq_multi_sep: ";" +} +feature_configs { + input_names: "price" + feature_type: IdFeature + embedding_dim: 16 + num_buckets: 50 +} +model_config { + model_class: "DBMTL" + feature_groups { + group_name: "all" + feature_names: "user_id" + feature_names: "cms_segid" + feature_names: "cms_group_id" + feature_names: "age_level" + feature_names: "pvalue_level" + feature_names: "shopping_level" + feature_names: "occupation" + feature_names: "new_user_class_level" + feature_names: "adgroup_id" + feature_names: "cate_id" + feature_names: "campaign_id" + feature_names: "customer" + feature_names: "brand" + feature_names: "price" + feature_names: "pid" + wide_deep: DEEP + sequence_features: { + group_name: "seq_fea" + tf_summary: false + seq_att_map: { + key: "brand" + key: "cate_id" + hist_seq: "tag_brand_list" + hist_seq: "tag_category_list" + } + } + } + dbmtl { + bottom_dnn { + hidden_units: [1024, 512, 256] + } + task_towers { + tower_name: "ctr" + label_name: "clk" + loss_type: CLASSIFICATION + metrics_set: { + auc {} + } + dnn { + hidden_units: [256, 128, 64, 32] + } + relation_dnn { + hidden_units: [32] + } + weight: 1.0 + } + task_towers { + tower_name: "cvr" + label_name: "buy" + loss_type: CLASSIFICATION + metrics_set: { + auc {} + } + dnn { + hidden_units: [256, 128, 64, 32] + } + relation_tower_names: ["ctr"] + relation_dnn { + hidden_units: [32] + } + weight: 1.0 + } + l2_regularization: 1e-6 + } + embedding_regularization: 5e-6 +} diff --git a/samples/model_config/dbmtl_on_multi_numeric_raw_sequence_feature_taobao.config b/samples/model_config/dbmtl_on_multi_numeric_raw_sequence_feature_taobao.config new file mode 100644 index 000000000..c2d005f8b --- /dev/null +++ b/samples/model_config/dbmtl_on_multi_numeric_raw_sequence_feature_taobao.config @@ -0,0 +1,304 @@ +train_input_path: "data/test/tb_data/taobao_multi_seq_train_data" +eval_input_path: "data/test/tb_data/taobao_multi_seq_train_data" +model_dir: "experiments/dbmtl_taobao_ckpt" + +train_config { + optimizer_config { + adam_optimizer { + learning_rate { + exponential_decay_learning_rate { + initial_learning_rate: 0.001 + decay_steps: 1000 + decay_factor: 0.5 + min_learning_rate: 1e-07 + } + } + } + use_moving_average: false + } + num_steps: 5000 + sync_replicas: true + save_checkpoints_steps: 100 + log_step_count_steps: 100 +} +eval_config { + metrics_set { + auc { + } + } +} +data_config { + batch_size: 4 + label_fields: "clk" + label_fields: "buy" + prefetch_size: 1 + input_type: CSVInput + input_fields { + input_name: "clk" + input_type: INT32 + } + input_fields { + input_name: "buy" + input_type: INT32 + } + input_fields { + input_name: "pid" + input_type: STRING + } + input_fields { + input_name: "adgroup_id" + input_type: STRING + } + input_fields { + input_name: "cate_id" + input_type: STRING + } + input_fields { + input_name: "campaign_id" + input_type: STRING + } + input_fields { + input_name: "customer" + input_type: STRING + } + input_fields { + input_name: "brand" + input_type: STRING + } + input_fields { + input_name: "user_id" + input_type: STRING + } + input_fields { + input_name: "cms_segid" + input_type: STRING + } + input_fields { + input_name: "cms_group_id" + input_type: STRING + } + input_fields { + input_name: "final_gender_code" + input_type: STRING + } + input_fields { + input_name: "age_level" + input_type: STRING + } + input_fields { + input_name: "pvalue_level" + input_type: STRING + } + input_fields { + input_name: "shopping_level" + input_type: STRING + } + 
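The num_buckets variant above (dbmtl_on_multi_numeric_num_buckets_sequence_feature_taobao.config) discretizes raw sequence values with num_buckets: 15 over [min_val, max_val] instead of hashing. One plausible reading of that mapping, sketched in Python; the exact formula EasyRec applies is not shown in this patch, so treat this only as the generic min-max bucketing idea:

```python
def bucketize(x, num_buckets=15, min_val=0.0, max_val=100000.0):
    # Clip into the configured range, normalize, and spread over the buckets.
    x = min(max(x, min_val), max_val)
    frac = (x - min_val) / (max_val - min_val)
    return min(int(frac * num_buckets), num_buckets - 1)

print([bucketize(v) for v in (0.0, 4999.0, 99999.0)])  # [0, 0, 14]
```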
input_fields { + input_name: "occupation" + input_type: STRING + } + input_fields { + input_name: "new_user_class_level" + input_type: STRING + } + input_fields { + input_name: "tag_category_list" + input_type: STRING + default_val: "0" + } + input_fields { + input_name: "tag_brand_list" + input_type: STRING + default_val: "0" + } + input_fields { + input_name: "price" + input_type: INT32 + } +} +feature_configs { + input_names: "pid" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "adgroup_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "cate_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10000 +} +feature_configs { + input_names: "campaign_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "customer" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "brand" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "user_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "cms_segid" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 +} +feature_configs { + input_names: "cms_group_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 +} +feature_configs { + input_names: "final_gender_code" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "age_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "pvalue_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "shopping_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "occupation" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "new_user_class_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "tag_category_list" + feature_type: SequenceFeature + sub_feature_type: RawFeature + sequence_length:50 + embedding_dim: 16 + raw_input_dim: 4 + separator: "|" + seq_multi_sep: ";" +} +feature_configs { + input_names: "tag_brand_list" + feature_type: SequenceFeature + sub_feature_type: RawFeature + sequence_length:50 + embedding_dim: 16 + raw_input_dim: 4 + separator: "|" + seq_multi_sep: ";" +} +feature_configs { + input_names: "price" + feature_type: IdFeature + embedding_dim: 16 + num_buckets: 50 +} +model_config { + model_class: "DBMTL" + feature_groups { + group_name: "all" + feature_names: "user_id" + feature_names: "cms_segid" + feature_names: "cms_group_id" + feature_names: "age_level" + feature_names: "pvalue_level" + feature_names: "shopping_level" + feature_names: "occupation" + feature_names: "new_user_class_level" + feature_names: "adgroup_id" + feature_names: "cate_id" + feature_names: "campaign_id" + feature_names: "customer" + feature_names: "brand" + feature_names: "price" + feature_names: "pid" + wide_deep: DEEP + sequence_features: { + group_name: "seq_fea" + tf_summary: false + seq_att_map: { + key: "brand" + key: "cate_id" + hist_seq: "tag_brand_list" + hist_seq: "tag_category_list" + } + } + } + dbmtl { + bottom_dnn { + hidden_units: [1024, 512, 256] + } + 
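raw_input_dim: 4 in dbmtl_on_multi_numeric_raw_sequence_feature_taobao.config above declares that each sequence step carries four raw float values rather than a single one. Under the same assumed separator layout as before (hypothetical data; "|" between steps, ";" between the values of one step), a quick shape check:

```python
import numpy as np

raw = "0.1;0.2;0.3;0.4|1.1;1.2;1.3;1.4"  # hypothetical field: 2 steps x 4 dims
seq = np.array([[float(v) for v in step.split(";")]
                for step in raw.split("|")])
print(seq.shape)  # (2, 4), i.e. (steps, raw_input_dim) before any padding
```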
task_towers { + tower_name: "ctr" + label_name: "clk" + loss_type: CLASSIFICATION + metrics_set: { + auc {} + } + dnn { + hidden_units: [256, 128, 64, 32] + } + relation_dnn { + hidden_units: [32] + } + weight: 1.0 + } + task_towers { + tower_name: "cvr" + label_name: "buy" + loss_type: CLASSIFICATION + metrics_set: { + auc {} + } + dnn { + hidden_units: [256, 128, 64, 32] + } + relation_tower_names: ["ctr"] + relation_dnn { + hidden_units: [32] + } + weight: 1.0 + } + l2_regularization: 1e-6 + } + embedding_regularization: 5e-6 +} diff --git a/samples/model_config/dbmtl_on_multi_sequence_feature_taobao.config b/samples/model_config/dbmtl_on_multi_sequence_feature_taobao.config new file mode 100644 index 000000000..33a213fc8 --- /dev/null +++ b/samples/model_config/dbmtl_on_multi_sequence_feature_taobao.config @@ -0,0 +1,311 @@ +train_input_path: "data/test/tb_data/taobao_train_data" +eval_input_path: "data/test/tb_data/taobao_test_data" +model_dir: "experiments/dbmtl_taobao_ckpt" + +train_config { + optimizer_config { + adam_optimizer { + learning_rate { + exponential_decay_learning_rate { + initial_learning_rate: 0.001 + decay_steps: 1000 + decay_factor: 0.5 + min_learning_rate: 1e-07 + } + } + } + use_moving_average: false + } + num_steps: 5000 + sync_replicas: true + save_checkpoints_steps: 100 + log_step_count_steps: 100 +} +eval_config { + metrics_set { + auc { + } + } +} +data_config { + batch_size: 4096 + label_fields: "clk" + label_fields: "buy" + prefetch_size: 32 + input_type: CSVInput + input_fields { + input_name: "clk" + input_type: INT32 + } + input_fields { + input_name: "buy" + input_type: INT32 + } + input_fields { + input_name: "pid" + input_type: STRING + } + input_fields { + input_name: "adgroup_id" + input_type: STRING + } + input_fields { + input_name: "cate_id" + input_type: STRING + } + input_fields { + input_name: "campaign_id" + input_type: STRING + } + input_fields { + input_name: "customer" + input_type: STRING + } + input_fields { + input_name: "brand" + input_type: STRING + } + input_fields { + input_name: "user_id" + input_type: STRING + } + input_fields { + input_name: "cms_segid" + input_type: STRING + } + input_fields { + input_name: "cms_group_id" + input_type: STRING + } + input_fields { + input_name: "final_gender_code" + input_type: STRING + } + input_fields { + input_name: "age_level" + input_type: STRING + } + input_fields { + input_name: "pvalue_level" + input_type: STRING + } + input_fields { + input_name: "shopping_level" + input_type: STRING + } + input_fields { + input_name: "occupation" + input_type: STRING + } + input_fields { + input_name: "new_user_class_level" + input_type: STRING + } + input_fields { + input_name: "tag_category_list" + input_type: STRING + } + input_fields { + input_name: "tag_brand_list" + input_type: STRING + } + input_fields { + input_name: "price" + input_type: INT32 + } +} +feature_configs { + input_names: "pid" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "adgroup_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "cate_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10000 +} +feature_configs { + input_names: "campaign_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "customer" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "brand" + feature_type: 
IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "user_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "cms_segid" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 +} +feature_configs { + input_names: "cms_group_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 +} +feature_configs { + input_names: "final_gender_code" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "age_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "pvalue_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "shopping_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "occupation" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "new_user_class_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "tag_category_list" + feature_type: SequenceFeature + embedding_dim: 16 + hash_bucket_size: 100000 + separator: "|" +} +feature_configs { + input_names: "tag_brand_list" + feature_type: SequenceFeature + embedding_dim: 16 + hash_bucket_size: 100000 + separator: "|" +} +feature_configs { + input_names: "price" + feature_type: IdFeature + embedding_dim: 16 + num_buckets: 50 +} +model_config { + model_class: "DBMTL" + feature_groups { + group_name: "all" + feature_names: "user_id" + feature_names: "cms_segid" + feature_names: "cms_group_id" + feature_names: "age_level" + feature_names: "pvalue_level" + feature_names: "shopping_level" + feature_names: "occupation" + feature_names: "new_user_class_level" + feature_names: "adgroup_id" + feature_names: "cate_id" + feature_names: "campaign_id" + feature_names: "customer" + feature_names: "brand" + feature_names: "price" + feature_names: "pid" + wide_deep: DEEP + sequence_features: { + group_name: "seq_fea_1" + tf_summary: false + seq_att_map: { + key: "brand" + hist_seq: "tag_brand_list" + } + seq_att_map: { + key: "brand" + hist_seq: "tag_brand_list" + } + } + + sequence_features: { + group_name: "seq_fea_2" + tf_summary: false + seq_att_map: { + key: "cate_id" + hist_seq: "tag_category_list" + } + seq_att_map: { + key: "cate_id" + hist_seq: "tag_category_list" + } + } + } + dbmtl { + bottom_dnn { + hidden_units: [1024, 512, 256] + } + task_towers { + tower_name: "ctr" + label_name: "clk" + loss_type: CLASSIFICATION + metrics_set: { + auc {} + } + dnn { + hidden_units: [256, 128, 64, 32] + } + relation_dnn { + hidden_units: [32] + } + weight: 1.0 + } + task_towers { + tower_name: "cvr" + label_name: "buy" + loss_type: CLASSIFICATION + metrics_set: { + auc {} + } + dnn { + hidden_units: [256, 128, 64, 32] + } + relation_tower_names: ["ctr"] + relation_dnn { + hidden_units: [32] + } + weight: 1.0 + } + l2_regularization: 1e-6 + } + embedding_regularization: 5e-6 +} diff --git a/samples/model_config/dbmtl_on_numeric_boundary_sequence_feature_taobao.config b/samples/model_config/dbmtl_on_numeric_boundary_sequence_feature_taobao.config new file mode 100644 index 000000000..5303a7af4 --- /dev/null +++ b/samples/model_config/dbmtl_on_numeric_boundary_sequence_feature_taobao.config @@ -0,0 +1,344 @@ +train_input_path: "data/test/tb_data/taobao_train_data" +eval_input_path: 
"data/test/tb_data/taobao_test_data" +model_dir: "experiments/dbmtl_taobao_ckpt" + +train_config { + optimizer_config { + adam_optimizer { + learning_rate { + exponential_decay_learning_rate { + initial_learning_rate: 0.001 + decay_steps: 1000 + decay_factor: 0.5 + min_learning_rate: 1e-07 + } + } + } + use_moving_average: false + } + num_steps: 5000 + sync_replicas: true + save_checkpoints_steps: 100 + log_step_count_steps: 100 +} +eval_config { + metrics_set { + auc { + } + } +} +data_config { + batch_size: 4096 + label_fields: "clk" + label_fields: "buy" + prefetch_size: 1 + input_type: CSVInput + input_fields { + input_name: "clk" + input_type: INT32 + } + input_fields { + input_name: "buy" + input_type: INT32 + } + input_fields { + input_name: "pid" + input_type: STRING + } + input_fields { + input_name: "adgroup_id" + input_type: STRING + } + input_fields { + input_name: "cate_id" + input_type: STRING + } + input_fields { + input_name: "campaign_id" + input_type: STRING + } + input_fields { + input_name: "customer" + input_type: STRING + } + input_fields { + input_name: "brand" + input_type: STRING + } + input_fields { + input_name: "user_id" + input_type: STRING + } + input_fields { + input_name: "cms_segid" + input_type: STRING + } + input_fields { + input_name: "cms_group_id" + input_type: STRING + } + input_fields { + input_name: "final_gender_code" + input_type: STRING + } + input_fields { + input_name: "age_level" + input_type: STRING + } + input_fields { + input_name: "pvalue_level" + input_type: STRING + } + input_fields { + input_name: "shopping_level" + input_type: STRING + } + input_fields { + input_name: "occupation" + input_type: STRING + } + input_fields { + input_name: "new_user_class_level" + input_type: STRING + } + input_fields { + input_name: "tag_category_list" + input_type: STRING + default_val: "0" + } + input_fields { + input_name: "tag_brand_list" + input_type: STRING + default_val: "0" + } + input_fields { + input_name: "price" + input_type: INT32 + } +} +feature_configs { + input_names: "pid" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "adgroup_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "cate_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10000 +} +feature_configs { + input_names: "campaign_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "customer" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "brand" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "user_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "cms_segid" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 +} +feature_configs { + input_names: "cms_group_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 +} +feature_configs { + input_names: "final_gender_code" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "age_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "pvalue_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "shopping_level" + feature_type: IdFeature + 
embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "occupation" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "new_user_class_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "tag_category_list" + feature_type: SequenceFeature + embedding_dim: 16 + boundaries: 15.0 + boundaries: 20.0 + boundaries: 21.0 + boundaries: 23.0 + boundaries: 30.0 + boundaries: 32.0 + boundaries: 40.0 + boundaries: 47.0 + boundaries: 66.0 + boundaries: 70.0 + boundaries: 77.0 + boundaries: 87.0 + boundaries: 99.0 + boundaries: 120.0 + boundaries: 148.0 + boundaries: 188.0 + boundaries: 199.0 + boundaries: 235.0 + boundaries: 301.0 + boundaries: 443.0 + boundaries: 597.0 + boundaries: 1314.0 + sub_feature_type: RawFeature + sequence_length: 300 + separator: "|" +} +feature_configs { + input_names: "tag_brand_list" + feature_type: SequenceFeature + embedding_dim: 16 + boundaries: 15.0 + boundaries: 20.0 + boundaries: 21.0 + boundaries: 23.0 + boundaries: 30.0 + boundaries: 32.0 + boundaries: 40.0 + boundaries: 47.0 + boundaries: 66.0 + boundaries: 70.0 + boundaries: 77.0 + boundaries: 87.0 + boundaries: 99.0 + boundaries: 120.0 + boundaries: 148.0 + boundaries: 188.0 + boundaries: 199.0 + boundaries: 235.0 + boundaries: 301.0 + boundaries: 443.0 + boundaries: 597.0 + boundaries: 1314.0 + sub_feature_type: RawFeature + sequence_length: 300 + separator: "|" +} +feature_configs { + input_names: "price" + feature_type: IdFeature + embedding_dim: 16 + num_buckets: 50 +} +model_config { + model_class: "DBMTL" + feature_groups { + group_name: "all" + feature_names: "user_id" + feature_names: "cms_segid" + feature_names: "cms_group_id" + feature_names: "age_level" + feature_names: "pvalue_level" + feature_names: "shopping_level" + feature_names: "occupation" + feature_names: "new_user_class_level" + feature_names: "adgroup_id" + feature_names: "cate_id" + feature_names: "campaign_id" + feature_names: "customer" + feature_names: "brand" + feature_names: "price" + feature_names: "pid" + wide_deep: DEEP + sequence_features: { + group_name: "seq_fea" + tf_summary: false + seq_att_map: { + key: "brand" + key: "cate_id" + hist_seq: "tag_brand_list" + hist_seq: "tag_category_list" + } + } + } + dbmtl { + bottom_dnn { + hidden_units: [1024, 512, 256] + } + task_towers { + tower_name: "ctr" + label_name: "clk" + loss_type: CLASSIFICATION + metrics_set: { + auc {} + } + dnn { + hidden_units: [256, 128, 64, 32] + } + relation_dnn { + hidden_units: [32] + } + weight: 1.0 + } + task_towers { + tower_name: "cvr" + label_name: "buy" + loss_type: CLASSIFICATION + metrics_set: { + auc {} + } + dnn { + hidden_units: [256, 128, 64, 32] + } + relation_tower_names: ["ctr"] + relation_dnn { + hidden_units: [32] + } + weight: 1.0 + } + l2_regularization: 1e-6 + } + embedding_regularization: 5e-6 +} diff --git a/samples/model_config/dbmtl_on_numeric_hash_bucket_sequence_feature_taobao.config b/samples/model_config/dbmtl_on_numeric_hash_bucket_sequence_feature_taobao.config new file mode 100644 index 000000000..83e682c0b --- /dev/null +++ b/samples/model_config/dbmtl_on_numeric_hash_bucket_sequence_feature_taobao.config @@ -0,0 +1,300 @@ +train_input_path: "data/test/tb_data/taobao_train_data" +eval_input_path: "data/test/tb_data/taobao_test_data" +model_dir: "experiments/dbmtl_taobao_ckpt" + +train_config { + optimizer_config { + adam_optimizer { + learning_rate { + exponential_decay_learning_rate { 
+ initial_learning_rate: 0.001 + decay_steps: 1000 + decay_factor: 0.5 + min_learning_rate: 1e-07 + } + } + } + use_moving_average: false + } + num_steps: 5000 + sync_replicas: true + save_checkpoints_steps: 100 + log_step_count_steps: 100 +} +eval_config { + metrics_set { + auc { + } + } +} +data_config { + batch_size: 4096 + label_fields: "clk" + label_fields: "buy" + prefetch_size: 1 + input_type: CSVInput + input_fields { + input_name: "clk" + input_type: INT32 + } + input_fields { + input_name: "buy" + input_type: INT32 + } + input_fields { + input_name: "pid" + input_type: STRING + } + input_fields { + input_name: "adgroup_id" + input_type: STRING + } + input_fields { + input_name: "cate_id" + input_type: STRING + } + input_fields { + input_name: "campaign_id" + input_type: STRING + } + input_fields { + input_name: "customer" + input_type: STRING + } + input_fields { + input_name: "brand" + input_type: STRING + } + input_fields { + input_name: "user_id" + input_type: STRING + } + input_fields { + input_name: "cms_segid" + input_type: STRING + } + input_fields { + input_name: "cms_group_id" + input_type: STRING + } + input_fields { + input_name: "final_gender_code" + input_type: STRING + } + input_fields { + input_name: "age_level" + input_type: STRING + } + input_fields { + input_name: "pvalue_level" + input_type: STRING + } + input_fields { + input_name: "shopping_level" + input_type: STRING + } + input_fields { + input_name: "occupation" + input_type: STRING + } + input_fields { + input_name: "new_user_class_level" + input_type: STRING + } + input_fields { + input_name: "tag_category_list" + input_type: STRING + default_val: "0" + } + input_fields { + input_name: "tag_brand_list" + input_type: STRING + default_val: "0" + } + input_fields { + input_name: "price" + input_type: INT32 + } +} +feature_configs { + input_names: "pid" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "adgroup_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "cate_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10000 +} +feature_configs { + input_names: "campaign_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "customer" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "brand" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "user_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "cms_segid" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 +} +feature_configs { + input_names: "cms_group_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 +} +feature_configs { + input_names: "final_gender_code" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "age_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "pvalue_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "shopping_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "occupation" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: 
"new_user_class_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "tag_category_list" + feature_type: SequenceFeature + embedding_dim: 16 + hash_bucket_size: 100000 + sub_feature_type: IdFeature + separator: "|" +} +feature_configs { + input_names: "tag_brand_list" + feature_type: SequenceFeature + embedding_dim: 16 + hash_bucket_size: 100000 + sub_feature_type: IdFeature + separator: "|" +} +feature_configs { + input_names: "price" + feature_type: IdFeature + embedding_dim: 16 + num_buckets: 50 +} +model_config { + model_class: "DBMTL" + feature_groups { + group_name: "all" + feature_names: "user_id" + feature_names: "cms_segid" + feature_names: "cms_group_id" + feature_names: "age_level" + feature_names: "pvalue_level" + feature_names: "shopping_level" + feature_names: "occupation" + feature_names: "new_user_class_level" + feature_names: "adgroup_id" + feature_names: "cate_id" + feature_names: "campaign_id" + feature_names: "customer" + feature_names: "brand" + feature_names: "price" + feature_names: "pid" + wide_deep: DEEP + sequence_features: { + group_name: "seq_fea" + tf_summary: false + seq_att_map: { + key: "brand" + key: "cate_id" + hist_seq: "tag_brand_list" + hist_seq: "tag_category_list" + } + } + } + dbmtl { + bottom_dnn { + hidden_units: [1024, 512, 256] + } + task_towers { + tower_name: "ctr" + label_name: "clk" + loss_type: CLASSIFICATION + metrics_set: { + auc {} + } + dnn { + hidden_units: [256, 128, 64, 32] + } + relation_dnn { + hidden_units: [32] + } + weight: 1.0 + } + task_towers { + tower_name: "cvr" + label_name: "buy" + loss_type: CLASSIFICATION + metrics_set: { + auc {} + } + dnn { + hidden_units: [256, 128, 64, 32] + } + relation_tower_names: ["ctr"] + relation_dnn { + hidden_units: [32] + } + weight: 1.0 + } + l2_regularization: 1e-6 + } + embedding_regularization: 5e-6 +} diff --git a/samples/model_config/dbmtl_on_numeric_num_buckets_sequence_feature_taobao.config b/samples/model_config/dbmtl_on_numeric_num_buckets_sequence_feature_taobao.config new file mode 100644 index 000000000..d5d2f304f --- /dev/null +++ b/samples/model_config/dbmtl_on_numeric_num_buckets_sequence_feature_taobao.config @@ -0,0 +1,306 @@ +train_input_path: "data/test/tb_data/taobao_train_data" +eval_input_path: "data/test/tb_data/taobao_test_data" +model_dir: "experiments/dbmtl_taobao_ckpt" + +train_config { + optimizer_config { + adam_optimizer { + learning_rate { + exponential_decay_learning_rate { + initial_learning_rate: 0.001 + decay_steps: 1000 + decay_factor: 0.5 + min_learning_rate: 1e-07 + } + } + } + use_moving_average: false + } + num_steps: 5000 + sync_replicas: true + save_checkpoints_steps: 100 + log_step_count_steps: 100 +} +eval_config { + metrics_set { + auc { + } + } +} +data_config { + batch_size: 4096 + label_fields: "clk" + label_fields: "buy" + prefetch_size: 1 + input_type: CSVInput + input_fields { + input_name: "clk" + input_type: INT32 + } + input_fields { + input_name: "buy" + input_type: INT32 + } + input_fields { + input_name: "pid" + input_type: STRING + } + input_fields { + input_name: "adgroup_id" + input_type: STRING + } + input_fields { + input_name: "cate_id" + input_type: STRING + } + input_fields { + input_name: "campaign_id" + input_type: STRING + } + input_fields { + input_name: "customer" + input_type: STRING + } + input_fields { + input_name: "brand" + input_type: STRING + } + input_fields { + input_name: "user_id" + input_type: STRING + } + input_fields { + input_name: "cms_segid" + 
input_type: STRING + } + input_fields { + input_name: "cms_group_id" + input_type: STRING + } + input_fields { + input_name: "final_gender_code" + input_type: STRING + } + input_fields { + input_name: "age_level" + input_type: STRING + } + input_fields { + input_name: "pvalue_level" + input_type: STRING + } + input_fields { + input_name: "shopping_level" + input_type: STRING + } + input_fields { + input_name: "occupation" + input_type: STRING + } + input_fields { + input_name: "new_user_class_level" + input_type: STRING + } + input_fields { + input_name: "tag_category_list" + input_type: STRING + default_val: "0" + } + input_fields { + input_name: "tag_brand_list" + input_type: STRING + default_val: "0" + } + input_fields { + input_name: "price" + input_type: INT32 + } +} +feature_configs { + input_names: "pid" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "adgroup_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "cate_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10000 +} +feature_configs { + input_names: "campaign_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "customer" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "brand" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "user_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "cms_segid" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 +} +feature_configs { + input_names: "cms_group_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 +} +feature_configs { + input_names: "final_gender_code" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "age_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "pvalue_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "shopping_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "occupation" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "new_user_class_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "tag_category_list" + feature_type: SequenceFeature + embedding_dim: 16 + num_buckets: 15 + max_val: 100000 + min_val: 0 + sub_feature_type: RawFeature + sequence_length: 300 + separator: "|" +} +feature_configs { + input_names: "tag_brand_list" + feature_type: SequenceFeature + embedding_dim: 16 + num_buckets: 15 + max_val: 100000 + min_val: 0 + sub_feature_type: RawFeature + sequence_length: 300 + separator: "|" +} +feature_configs { + input_names: "price" + feature_type: IdFeature + embedding_dim: 16 + num_buckets: 50 +} +model_config { + model_class: "DBMTL" + feature_groups { + group_name: "all" + feature_names: "user_id" + feature_names: "cms_segid" + feature_names: "cms_group_id" + feature_names: "age_level" + feature_names: "pvalue_level" + feature_names: "shopping_level" + feature_names: "occupation" + feature_names: "new_user_class_level" + feature_names: "adgroup_id" + feature_names: "cate_id" + 
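The boundary-based variant earlier in this patch (dbmtl_on_numeric_boundary_sequence_feature_taobao.config) replaces uniform num_buckets with 22 explicit split points (15.0 through 1314.0). That is standard boundary bucketization, sketched below; presumably each raw sequence value is mapped to the index of the interval it falls in:

```python
import bisect

BOUNDARIES = [15.0, 20.0, 21.0, 23.0, 30.0, 32.0, 40.0, 47.0, 66.0, 70.0,
              77.0, 87.0, 99.0, 120.0, 148.0, 188.0, 199.0, 235.0, 301.0,
              443.0, 597.0, 1314.0]

def bucket_id(x):
    # Values below 15.0 get id 0; values >= 1314.0 get id len(BOUNDARIES).
    return bisect.bisect_right(BOUNDARIES, x)

print([bucket_id(v) for v in (3.0, 15.0, 100.0, 2000.0)])  # [0, 1, 13, 22]
```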
feature_names: "campaign_id" + feature_names: "customer" + feature_names: "brand" + feature_names: "price" + feature_names: "pid" + wide_deep: DEEP + sequence_features: { + group_name: "seq_fea" + tf_summary: false + seq_att_map: { + key: "brand" + key: "cate_id" + hist_seq: "tag_brand_list" + hist_seq: "tag_category_list" + } + } + } + dbmtl { + bottom_dnn { + hidden_units: [1024, 512, 256] + } + task_towers { + tower_name: "ctr" + label_name: "clk" + loss_type: CLASSIFICATION + metrics_set: { + auc {} + } + dnn { + hidden_units: [256, 128, 64, 32] + } + relation_dnn { + hidden_units: [32] + } + weight: 1.0 + } + task_towers { + tower_name: "cvr" + label_name: "buy" + loss_type: CLASSIFICATION + metrics_set: { + auc {} + } + dnn { + hidden_units: [256, 128, 64, 32] + } + relation_tower_names: ["ctr"] + relation_dnn { + hidden_units: [32] + } + weight: 1.0 + } + l2_regularization: 1e-6 + } + embedding_regularization: 5e-6 +} diff --git a/samples/model_config/dbmtl_on_numeric_raw_sequence_feature_taobao.config b/samples/model_config/dbmtl_on_numeric_raw_sequence_feature_taobao.config new file mode 100644 index 000000000..6b2a3559d --- /dev/null +++ b/samples/model_config/dbmtl_on_numeric_raw_sequence_feature_taobao.config @@ -0,0 +1,300 @@ +train_input_path: "data/test/tb_data/taobao_train_data" +eval_input_path: "data/test/tb_data/taobao_test_data" +model_dir: "experiments/dbmtl_taobao_ckpt" + +train_config { + optimizer_config { + adam_optimizer { + learning_rate { + exponential_decay_learning_rate { + initial_learning_rate: 0.001 + decay_steps: 1000 + decay_factor: 0.5 + min_learning_rate: 1e-07 + } + } + } + use_moving_average: false + } + num_steps: 5000 + sync_replicas: true + save_checkpoints_steps: 100 + log_step_count_steps: 100 +} +eval_config { + metrics_set { + auc { + } + } +} +data_config { + batch_size: 4096 + label_fields: "clk" + label_fields: "buy" + prefetch_size: 1 + input_type: CSVInput + input_fields { + input_name: "clk" + input_type: INT32 + } + input_fields { + input_name: "buy" + input_type: INT32 + } + input_fields { + input_name: "pid" + input_type: STRING + } + input_fields { + input_name: "adgroup_id" + input_type: STRING + } + input_fields { + input_name: "cate_id" + input_type: STRING + } + input_fields { + input_name: "campaign_id" + input_type: STRING + } + input_fields { + input_name: "customer" + input_type: STRING + } + input_fields { + input_name: "brand" + input_type: STRING + } + input_fields { + input_name: "user_id" + input_type: STRING + } + input_fields { + input_name: "cms_segid" + input_type: STRING + } + input_fields { + input_name: "cms_group_id" + input_type: STRING + } + input_fields { + input_name: "final_gender_code" + input_type: STRING + } + input_fields { + input_name: "age_level" + input_type: STRING + } + input_fields { + input_name: "pvalue_level" + input_type: STRING + } + input_fields { + input_name: "shopping_level" + input_type: STRING + } + input_fields { + input_name: "occupation" + input_type: STRING + } + input_fields { + input_name: "new_user_class_level" + input_type: STRING + } + input_fields { + input_name: "tag_category_list" + input_type: STRING + default_val: "0" + } + input_fields { + input_name: "tag_brand_list" + input_type: STRING + default_val: "0" + } + input_fields { + input_name: "price" + input_type: INT32 + } +} +feature_configs { + input_names: "pid" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "adgroup_id" + feature_type: IdFeature + 
embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "cate_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10000 +} +feature_configs { + input_names: "campaign_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "customer" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "brand" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "user_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100000 +} +feature_configs { + input_names: "cms_segid" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 +} +feature_configs { + input_names: "cms_group_id" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 100 +} +feature_configs { + input_names: "final_gender_code" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "age_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "pvalue_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "shopping_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "occupation" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "new_user_class_level" + feature_type: IdFeature + embedding_dim: 16 + hash_bucket_size: 10 +} +feature_configs { + input_names: "tag_category_list" + feature_type: SequenceFeature + sub_feature_type: RawFeature + sequence_length:50 + embedding_dim: 16 + separator: "|" +} +feature_configs { + input_names: "tag_brand_list" + feature_type: SequenceFeature + sub_feature_type: RawFeature + sequence_length:50 + embedding_dim: 16 + separator: "|" +} +feature_configs { + input_names: "price" + feature_type: IdFeature + embedding_dim: 16 + num_buckets: 50 +} +model_config { + model_class: "DBMTL" + feature_groups { + group_name: "all" + feature_names: "user_id" + feature_names: "cms_segid" + feature_names: "cms_group_id" + feature_names: "age_level" + feature_names: "pvalue_level" + feature_names: "shopping_level" + feature_names: "occupation" + feature_names: "new_user_class_level" + feature_names: "adgroup_id" + feature_names: "cate_id" + feature_names: "campaign_id" + feature_names: "customer" + feature_names: "brand" + feature_names: "price" + feature_names: "pid" + wide_deep: DEEP + sequence_features: { + group_name: "seq_fea" + tf_summary: false + seq_att_map: { + key: "brand" + key: "cate_id" + hist_seq: "tag_brand_list" + hist_seq: "tag_category_list" + } + } + } + dbmtl { + bottom_dnn { + hidden_units: [1024, 512, 256] + } + task_towers { + tower_name: "ctr" + label_name: "clk" + loss_type: CLASSIFICATION + metrics_set: { + auc {} + } + dnn { + hidden_units: [256, 128, 64, 32] + } + relation_dnn { + hidden_units: [32] + } + weight: 1.0 + } + task_towers { + tower_name: "cvr" + label_name: "buy" + loss_type: CLASSIFICATION + metrics_set: { + auc {} + } + dnn { + hidden_units: [256, 128, 64, 32] + } + relation_tower_names: ["ctr"] + relation_dnn { + hidden_units: [32] + } + weight: 1.0 + } + l2_regularization: 1e-6 + } + embedding_regularization: 5e-6 +} diff --git a/samples/rtp_fg/fg_test_extensions_final.config b/samples/rtp_fg/fg_test_extensions_final.config index 58266998c..fe3e381ca 
100644 --- a/samples/rtp_fg/fg_test_extensions_final.config +++ b/samples/rtp_fg/fg_test_extensions_final.config @@ -332,6 +332,7 @@ feature_config { hash_bucket_size: 100000 separator: "" combiner: "mean" + sub_feature_type: IdFeature } features { input_names: "opt_content_long_seq_source_type" @@ -340,6 +341,7 @@ feature_config { hash_bucket_size: 100000 separator: "" combiner: "mean" + sub_feature_type: IdFeature } } diff --git a/scripts/pre-commit b/scripts/pre-commit new file mode 100755 index 000000000..fbd34dfde --- /dev/null +++ b/scripts/pre-commit @@ -0,0 +1,46 @@ +#!/usr/bin/env python3 +# File generated by pre-commit: https://pre-commit.com +# ID: 138fd403232d2ddd5efb44317e38bf03 +import os +import sys + +# we try our best, but the shebang of this script is difficult to determine: +# - macos doesn't ship with python3 +# - windows executables are almost always `python.exe` +# therefore we continue to support python2 for this small script +if sys.version_info < (3, 3): + from distutils.spawn import find_executable as which +else: + from shutil import which + +# work around https://github.com/Homebrew/homebrew-core/issues/30445 +os.environ.pop('__PYVENV_LAUNCHER__', None) + +# start templated +INSTALL_PYTHON = '/apsarapangu/disk2/yancheng.lgq/miniconda3/envs/tf_1_15/bin/python' +ARGS = [ + 'hook-impl', '--config=.pre-commit-config.yaml', '--hook-type=pre-commit' +] +# end templated +ARGS.extend(('--hook-dir', os.path.realpath(os.path.dirname(__file__)))) +ARGS.append('--') +ARGS.extend(sys.argv[1:]) + +DNE = '`pre-commit` not found. Did you forget to activate your virtualenv?' +if os.access(INSTALL_PYTHON, os.X_OK): + CMD = [INSTALL_PYTHON, '-mpre_commit'] +elif which('pre-commit'): + CMD = ['pre-commit'] +else: + raise SystemExit(DNE) + +CMD.extend(ARGS) +if sys.platform == 'win32': # https://bugs.python.org/issue19124 + import subprocess + + if sys.version_info < (3, 7): # https://bugs.python.org/issue25942 + raise SystemExit(subprocess.Popen(CMD).wait()) + else: + raise SystemExit(subprocess.call(CMD)) +else: + os.execvp(CMD[0], CMD) From de72c76e265214cb0c20cba7357cf3cecf094d95 Mon Sep 17 00:00:00 2001 From: lgqfhwy Date: Wed, 2 Mar 2022 18:03:42 +0800 Subject: [PATCH 09/10] new version 0.4.0 (#127) * new version 0.4.0 * fix bug for hiveinput --- easy_rec/python/protos/dataset.proto | 2 +- easy_rec/version.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/easy_rec/python/protos/dataset.proto b/easy_rec/python/protos/dataset.proto index bc6875cfc..326e03e88 100644 --- a/easy_rec/python/protos/dataset.proto +++ b/easy_rec/python/protos/dataset.proto @@ -178,7 +178,7 @@ message DatasetConfig { // input pipelines DummyInput = 8; KafkaInput = 13; - HiveInput = 16; + HiveInput = 17; } required InputType input_type = 10; diff --git a/easy_rec/version.py b/easy_rec/version.py index 4dbed0c5a..2d5248ce5 100644 --- a/easy_rec/version.py +++ b/easy_rec/version.py @@ -1,3 +1,3 @@ # -*- encoding:utf-8 -*- # Copyright (c) Alibaba, Inc. and its affiliates. 
-__version__ = '0.3.1' +__version__ = '0.4.0' From dfa375ba30c577bc3017d8cbdf4ca94fd90a8f87 Mon Sep 17 00:00:00 2001 From: 0xflotus <0xflotus@gmail.com> Date: Thu, 3 Mar 2022 02:21:01 +0100 Subject: [PATCH 10/10] fix: small error (#119) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index effdc2f42..3ae64b433 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ ### EasyRec is an easy to use framework for Recommendation -EasyRec implements state of the art deep learning models used in common recommedation tasks: candidate generation(matching), scoring(ranking), and multi-task learning. It improves the efficiency of generating high performance models by simple configuration and hyper parameter tuning(HPO). +EasyRec implements state of the art deep learning models used in common recommendation tasks: candidate generation(matching), scoring(ranking), and multi-task learning. It improves the efficiency of generating high performance models by simple configuration and hyper parameter tuning(HPO).